Skip to content

Commit ac091b0

Browse files
committed
fix(scraper): avoid over scraping
1 parent ebc7649 commit ac091b0

File tree

1 file changed

+18
-13
lines changed

1 file changed

+18
-13
lines changed

npiai/tools/web/scraper/app.py

+18 -13
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ async def summarize_stream(
133133
batch_index = 0
134134

135135
results_queue: asyncio.Queue[SummaryChunk] = asyncio.Queue()
136+
lock = asyncio.Lock()
136137

137138
skip_item_hashes_set = set(skip_item_hashes) if skip_item_hashes else None
138139

@@ -142,14 +143,17 @@ async def run_batch():
142143
if limit != -1 and remaining <= 0:
143144
return
144145

145-
current_index = batch_index
146-
batch_index += 1
146+
async with lock:
147+
current_index = batch_index
148+
batch_index += 1
147149

148-
# calculate the number of items to summarize in the current batch
149-
requested_count = min(self._batch_size, remaining) if limit != -1 else -1
150-
# reduce the remaining count by the number of items in the current batch
151-
# so that the other tasks will not exceed the limit
152-
remaining -= requested_count
150+
# calculate the number of items to summarize in the current batch
151+
requested_count = (
152+
min(self._batch_size, remaining) if limit != -1 else -1
153+
)
154+
# reduce the remaining count by the number of items in the current batch
155+
# so that the other tasks will not exceed the limit
156+
remaining -= requested_count
153157

154158
parsed_result = await self._convert(
155159
ancestor_selector=ancestor_selector,
@@ -179,12 +183,13 @@ async def run_batch():
179183
await ctx.send_debug_message(f"[{self.name}] No items summarized")
180184
return
181185

182-
items_slice = items[:requested_count] if limit != -1 else items
183-
summarized_count = len(items_slice)
184-
count += summarized_count
185-
# correct the remaining count in case summary returned fewer items than requested
186-
if summarized_count < requested_count:
187-
remaining += requested_count - summarized_count
186+
async with lock:
187+
items_slice = items[:requested_count] if limit != -1 else items
188+
summarized_count = len(items_slice)
189+
count += summarized_count
190+
# recalculate the remaining count in case summary returned fewer items than requested
191+
if summarized_count < requested_count:
192+
remaining += requested_count - summarized_count
188193

189194
await results_queue.put(
190195
{

0 commit comments

Comments
 (0)