Skip to content

Commit a381706

Browse files
committed
fix(scraper): add lock to web content access to avoid duplicates
1 parent 1911889 commit a381706

File tree

1 file changed

+12

-6

lines changed

npiai/tools/web/scraper/app.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,17 @@ class Scraper(BrowserTool):
8181

8282
_navigator: NavigatorAgent
8383

84+
# asyncio lock to prevent concurrent access to the webpage
85+
# to avoid retrieving the same items multiple times
86+
_webpage_access_lock: asyncio.Lock
87+
8488
def __init__(self, batch_size: int = 10, **kwargs):
    """Set up the scraper: navigator sub-agent, batch size, and page-access lock.

    Args:
        batch_size: number of items processed per scraping batch (default 10).
        **kwargs: forwarded unchanged to the BrowserTool base constructor.
    """
    super().__init__(**kwargs)
    navigator = NavigatorAgent(playwright=self.playwright)
    self._navigator = navigator
    self._batch_size = batch_size
    # Serialize access to the live webpage so concurrent parse calls
    # cannot retrieve the same items twice.
    self._webpage_access_lock = asyncio.Lock()
    self.add_tool(navigator)
9196

9297
@classmethod
@@ -392,13 +397,14 @@ async def _parse(
392397
limit: int = -1,
393398
skip_item_hashes: Set[str] | None = None,
394399
) -> ParsedResult | None | None:
395-
# convert relative links to absolute links
396-
await self._process_relative_links()
400+
async with self._webpage_access_lock:
401+
# convert relative links to absolute links
402+
await self._process_relative_links()
397403

398-
if items_selector is None:
399-
return await self._parse_ancestor(ancestor_selector, skip_item_hashes)
400-
else:
401-
return await self._parse_items(items_selector, limit, skip_item_hashes)
404+
if items_selector is None:
405+
return await self._parse_ancestor(ancestor_selector, skip_item_hashes)
406+
else:
407+
return await self._parse_items(items_selector, limit, skip_item_hashes)
402408

403409
async def _parse_items(
404410
self,

0 commit comments

Comments
 (0)