Skip to content

Commit d8e508d

Browse files
committed
refactor(scraper/web): add open_new_page option
1 parent 8653a57 commit d8e508d

File tree

3 files changed

+62
-33
lines changed

3 files changed

+62
-33
lines changed

npiai/tools/scrapers/page_analyzer/app.py

+46-23
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,26 @@ class PageAnalyzer(BrowserTool):
3636
"""
3737
)
3838

39-
force_captcha_detection: bool
39+
_force_captcha_detection: bool
40+
_open_new_page: bool
4041

41-
def __init__(self, force_captcha_detection: bool = False, **kwargs):
42+
def __init__(
43+
self,
44+
force_captcha_detection: bool = False,
45+
open_new_page=True,
46+
**kwargs,
47+
):
48+
"""
49+
Initialize the PageAnalyzer tool
50+
51+
Args:
52+
force_captcha_detection: Whether to force the captcha detection when loading the page.
53+
open_new_page: Whether to open a new page when analyzing the page. If set to False, the current page will be used.
54+
**kwargs: BrowserTool arguments
55+
"""
4256
super().__init__(**kwargs)
43-
self.force_captcha_detection = force_captcha_detection
57+
self._force_captcha_detection = force_captcha_detection
58+
self._open_new_page = open_new_page
4459

4560
async def _validate_pagination(
4661
self,
@@ -268,14 +283,15 @@ async def support_infinite_scroll(
268283
url: URL of the page
269284
items_selector: CSS selector of the items on the page
270285
"""
271-
# use long wait time for pages to be fully loaded
272-
await self.load_page(
273-
ctx=ctx,
274-
url=url,
275-
timeout=3000,
276-
wait_for_selector=items_selector,
277-
force_capcha_detection=self.force_captcha_detection,
278-
)
286+
if self._open_new_page:
287+
# use long wait time for pages to be fully loaded
288+
await self.load_page(
289+
ctx=ctx,
290+
url=url,
291+
timeout=3000,
292+
wait_for_selector=items_selector,
293+
force_capcha_detection=self._force_captcha_detection,
294+
)
279295

280296
return await self.playwright.page.evaluate(
281297
"""
@@ -363,9 +379,12 @@ async def get_pagination_button(
363379
url: URL of the page
364380
items_selector: CSS selector of the items on the page
365381
"""
366-
await self.load_page(
367-
ctx, url, force_capcha_detection=self.force_captcha_detection
368-
)
382+
if self._open_new_page:
383+
await self.load_page(
384+
ctx,
385+
url,
386+
force_capcha_detection=self._force_captcha_detection,
387+
)
369388

370389
# use latest page url in case of redirections
371390
page_url = await self.get_page_url()
@@ -480,9 +499,12 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> ScrapingType:
480499
ctx: NPi Context
481500
url: URL of the page
482501
"""
483-
await self.load_page(
484-
ctx, url, force_capcha_detection=self.force_captcha_detection
485-
)
502+
if self._open_new_page:
503+
await self.load_page(
504+
ctx,
505+
url,
506+
force_capcha_detection=self._force_captcha_detection,
507+
)
486508

487509
page_url = await self.get_page_url()
488510
page_title = await self.get_page_title()
@@ -564,12 +586,13 @@ async def infer_similar_items_selector(
564586
ctx: NPi Context
565587
url: URL of the page
566588
"""
567-
await self.load_page(
568-
ctx,
569-
url,
570-
timeout=3000,
571-
force_capcha_detection=self.force_captcha_detection,
572-
)
589+
if self._open_new_page:
590+
await self.load_page(
591+
ctx,
592+
url,
593+
timeout=3000,
594+
force_capcha_detection=self._force_captcha_detection,
595+
)
573596

574597
# use latest page url in case of redirections
575598
page_url = await self.get_page_url()

npiai/tools/scrapers/web/app.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@ class WebScraper(BaseScraper, BrowserTool):
3636
pagination_button_selector: str | None
3737
skip_item_hashes: Set[str] | None
3838

39-
# The maximum number of items to summarize in a single batch
40-
_batch_size: int
41-
4239
# all items loaded flag
4340
_all_items_loaded: bool = False
4441

@@ -49,6 +46,9 @@ class WebScraper(BaseScraper, BrowserTool):
4946
# The list of hashes of items that have been skipped
5047
_matched_hashes: List[str]
5148

49+
# Whether to open a new page when starting to scrape
50+
_open_new_page: bool
51+
5252
def __init__(
5353
self,
5454
url: str,
@@ -58,6 +58,7 @@ def __init__(
5858
pagination_button_selector: str | None = None,
5959
skip_item_hashes: List[str] | None = None,
6060
headless: bool = True,
61+
open_new_page: bool = True,
6162
playwright: PlaywrightContext = None,
6263
):
6364
BaseScraper.__init__(self)
@@ -68,6 +69,7 @@ def __init__(
6869
self.items_selector = items_selector
6970
self.pagination_button_selector = pagination_button_selector
7071
self.skip_item_hashes = set(skip_item_hashes) if skip_item_hashes else None
72+
self._open_new_page = open_new_page
7173
self._matched_hashes = []
7274
self._webpage_access_lock = asyncio.Lock()
7375

@@ -76,13 +78,15 @@ def get_matched_hashes(self) -> List[str]:
7678

7779
async def init_data(self, ctx: Context):
7880
self._matched_hashes = []
79-
await self.load_page(
80-
ctx=ctx,
81-
url=self.url,
82-
timeout=3000,
83-
wait_for_selector=self.items_selector,
84-
force_capcha_detection=True,
85-
)
81+
82+
if self._open_new_page:
83+
await self.load_page(
84+
ctx=ctx,
85+
url=self.url,
86+
timeout=3000,
87+
wait_for_selector=self.items_selector,
88+
force_capcha_detection=True,
89+
)
8690

8791
async def next_items(
8892
self,

npiai/tools/scrapers/web/presets/linkedin/posts_scraper.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __init__(
3434
url: str,
3535
skip_item_hashes: List[str] | None = None,
3636
headless: bool = True,
37+
open_new_page: bool = True,
3738
playwright: PlaywrightContext = None,
3839
):
3940
super().__init__(
@@ -42,6 +43,7 @@ def __init__(
4243
items_selector=".fie-impression-container",
4344
skip_item_hashes=skip_item_hashes,
4445
headless=headless,
46+
open_new_page=open_new_page,
4547
playwright=playwright,
4648
)
4749

0 commit comments

Comments
 (0)