@@ -36,11 +36,26 @@ class PageAnalyzer(BrowserTool):
36
36
"""
37
37
)
38
38
39
- force_captcha_detection : bool
39
+ _force_captcha_detection : bool
40
+ _open_new_page : bool
40
41
41
def __init__(
    self,
    force_captcha_detection: bool = False,
    open_new_page: bool = True,
    **kwargs,
) -> None:
    """
    Initialize the PageAnalyzer tool

    Args:
        force_captcha_detection: Whether to force the captcha detection when loading the page.
        open_new_page: Whether to open a new page when analyzing the page. If set to False, the current page will be used.
        **kwargs: BrowserTool arguments
    """
    # Remaining keyword arguments are forwarded untouched to the base class.
    super().__init__(**kwargs)
    # Stored as non-public attributes; the analysis methods read them to
    # decide whether to call load_page() and how to configure it.
    self._force_captcha_detection = force_captcha_detection
    self._open_new_page = open_new_page
44
59
45
60
async def _validate_pagination (
46
61
self ,
@@ -268,14 +283,15 @@ async def support_infinite_scroll(
268
283
url: URL of the page
269
284
items_selector: CSS selector of the items on the page
270
285
"""
271
- # use long wait time for pages to be fully loaded
272
- await self .load_page (
273
- ctx = ctx ,
274
- url = url ,
275
- timeout = 3000 ,
276
- wait_for_selector = items_selector ,
277
- force_capcha_detection = self .force_captcha_detection ,
278
- )
286
+ if self ._open_new_page :
287
+ # use long wait time for pages to be fully loaded
288
+ await self .load_page (
289
+ ctx = ctx ,
290
+ url = url ,
291
+ timeout = 3000 ,
292
+ wait_for_selector = items_selector ,
293
+ force_capcha_detection = self ._force_captcha_detection ,
294
+ )
279
295
280
296
return await self .playwright .page .evaluate (
281
297
"""
@@ -363,9 +379,12 @@ async def get_pagination_button(
363
379
url: URL of the page
364
380
items_selector: CSS selector of the items on the page
365
381
"""
366
- await self .load_page (
367
- ctx , url , force_capcha_detection = self .force_captcha_detection
368
- )
382
+ if self ._open_new_page :
383
+ await self .load_page (
384
+ ctx ,
385
+ url ,
386
+ force_capcha_detection = self ._force_captcha_detection ,
387
+ )
369
388
370
389
# use latest page url in case of redirections
371
390
page_url = await self .get_page_url ()
@@ -480,9 +499,12 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> ScrapingType:
480
499
ctx: NPi Context
481
500
url: URL of the page
482
501
"""
483
- await self .load_page (
484
- ctx , url , force_capcha_detection = self .force_captcha_detection
485
- )
502
+ if self ._open_new_page :
503
+ await self .load_page (
504
+ ctx ,
505
+ url ,
506
+ force_capcha_detection = self ._force_captcha_detection ,
507
+ )
486
508
487
509
page_url = await self .get_page_url ()
488
510
page_title = await self .get_page_title ()
@@ -564,12 +586,13 @@ async def infer_similar_items_selector(
564
586
ctx: NPi Context
565
587
url: URL of the page
566
588
"""
567
- await self .load_page (
568
- ctx ,
569
- url ,
570
- timeout = 3000 ,
571
- force_capcha_detection = self .force_captcha_detection ,
572
- )
589
+ if self ._open_new_page :
590
+ await self .load_page (
591
+ ctx ,
592
+ url ,
593
+ timeout = 3000 ,
594
+ force_capcha_detection = self ._force_captcha_detection ,
595
+ )
573
596
574
597
# use latest page url in case of redirections
575
598
page_url = await self .get_page_url ()
0 commit comments