Skip to content

Commit de4e1de

Browse files
committed
refactor(page_analyzer): validate the pagination button in a new playwright instance to avoid side effects
1 parent d8e508d commit de4e1de

File tree

2 files changed

+127
-161
lines changed

2 files changed

+127
-161
lines changed

npiai/tools/scrapers/page_analyzer/app.py

+125 -115
Original file line number | Diff line number | Diff line change
@@ -60,138 +60,145 @@ def __init__(
6060
async def _validate_pagination(
6161
self,
6262
ctx: Context,
63+
url: str,
6364
pagination_button_selector: str | None,
6465
items_selector: str | None = None,
6566
) -> bool:
6667
if not pagination_button_selector:
6768
return False
6869

69-
handle = await self.playwright.page.evaluate_handle(
70-
"selector => document.querySelector(selector)",
71-
pagination_button_selector,
72-
)
70+
# validate the pagination button in a new playwright instance to avoid side effects
71+
playwright_clone = await self.playwright.clone()
7372

74-
elem = handle.as_element()
73+
async with BrowserTool(playwright=playwright_clone) as browser:
74+
await browser.load_page(ctx, url)
7575

76-
if not elem:
77-
return False
78-
79-
await self.back_to_top()
80-
old_screenshot = await self.get_screenshot(
81-
full_page=True,
82-
max_size=_MAX_SCREENSHOT_SIZE,
83-
)
84-
old_url = await self.get_page_url()
85-
old_title = await self.get_page_title()
86-
87-
await self.clear_bboxes()
76+
handle = await browser.playwright.page.evaluate_handle(
77+
"selector => document.querySelector(selector)",
78+
pagination_button_selector,
79+
)
8880

89-
try:
90-
await self.click(elem)
91-
except PlaywrightError:
92-
return False
81+
elem = handle.as_element()
9382

94-
has_items_selector = items_selector and items_selector != "*"
83+
if not elem:
84+
return False
9585

96-
# attach mutation observer to check if new items are added
97-
if has_items_selector:
98-
await init_items_observer(
99-
playwright=self.playwright,
100-
ancestor_selector="body",
101-
items_selector=items_selector,
86+
await browser.back_to_top()
87+
old_screenshot = await browser.get_screenshot(
88+
full_page=True,
89+
max_size=_MAX_SCREENSHOT_SIZE,
10290
)
103-
104-
try:
105-
await self.playwright.page.wait_for_load_state(
106-
"domcontentloaded",
107-
timeout=3000,
91+
old_url = await browser.get_page_url()
92+
old_title = await browser.get_page_title()
93+
94+
await browser.clear_bboxes()
95+
96+
try:
97+
await browser.click(elem)
98+
except PlaywrightError:
99+
return False
100+
101+
has_items_selector = items_selector and items_selector != "*"
102+
103+
# attach mutation observer to check if new items are added
104+
if has_items_selector:
105+
await init_items_observer(
106+
playwright=browser.playwright,
107+
ancestor_selector="body",
108+
items_selector=items_selector,
109+
)
110+
111+
try:
112+
await browser.playwright.page.wait_for_load_state(
113+
"domcontentloaded",
114+
timeout=3000,
115+
)
116+
except TimeoutError:
117+
pass
118+
119+
new_url = await browser.get_page_url()
120+
121+
if new_url == old_url and has_items_selector:
122+
return await has_items_added(browser.playwright, timeout=5000)
123+
124+
new_screenshot = await browser.get_screenshot(
125+
full_page=True,
126+
max_size=_MAX_SCREENSHOT_SIZE,
108127
)
109-
except TimeoutError:
110-
pass
111-
112-
new_url = await self.get_page_url()
113-
114-
if new_url == old_url and has_items_selector:
115-
return await has_items_added(self.playwright, timeout=5000)
116-
117-
new_screenshot = await self.get_screenshot(
118-
full_page=True,
119-
max_size=_MAX_SCREENSHOT_SIZE,
120-
)
121-
new_title = await self.get_page_title()
122-
123-
def callback(is_next_page: bool):
124-
"""
125-
Callback function to determine whether the pagination button is working.
126-
127-
Args:
128-
is_next_page: A boolean value indicating whether the page is navigated to the next page or the content within pagination component is changed.
129-
"""
130-
return is_next_page
131-
132-
res = await llm_tool_call(
133-
llm=ctx.llm,
134-
tool=callback,
135-
messages=[
136-
ChatCompletionSystemMessageParam(
137-
role="system",
138-
content=dedent(
139-
"""
140-
Compare the screenshots of the page before and after clicking the pagination button to determine whether the pagination button is working.
141-
142-
## Provided Context
143-
- The URL of the page before clicking the pagination button.
144-
- The title of the page before clicking the pagination button.
145-
- The URL of the page after clicking the pagination button.
146-
- The title of the page after clicking the pagination button.
147-
- The screenshot of the page before clicking the pagination button.
148-
- The screenshot of the page after clicking the pagination button.
149-
150-
## Instructions
151-
152-
Follow the instructions to determine whether the pagination button is working:
153-
1. Review the screenshot of the page before clicking the pagination button (the first screenshot) and think if the page actually supports pagination.
154-
2. Compare the old URL and the new URL to see if the page is navigated to the next page.
155-
3. Compare the old title and the new title to see the two pages are related.
156-
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
157-
5. Check if previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal in the same page is not considered as pagination.
158-
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
159-
"""
128+
new_title = await browser.get_page_title()
129+
130+
def callback(is_next_page: bool):
131+
"""
132+
Callback function to determine whether the pagination button is working.
133+
134+
Args:
135+
is_next_page: A boolean value indicating whether the page is navigated to the next page or the content within pagination component is changed.
136+
"""
137+
return is_next_page
138+
139+
res = await llm_tool_call(
140+
llm=ctx.llm,
141+
tool=callback,
142+
messages=[
143+
ChatCompletionSystemMessageParam(
144+
role="system",
145+
content=dedent(
146+
"""
147+
Compare the screenshots of the page before and after clicking the pagination button to determine whether the pagination button is working.
148+
149+
## Provided Context
150+
- The URL of the page before clicking the pagination button.
151+
- The title of the page before clicking the pagination button.
152+
- The URL of the page after clicking the pagination button.
153+
- The title of the page after clicking the pagination button.
154+
- The screenshot of the page before clicking the pagination button.
155+
- The screenshot of the page after clicking the pagination button.
156+
157+
## Instructions
158+
159+
Follow the instructions to determine whether the pagination button is working:
160+
1. Review the screenshot of the page before clicking the pagination button (the first screenshot) and think if the page actually supports pagination.
161+
2. Compare the old URL and the new URL to see if the page is navigated to the next page.
162+
3. Compare the old title and the new title to see the two pages are related.
163+
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
164+
5. Check if previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal in the same page is not considered as pagination.
165+
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
166+
"""
167+
),
160168
),
161-
),
162-
ChatCompletionUserMessageParam(
163-
role="user",
164-
content=[
165-
{
166-
"type": "text",
167-
"text": json.dumps(
168-
{
169-
"old_url": old_url,
170-
"old_title": old_title,
171-
"new_url": new_url,
172-
"new_title": new_title,
169+
ChatCompletionUserMessageParam(
170+
role="user",
171+
content=[
172+
{
173+
"type": "text",
174+
"text": json.dumps(
175+
{
176+
"old_url": old_url,
177+
"old_title": old_title,
178+
"new_url": new_url,
179+
"new_title": new_title,
180+
},
181+
ensure_ascii=False,
182+
),
183+
},
184+
{
185+
"type": "image_url",
186+
"image_url": {
187+
"url": old_screenshot,
173188
},
174-
ensure_ascii=False,
175-
),
176-
},
177-
{
178-
"type": "image_url",
179-
"image_url": {
180-
"url": old_screenshot,
181189
},
182-
},
183-
{
184-
"type": "image_url",
185-
"image_url": {
186-
"url": new_screenshot,
190+
{
191+
"type": "image_url",
192+
"image_url": {
193+
"url": new_screenshot,
194+
},
187195
},
188-
},
189-
],
190-
),
191-
],
192-
)
196+
],
197+
),
198+
],
199+
)
193200

194-
return callback(**res.model_dump())
201+
return callback(**res.model_dump())
195202

196203
async def get_selector_of_marker(self, marker_id: int = -1) -> str | None:
197204
"""
@@ -484,7 +491,10 @@ async def get_pagination_button(
484491
)
485492

486493
is_working = await self._validate_pagination(
487-
ctx, pagination_button_selector, items_selector
494+
ctx=ctx,
495+
url=url,
496+
pagination_button_selector=pagination_button_selector,
497+
items_selector=items_selector,
488498
)
489499
await ctx.send_debug_message(f"Pagination button is working: {is_working}")
490500

npiai/tools/scrapers/web/__test__/interactive.py

+2 -46
Original file line number | Diff line number | Diff line change
@@ -1,60 +1,16 @@
11
import asyncio
2-
from typing import Literal
32

4-
from npiai.core import PlaywrightContext
53
from npiai.utils.test_utils import DebugContext
64

75
# from npiai import Context
86
from utils import auto_scrape
9-
10-
from npiai import HITL
11-
12-
13-
class TestHITL(HITL):
14-
async def confirm(
15-
self,
16-
tool_name: str,
17-
message: str,
18-
default=False,
19-
) -> bool:
20-
print(f"[HITL] confirm: {message=}, {default=}")
21-
return True
22-
23-
async def input(
24-
self,
25-
tool_name: str,
26-
message: str,
27-
default="",
28-
) -> str:
29-
print(f"[HITL] input: {message=}, {default=}")
30-
return "input"
31-
32-
async def select(
33-
self,
34-
tool_name: str,
35-
message: str,
36-
choices: list[str],
37-
default="",
38-
) -> str:
39-
print(f"[HITL] select: {message=}, {choices=}, {default=}")
40-
return "select"
41-
42-
async def web_interaction(
43-
self,
44-
tool_name: str,
45-
message: str,
46-
url: str,
47-
action: Literal["captcha", "login"],
48-
playwright: PlaywrightContext,
49-
) -> str:
50-
print(f"[HITL] web_interaction: {message=}, {url=}, {action=}")
51-
return "web_interaction"
7+
from npiai.hitl_handler import ConsoleHandler
528

539

5410
async def main():
5511
url = input("Enter the URL: ")
5612
ctx = DebugContext()
57-
ctx.use_hitl(TestHITL())
13+
ctx.use_hitl(ConsoleHandler())
5814
# url = "https://www.bardeen.ai/playbooks"
5915

6016
await auto_scrape(

0 commit comments

Comments
 (0)