Skip to content

Commit f3a8f94

Browse files
committed
refactor(page_analyzer): use screenshot when inferring similar items
1 parent fb8edb2 commit f3a8f94

File tree

1 file changed

+11
-10
lines changed
  • npiai/tools/scrapers/page_analyzer

1 file changed

+11
-10
lines changed

npiai/tools/scrapers/page_analyzer/app.py

+11 -10
Original file line number | Diff line number | Diff line change
@@ -607,15 +607,16 @@ async def infer_similar_items_selector(
607607
# use latest page url in case of redirections
608608
page_url = await self.get_page_url()
609609
page_title = await self.get_page_title()
610-
# raw_screenshot = await self.get_screenshot(full_page=True)
610+
raw_screenshot = await self.get_screenshot(full_page=True)
611611

612612
contentful_elements = await self.playwright.page.evaluate(
613613
"""
614-
() => npi.getMostContentfulElements(null, 10)
614+
(screenshot) => npi.getMostContentfulElements(screenshot, 10)
615615
""",
616+
raw_screenshot,
616617
)
617618

618-
# annotated_screenshot = await self.get_screenshot(full_page=True)
619+
annotated_screenshot = await self.get_screenshot(full_page=True)
619620

620621
elements_as_markdown = []
621622
group_element_count = {}
@@ -669,7 +670,7 @@ async def infer_similar_items_selector(
669670
670671
## Instructions
671672
672-
Follow the instructions to determine whether there is a pagination button on the current page for navigating to the next page:
673+
Follow the instructions to determine whether there are similar elements representing the most meaningful list of items:
673674
1. Examine the URL, and the title of the page to understand the context, and then think about what the current page is.
674675
2. Go through the elements array, grab the semantic information of the elements via the "content" property. Pay attention to the elements with the same group ID as they are under the same parent element.
675676
3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
@@ -691,12 +692,12 @@ async def infer_similar_items_selector(
691692
ensure_ascii=False,
692693
),
693694
},
694-
# {
695-
# "type": "image_url",
696-
# "image_url": {
697-
# "url": annotated_screenshot,
698-
# },
699-
# },
695+
{
696+
"type": "image_url",
697+
"image_url": {
698+
"url": annotated_screenshot,
699+
},
700+
},
700701
],
701702
),
702703
],

0 commit comments

Comments
 (0)