Skip to content

Commit e6a64c2

Browse files
committed
refactor(page_analyzer): use screenshot when inferring similar items
1 parent fb8edb2 commit e6a64c2

File tree

1 file changed

+12
-10
lines changed
  • npiai/tools/scrapers/page_analyzer

1 file changed

+12
-10
lines changed

npiai/tools/scrapers/page_analyzer/app.py

+12 -10
Original file line number | Diff line number | Diff line change
@@ -607,15 +607,16 @@ async def infer_similar_items_selector(
607607
# use latest page url in case of redirections
608608
page_url = await self.get_page_url()
609609
page_title = await self.get_page_title()
610-
# raw_screenshot = await self.get_screenshot(full_page=True)
610+
raw_screenshot = await self.get_screenshot(full_page=True)
611611

612612
contentful_elements = await self.playwright.page.evaluate(
613613
"""
614-
() => npi.getMostContentfulElements(null, 10)
614+
(screenshot) => npi.getMostContentfulElements(screenshot, 10)
615615
""",
616+
raw_screenshot,
616617
)
617618

618-
# annotated_screenshot = await self.get_screenshot(full_page=True)
619+
annotated_screenshot = await self.get_screenshot(full_page=True)
619620

620621
elements_as_markdown = []
621622
group_element_count = {}
@@ -656,6 +657,7 @@ async def infer_similar_items_selector(
656657
- The URL of the page.
657658
- The title of the page.
658659
- An array of the most contextful elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.
660+
- An annotated screenshot of the target page where the most contextful elements are surrounded with rectangular bounding boxes in different colors. At the top left of each bounding box is a small rectangle in the same color as the bounding box. This is the label and it contains a number indicating the ID of that box. The label number starts from 0.
659661
660662
## Element Object
661663
@@ -669,7 +671,7 @@ async def infer_similar_items_selector(
669671
670672
## Instructions
671673
672-
Follow the instructions to determine whether there is a pagination button on the current page for navigating to the next page:
674+
Follow the instructions to determine whether there are similar elements representing the most meaningful list of items:
673675
1. Examine the URL, and the title of the page to understand the context, and then think about what the current page is.
674676
2. Go through the elements array, grab the semantic information of the elements via the "content" property. Pay attention to the elements with the same group ID as they are under the same parent element.
675677
3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
@@ -691,12 +693,12 @@ async def infer_similar_items_selector(
691693
ensure_ascii=False,
692694
),
693695
},
694-
# {
695-
# "type": "image_url",
696-
# "image_url": {
697-
# "url": annotated_screenshot,
698-
# },
699-
# },
696+
{
697+
"type": "image_url",
698+
"image_url": {
699+
"url": annotated_screenshot,
700+
},
701+
},
700702
],
701703
),
702704
],

0 commit comments

Comments
 (0)