Skip to content

Commit 1911889

Browse files
committed
feat(scraper): add matched_hashes to output
1 parent a6889ba commit 1911889

File tree

2 files changed

+20
-3
lines changed

npiai/tools/web/scraper/__test__/incremental.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ async def summarize(skip_item_hashes: Set[str] | None = None):
1717
scraping_type="list-like",
1818
ancestor_selector=".playbook_list",
1919
items_selector=".playbook_list .playbook_item",
20-
limit=5,
20+
limit=20,
21+
concurrency=2,
2122
skip_item_hashes=skip_item_hashes,
2223
output_columns=[
2324
{

npiai/tools/web/scraper/app.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,15 @@ class SummaryItem(TypedDict):
4646

4747
class SummaryChunk(TypedDict):
4848
batch_id: int
49+
matched_hashes: List[str]
4950
items: List[SummaryItem]
5051

5152

5253
@dataclass
5354
class ParsedResult:
5455
markdown: str
5556
hashes: List[str]
57+
matched_hashes: List[str]
5658

5759

5860
__ID_COLUMN__ = Column(
@@ -201,6 +203,7 @@ async def run_batch():
201203
await results_queue.put(
202204
{
203205
"batch_id": current_index,
206+
"matched_hashes": parsed_result.matched_hashes,
204207
"items": items_slice,
205208
}
206209
)
@@ -432,7 +435,9 @@ async def _parse_items(
432435

433436
sections = []
434437
hashes = []
438+
matched_hashes = []
435439
count = 0
440+
436441
marking_tasks = []
437442

438443
# use element handles here to snapshot the items
@@ -441,6 +446,7 @@ async def _parse_items(
441446
markdown, md5 = self._html_to_md_and_hash(html)
442447

443448
if skip_item_hashes and md5 in skip_item_hashes:
449+
matched_hashes.append(md5)
444450
continue
445451

446452
# mark the item as visited
@@ -464,7 +470,11 @@ async def _parse_items(
464470

465471
await asyncio.gather(*marking_tasks)
466472

467-
return ParsedResult(markdown="\n".join(sections), hashes=hashes)
473+
return ParsedResult(
474+
markdown="\n".join(sections),
475+
hashes=hashes,
476+
matched_hashes=matched_hashes,
477+
)
468478

469479
async def _parse_ancestor(
470480
self,
@@ -518,12 +528,14 @@ async def _parse_ancestor(
518528

519529
sections = []
520530
hashes = []
531+
matched_hashes = []
521532
count = 0
522533

523534
for html in htmls:
524535
markdown, md5 = self._html_to_md_and_hash(html)
525536

526537
if skip_item_hashes and md5 in skip_item_hashes:
538+
matched_hashes.append(md5)
527539
continue
528540

529541
sections.append(f'<section id="{count}">\n{markdown}\n</section>')
@@ -533,7 +545,11 @@ async def _parse_ancestor(
533545
if not count:
534546
return None
535547

536-
return ParsedResult(markdown="\n".join(sections), hashes=hashes)
548+
return ParsedResult(
549+
markdown="\n".join(sections),
550+
hashes=hashes,
551+
matched_hashes=matched_hashes,
552+
)
537553

538554
@staticmethod
539555
def _html_to_md_and_hash(html):

0 commit comments

Comments (0)