@@ -46,13 +46,15 @@ class SummaryItem(TypedDict):
46
46
47
47
class SummaryChunk (TypedDict ):
48
48
batch_id : int
49
+ matched_hashes : List [str ]
49
50
items : List [SummaryItem ]
50
51
51
52
52
53
@dataclass
53
54
class ParsedResult :
54
55
markdown : str
55
56
hashes : List [str ]
57
+ matched_hashes : List [str ]
56
58
57
59
58
60
__ID_COLUMN__ = Column (
@@ -201,6 +203,7 @@ async def run_batch():
201
203
await results_queue .put (
202
204
{
203
205
"batch_id" : current_index ,
206
+ "matched_hashes" : parsed_result .matched_hashes ,
204
207
"items" : items_slice ,
205
208
}
206
209
)
@@ -432,7 +435,9 @@ async def _parse_items(
432
435
433
436
sections = []
434
437
hashes = []
438
+ matched_hashes = []
435
439
count = 0
440
+
436
441
marking_tasks = []
437
442
438
443
# use element handles here to snapshot the items
@@ -441,6 +446,7 @@ async def _parse_items(
441
446
markdown , md5 = self ._html_to_md_and_hash (html )
442
447
443
448
if skip_item_hashes and md5 in skip_item_hashes :
449
+ matched_hashes .append (md5 )
444
450
continue
445
451
446
452
# mark the item as visited
@@ -464,7 +470,11 @@ async def _parse_items(
464
470
465
471
await asyncio .gather (* marking_tasks )
466
472
467
- return ParsedResult (markdown = "\n " .join (sections ), hashes = hashes )
473
+ return ParsedResult (
474
+ markdown = "\n " .join (sections ),
475
+ hashes = hashes ,
476
+ matched_hashes = matched_hashes ,
477
+ )
468
478
469
479
async def _parse_ancestor (
470
480
self ,
@@ -518,12 +528,14 @@ async def _parse_ancestor(
518
528
519
529
sections = []
520
530
hashes = []
531
+ matched_hashes = []
521
532
count = 0
522
533
523
534
for html in htmls :
524
535
markdown , md5 = self ._html_to_md_and_hash (html )
525
536
526
537
if skip_item_hashes and md5 in skip_item_hashes :
538
+ matched_hashes .append (md5 )
527
539
continue
528
540
529
541
sections .append (f'<section id="{ count } ">\n { markdown } \n </section>' )
@@ -533,7 +545,11 @@ async def _parse_ancestor(
533
545
if not count :
534
546
return None
535
547
536
- return ParsedResult (markdown = "\n " .join (sections ), hashes = hashes )
548
+ return ParsedResult (
549
+ markdown = "\n " .join (sections ),
550
+ hashes = hashes ,
551
+ matched_hashes = matched_hashes ,
552
+ )
537
553
538
554
@staticmethod
539
555
def _html_to_md_and_hash (html ):
0 commit comments