Skip to content

Commit

Permalink
Log document URLs
Browse files (browse the repository at this point in the history)
  • Loading branch information
gbenson committed Jun 20, 2024
1 parent fd6df64 commit 42fdc1e
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
import logging

from collections import defaultdict
from dataclasses import make_dataclass
from xml.dom import Node
Expand All @@ -11,6 +13,8 @@
from .splitter import TextSplitter, Flags as Split
from .token_buffer import TokenBuffer

logger = logging.getLogger(__name__)


class DOMSnapshotPreTokenizer(PreTokenizer):
"""Pre-tokenizer that consumes JSON-serialized DOM snapshots
Expand All @@ -29,7 +33,12 @@ def pre_tokenize_dom(self, buf: TokenBuffer, serialized: str):

split = TokenCache(snapshot["strings"], self._splitter).get

- for document in snapshot["documents"]:
+ for doc_index, document in enumerate(snapshot["documents"]):
+ logger.info(
+ "doc %d: %s",
+ doc_index,
+ snapshot["strings"][document["documentURL"]])
+
stack = [self._SENTINEL]
for node in _Node.each(document["nodes"]):
while stack[-1].index != node.parent_index:
Expand Down

0 comments on commit 42fdc1e

Please sign in to comment.