From 42fdc1ee3f82fbd79e8927126f29d7f45ea686b7 Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Thu, 20 Jun 2024 21:03:26 +0100 Subject: [PATCH] Log document URLs --- src/dom_tokenizers/pre_tokenizers/dom_snapshot.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py index f0849fe..3c1b636 100644 --- a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py +++ b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py @@ -1,3 +1,5 @@ +import logging + from collections import defaultdict from dataclasses import make_dataclass from xml.dom import Node @@ -11,6 +13,8 @@ from .splitter import TextSplitter, Flags as Split from .token_buffer import TokenBuffer +logger = logging.getLogger(__name__) + class DOMSnapshotPreTokenizer(PreTokenizer): """Pre-tokenizer that consumes JSON-serialized DOM snapshots @@ -29,7 +33,12 @@ def pre_tokenize_dom(self, buf: TokenBuffer, serialized: str): split = TokenCache(snapshot["strings"], self._splitter).get - for document in snapshot["documents"]: + for doc_index, document in enumerate(snapshot["documents"]): + logger.info( + "doc %d: %s", + doc_index, + snapshot["strings"][document["documentURL"]]) + stack = [self._SENTINEL] for node in _Node.each(document["nodes"]): while stack[-1].index != node.parent_index: