Handle multiply-escaped input

gbenson · Jun 19, 2024 · 4315666 · 4315666
1 parent 3db0b71
commit 4315666
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 6 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dom-tokenizers"
-version = "0.0.16"
+version = "0.0.17"
 authors = [{ name = "Gary Benson", email = "gary@gbenson.net" }]
 description = "DOM-aware tokenization for 🤗 Hugging Face language models"
 readme = "README.md"

diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -303,9 +303,14 @@ def _sub_js_escape(self, splits, cursor):
         cursor_limit = cursor + 1
 
         # Ensure `curr` holds a complete sequence, minus the initial backslash
+        # or backslashes.  (Multiply-escaped text can put several backslashes
+        # in front of one sequence.  We have to strip them all at once: we're
+        # de-escaping out of context, so we can't determine the order in
+        # which the individual unescaping steps should be applied.)
+        curr = curr.lstrip("\\")
         if len(curr) > 1:
-            # Trim the initial backslash
-            curr = curr[1:]
+            # Backslash followed by nonword
+            pass
         elif cursor_limit >= len(splits):
             # Terminal backslash
             splits.pop(cursor)

diff --git a/tests/test_splitter.py b/tests/test_splitter.py
@@ -85,6 +85,9 @@ def test_first_split_re(text, expect_splits):
      (r"hell\u006f\u020\u0077orld", ["hello", "u020world"]),  # mixd {,in}valid
      (r"hello\'\u0020world", ["hello", "world"]),
      # XXX N.B. Javascript is UTF-16 internal, so, surrogates?
+     (r"\\u0041", ["A"]),
+     (r"\\u0042\u0043", ["BC"]),
+     (r"\u0044\\u0045", ["DE"]),
 
      # Javascript hex escapes
      (r"hello\x20world", ["hello", "world"]),
@@ -163,11 +166,11 @@ def test_first_split_re(text, expect_splits):
      (r"hello,\'&#119;orld", ["hello", "world"]),
      (r"hello,\"&#119;orld", ["hello", "world"]),
 
-     (r"hello\\\'\x77orld", ["hello", "world"]),
+     (r"hello\\\'\x77orld", ["hello'world"]),
      (r"hello\\\"\x77orld", ["hello", "world"]),
-     (r"hello\\\'%77orld", ["hello", "world"]),
+     (r"hello\\\'%77orld", ["hello'world"]),
      (r"hello\\\"%77orld", ["hello", "world"]),
-     (r"hello\\\'&#119;orld", ["hello", "world"]),
+     (r"hello\\\'&#119;orld", ["hello'world"]),
      (r"hello\\\"&#119;orld", ["hello", "world"]),
 
      ("hell&#111;&apos;world", ["hello'world"]),
@@ -251,6 +254,8 @@ def test_sub_js_escape_crasher():
       ["src", "url", "fonts", "gstatic", "com", "s", "roboto", "v18",
        "[BASE64]", "woff2", "format", "woff2", "unicode",
        "range", "U", "0000", "00FF"]),
+     (r"kNEu9lE8g2RGVVvZ6clo\\u003d\x22,1,0,null",
+      ["[BASE64]", "1", "0", "null"]),
      ))
 def test_regressions(text, expect_tokens):
     """Check that things we improve stay improved.