Skip to content

Commit

Permalink
Handle multiply-escaped input
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 19, 2024
1 parent 3db0b71 commit 4315666
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dom-tokenizers"
version = "0.0.16"
version = "0.0.17"
authors = [{ name = "Gary Benson", email = "gary@gbenson.net" }]
description = "DOM-aware tokenization for 🤗 Hugging Face language models"
readme = "README.md"
Expand Down
9 changes: 7 additions & 2 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,14 @@ def _sub_js_escape(self, splits, cursor):
cursor_limit = cursor + 1

# Ensure `curr` holds a complete sequence, minus the initial backslash
# or backslashes. (All leading backslashes are stripped at once so that
# multiply-escaped text is handled: we are de-escaping out of context,
# so we cannot tell in which order the layers of escaping were applied.)
curr = curr.lstrip("\\")
if len(curr) > 1:
# Trim the initial backslash
curr = curr[1:]
# Backslash followed by nonword
pass
elif cursor_limit >= len(splits):
# Terminal backslash
splits.pop(cursor)
Expand Down
11 changes: 8 additions & 3 deletions tests/test_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def test_first_split_re(text, expect_splits):
(r"hell\u006f\u020\u0077orld", ["hello", "u020world"]), # mixd {,in}valid
(r"hello\'\u0020world", ["hello", "world"]),
# XXX N.B. Javascript is UTF-16 internal, so, surrogates?
(r"\\u0041", ["A"]),
(r"\\u0042\u0043", ["BC"]),
(r"\u0044\\u0045", ["DE"]),
# Javascript hex escapes
(r"hello\x20world", ["hello", "world"]),
Expand Down Expand Up @@ -163,11 +166,11 @@ def test_first_split_re(text, expect_splits):
(r"hello,\'world", ["hello", "world"]),
(r"hello,\"world", ["hello", "world"]),
(r"hello\\\'\x77orld", ["hello", "world"]),
(r"hello\\\'\x77orld", ["hello'world"]),
(r"hello\\\"\x77orld", ["hello", "world"]),
(r"hello\\\'%77orld", ["hello", "world"]),
(r"hello\\\'%77orld", ["hello'world"]),
(r"hello\\\"%77orld", ["hello", "world"]),
(r"hello\\\'world", ["hello", "world"]),
(r"hello\\\'world", ["hello'world"]),
(r"hello\\\"world", ["hello", "world"]),
("hello'world", ["hello'world"]),
Expand Down Expand Up @@ -251,6 +254,8 @@ def test_sub_js_escape_crasher():
["src", "url", "fonts", "gstatic", "com", "s", "roboto", "v18",
"[BASE64]", "woff2", "format", "woff2", "unicode",
"range", "U", "0000", "00FF"]),
(r"kNEu9lE8g2RGVVvZ6clo\\u003d\x22,1,0,null",
["[BASE64]", "1", "0", "null"]),
))
def test_regressions(text, expect_tokens):
"""Check that things we improve stay improved.
Expand Down

0 comments on commit 4315666

Please sign in to comment.