Commit
maybe need a tokensequence class
gbenson committed May 20, 2024
1 parent 9a8a117 commit b992fd2
Showing 1 changed file with 85 additions and 6 deletions.
91 changes: 85 additions & 6 deletions src/dom_tokenizers/pre_tokenizers/pipeline.py
@@ -1,9 +1,88 @@
from itertools import chain

CURLY_APOSTROPHE = "\u2019"

COMBINERS = "'-._"
PRENORMALIZED_COMBINERS = {CURLY_APOSTROPHE}.union(COMBINERS)


def normalize_word(word):
    """Lowercase letters, straighten curly apostrophes.
    """
    if word == CURLY_APOSTROPHE:
        return "'"
    return word.lower()


def zap(token):
    # Blank the token's word so the final filter drops it.
    token[0] = ""


def filter_tokens(tokens):
    # Normalize curly apostrophes into ASCII ones.
    tokens = ((w.replace(CURLY_APOSTROPHE, "'"), o) for w, o in tokens)
    # Make a mutable copy.
    tokens = [[word, offsets] for word, offsets in tokens]

    # Zap combiners we won't be combining.
    combiners = [
        (index, token)
        for index, token in enumerate(tokens)
        if token[0] in PRENORMALIZED_COMBINERS
    ]
    bad_indexes = {0, len(tokens) - 1}
    for index, token in combiners:
        # At the start or end of the sequence? Zap!
        if index in bad_indexes:
            zap(token)
            continue

        # Not in immediate contact with both neighbours? Zap!
        word, offsets = token
        prev_word, prev_offsets = tokens[index - 1]
        if prev_offsets[1] != offsets[0]:
            zap(token)
            continue
        next_word, next_offsets = tokens[index + 1]
        if offsets[1] != next_offsets[0]:
            zap(token)
            continue

        # Curly apostrophe? Straighten it.
        if word == CURLY_APOSTROPHE:
            token[0] = "'"

        # Wrong type of neighbours? Zap!
        for neighbour in (prev_word, next_word):
            if not neighbour.isalnum():
                zap(token)
                break

    if combiners:
        print("tokens:", tokens)
        print("combiners:", combiners)
        print()

    # Lowercase letters, straighten curly apostrophes, and remove any
    # non-alphanumeric word that isn't a combiner, e.g.:
    #   "I don ' t like fish , chips or peas ."
    #     => "i don ' t like fish chips or peas "
    tokens = [
        (normalize_word(word), offsets)
        for word, offsets in tokens
        if word.isalnum() or word in PRENORMALIZED_COMBINERS
    ]

    # TODO: recombine split words here, perhaps by mapping
    # recombine_or_drop over (prev, token, next) triples.
    return tokens

    # Drop punctuation
    tokens = ((w, o) for w, o in tokens if w.replace("'", "").isalnum())

    # Drop non-ascii, then lowercase
    return ((w.lower(), o) for w, o in tokens if w.isascii())
def recombine_or_drop(prev_token, token, next_token):
    raise NotImplementedError  # stub: body not yet written
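For context, a usage sketch (not part of the commit) tracing the new filter_tokens. The (word, (start, end)) token shape is an assumption read off the offset checks above, and the input words and offsets are illustrative:

# Hypothetical input, assuming (word, (start, end)) pairs from an
# upstream pre-tokenizer over "I don't like fish, chips".
tokens = [
    ("I", (0, 1)),
    ("don", (2, 5)),
    ("\u2019", (5, 6)),  # curly apostrophe, touching both neighbours
    ("t", (6, 7)),
    ("like", (8, 12)),
    ("fish", (13, 17)),
    (",", (17, 18)),     # bare punctuation: not a combiner, dropped
    ("chips", (19, 24)),
]
print(filter_tokens(tokens))
# (debug print output omitted)
# [('i', (0, 1)), ('don', (2, 5)), ("'", (5, 6)), ('t', (6, 7)),
#  ('like', (8, 12)), ('fish', (13, 17)), ('chips', (19, 24))]

The apostrophe survives because it sits flush between two alphanumeric neighbours; the comma is neither alphanumeric nor a combiner, so the final comprehension drops it.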
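Since the commit message floats the idea of a token-sequence class, here is one speculative shape such a class might take. Nothing like it exists in this commit; it just gathers the mutable-copy and neighbour-adjacency logic that filter_tokens currently does inline:

class TokenSequence:
    """Speculative sketch only: owns the mutable [word, offsets]
    pairs and the neighbour tests used inline above."""

    def __init__(self, tokens):
        self._tokens = [[word, offsets] for word, offsets in tokens]

    def neighbours(self, index):
        # Return (prev, next) tokens, or None at either boundary.
        prev = self._tokens[index - 1] if index > 0 else None
        nxt = (self._tokens[index + 1]
               if index < len(self._tokens) - 1 else None)
        return prev, nxt

    def touches_both_neighbours(self, index):
        # True if this token abuts both neighbours with no gap.
        prev, nxt = self.neighbours(index)
        if prev is None or nxt is None:
            return False
        _, (start, end) = self._tokens[index]
        return prev[1][1] == start and end == nxt[1][0]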
