CURLY_APOSTROPHE = "\u2019"

# Characters that may join two words into one token ("don't", "e.g.",
# "snake_case", "well-known").
COMBINERS = "'-._"
# Combiners as they appear *before* normalization (curly apostrophes
# have not yet been straightened to ASCII).
PRENORMALIZED_COMBINERS = {CURLY_APOSTROPHE}.union(COMBINERS)


def normalize_word(word):
    """Return *word* lowercased, straightening a curly apostrophe.

    A lone curly apostrophe becomes an ASCII one; everything else is
    simply lowercased.
    """
    if word == CURLY_APOSTROPHE:
        return "'"
    return word.lower()


def filter_tokens(tokens):
    """Normalize and filter a pre-tokenized stream.

    *tokens* is an iterable of ``(word, (start, end))`` pairs.  Words
    are lowercased and curly apostrophes straightened.  A combiner
    (apostrophe, hyphen, dot, underscore) is kept only when it sits in
    immediate offset contact with an alphanumeric word on BOTH sides —
    i.e. it genuinely joins two words.  All other punctuation is
    dropped.

      "I don ' t like fish , chips or peas ."
      => "i don ' t like fish chips or peas"

    Returns a list of ``(word, (start, end))`` tuples.
    NOTE(review): unlike the pre-patch code this no longer drops
    non-ASCII words; ``str.isalnum`` accepts them — confirm intended.
    """
    # Mutable copy so combiners can be zapped (word set to "") in place.
    tokens = [[word, offsets] for word, offsets in tokens]

    combiners = [
        (index, token)
        for index, token in enumerate(tokens)
        if token[0] in PRENORMALIZED_COMBINERS
    ]
    # First and last positions have only one neighbour, so a combiner
    # there can never join two words.
    edge_indexes = {0, len(tokens) - 1}
    for index, token in combiners:
        if index in edge_indexes:
            token[0] = ""
            continue

        # Bug fixed from WIP: unpack the token itself (was token[1])
        # and look at the *next* token (was tokens[index - 1]).
        word, offsets = token
        prev_word, prev_offsets = tokens[index - 1]
        next_word, next_offsets = tokens[index + 1]

        # Not in immediate contact with both neighbours? Zap.
        if prev_offsets[1] != offsets[0] or offsets[1] != next_offsets[0]:
            token[0] = ""
            continue

        # Neighbours must both be plain alphanumeric words; a zapped or
        # punctuation neighbour disqualifies the combiner.
        if not (prev_word.isalnum() and next_word.isalnum()):
            token[0] = ""

    # Lowercase, straighten apostrophes, and drop anything that is
    # neither alphanumeric nor a surviving combiner (zapped tokens have
    # word == "" and fall out here too).
    return [
        (normalize_word(word), offsets)
        for word, offsets in tokens
        if word.isalnum() or word in PRENORMALIZED_COMBINERS
    ]