CURLY_APOSTROPHE = "\u2019"

# Characters that may join two words into one token ("don't", "e.g.",
# "snake_case", "well-known").
COMBINERS = "'-._"
# Combiners as they appear *before* normalization (curly apostrophes
# have not yet been straightened to ASCII).
PRENORMALIZED_COMBINERS = {CURLY_APOSTROPHE}.union(COMBINERS)


def normalize_word(word):
    """Return *word* lowercased, straightening a curly apostrophe.

    A lone curly apostrophe becomes an ASCII one; everything else is
    simply lowercased.
    """
    if word == CURLY_APOSTROPHE:
        return "'"
    return word.lower()


def filter_tokens(tokens):
    """Normalize and filter a pre-tokenized stream.

    *tokens* is an iterable of ``(word, (start, end))`` pairs.  Words
    are lowercased and curly apostrophes straightened.  A combiner
    (apostrophe, hyphen, dot, underscore) is kept only when it sits in
    immediate offset contact with an alphanumeric word on BOTH sides —
    i.e. it genuinely joins two words.  All other punctuation is
    dropped.

      "I don ' t like fish , chips or peas ."
      => "i don ' t like fish chips or peas"

    Returns a list of ``(word, (start, end))`` tuples.
    NOTE(review): unlike the pre-patch code this no longer drops
    non-ASCII words; ``str.isalnum`` accepts them — confirm intended.
    """
    # Mutable copy so combiners can be zapped (word set to "") in place.
    tokens = [[word, offsets] for word, offsets in tokens]

    combiners = [
        (index, token)
        for index, token in enumerate(tokens)
        if token[0] in PRENORMALIZED_COMBINERS
    ]
    # First and last positions have only one neighbour, so a combiner
    # there can never join two words.
    edge_indexes = {0, len(tokens) - 1}
    for index, token in combiners:
        if index in edge_indexes:
            token[0] = ""
            continue

        # Bug fixed from WIP: unpack the token itself (was token[1])
        # and look at the *next* token (was tokens[index - 1]).
        word, offsets = token
        prev_word, prev_offsets = tokens[index - 1]
        next_word, next_offsets = tokens[index + 1]

        # Not in immediate contact with both neighbours? Zap.
        if prev_offsets[1] != offsets[0] or offsets[1] != next_offsets[0]:
            token[0] = ""
            continue

        # Neighbours must both be plain alphanumeric words; a zapped or
        # punctuation neighbour disqualifies the combiner.
        if not (prev_word.isalnum() and next_word.isalnum()):
            token[0] = ""

    # Lowercase, straighten apostrophes, and drop anything that is
    # neither alphanumeric nor a surviving combiner (zapped tokens have
    # word == "" and fall out here too).
    return [
        (normalize_word(word), offsets)
        for word, offsets in tokens
        if word.isalnum() or word in PRENORMALIZED_COMBINERS
    ]