Commit
maybe need a tokensequence class
gbenson committed May 20, 2024
1 parent 9a8a117 commit b992fd2
Showing 1 changed file with 85 additions and 6 deletions.
91 changes: 85 additions & 6 deletions src/dom_tokenizers/pre_tokenizers/pipeline.py
@@ -1,9 +1,88 @@
from itertools import chain

CURLY_APOSTROPHE = "\u2019"

COMBINERS = "'-._"
PRENORMALIZED_COMBINERS = {CURLY_APOSTROPHE}.union(COMBINERS)


def normalize_word(word):
    """Lowercase letters, straighten curly apostrophes.
    """
    if word == CURLY_APOSTROPHE:
        return "'"
    return word.lower()


def zap(token):
    # Blank the token's word so the final filter drops it.
    token[0] = ""


def filter_tokens(tokens):
    # Normalize curly apostrophes into ASCII ones.
    tokens = ((w.replace(CURLY_APOSTROPHE, "'"), o) for w, o in tokens)
    # Make a mutable copy.
    tokens = [[word, offsets] for word, offsets in tokens]

    # Zap combiners we won't be combining.
    combiners = [
        (index, token)
        for index, token in enumerate(tokens)
        if token[0] in PRENORMALIZED_COMBINERS
    ]
    bad_indexes = {0, len(tokens) - 1}
    for index, token in combiners:
        # At the start or end of the sequence? Zap!
        if index in bad_indexes:
            zap(token)
            continue

        # Not in immediate contact with both neighbours? Zap!
        word, offsets = token
        prev_word, prev_offsets = tokens[index - 1]
        if prev_offsets[1] != offsets[0]:
            zap(token)
            continue
        next_word, next_offsets = tokens[index + 1]
        if offsets[1] != next_offsets[0]:
            zap(token)
            continue

        # Curly apostrophe? Straighten it.
        if word == CURLY_APOSTROPHE:
            token[0] = "'"

        # Wrong type of neighbours? Zap!
        for neighbour in (prev_word, next_word):
            if not neighbour.isalnum():
                zap(token)
                break

    if combiners:
        print("tokens:", tokens)
        print("combiners:", combiners)
        print()

    # Lowercase letters, straighten curly apostrophes, and remove any
    # non-alphanumeric word that isn't a combiner, e.g.:
    #   "I don ' t like fish , chips or peas ."
    #     => "i don ' t like fish chips or peas "
    tokens = [
        (normalize_word(word), offsets)
        for word, offsets in tokens
        if word.isalnum() or word in PRENORMALIZED_COMBINERS
    ]

    # TODO: recombine split words here, perhaps by mapping
    # recombine_or_drop over (prev, token, next) triples.
    return tokens

    # Drop punctuation
    tokens = ((w, o) for w, o in tokens if w.replace("'", "").isalnum())

    # Drop non-ascii, then lowercase
    return ((w.lower(), o) for w, o in tokens if w.isascii())
def recombine_or_drop(prev_token, token, next_token):
    raise NotImplementedError  # stub: body not yet written
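For context, a usage sketch (not part of the commit) tracing the new filter_tokens. The (word, (start, end)) token shape is an assumption read off the offset checks above, and the input words and offsets are illustrative:

# Hypothetical input, assuming (word, (start, end)) pairs from an
# upstream pre-tokenizer over "I don't like fish, chips".
tokens = [
    ("I", (0, 1)),
    ("don", (2, 5)),
    ("\u2019", (5, 6)),  # curly apostrophe, touching both neighbours
    ("t", (6, 7)),
    ("like", (8, 12)),
    ("fish", (13, 17)),
    (",", (17, 18)),     # bare punctuation: not a combiner, dropped
    ("chips", (19, 24)),
]
print(filter_tokens(tokens))
# (debug print output omitted)
# [('i', (0, 1)), ('don', (2, 5)), ("'", (5, 6)), ('t', (6, 7)),
#  ('like', (8, 12)), ('fish', (13, 17)), ('chips', (19, 24))]

The apostrophe survives because it sits flush between two alphanumeric neighbours; the comma is neither alphanumeric nor a combiner, so the final comprehension drops it.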
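Since the commit message floats the idea of a token-sequence class, here is one speculative shape such a class might take. Nothing like it exists in this commit; it just gathers the mutable-copy and neighbour-adjacency logic that filter_tokens currently does inline:

class TokenSequence:
    """Speculative sketch only: owns the mutable [word, offsets]
    pairs and the neighbour tests used inline above."""

    def __init__(self, tokens):
        self._tokens = [[word, offsets] for word, offsets in tokens]

    def neighbours(self, index):
        # Return (prev, next) tokens, or None at either boundary.
        prev = self._tokens[index - 1] if index > 0 else None
        nxt = (self._tokens[index + 1]
               if index < len(self._tokens) - 1 else None)
        return prev, nxt

    def touches_both_neighbours(self, index):
        # True if this token abuts both neighbours with no gap.
        prev, nxt = self.neighbours(index)
        if prev is None or nxt is None:
            return False
        _, (start, end) = self._tokens[index]
        return prev[1][1] == start and end == nxt[1][0]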
