Commit

support es in word_tokenizer
kuo committed Feb 9, 2021
1 parent a96f610 commit 58c6b73
Showing 1 changed file with 11 additions and 4 deletions.
transcription_compare/tokenizer/word_tokenizer.py (11 additions, 4 deletions)
@@ -2,17 +2,17 @@
 from nltk.tokenize import word_tokenize
 from ..tokens import Token
 import re
+import string
 # brackets_allowed = ['[', ']', ')', ">", '(', "<"]
 
 FILL_WORD_LIST = {"um", "mhmm", "hmm", "uh", "huh"}
 
 whitespace = ' \t\n\r\v\f'
-ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
-ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-ascii_letters = ascii_lowercase + ascii_uppercase
+ascii_letters = string.ascii_letters
 digits = '0123456789'
 pun = "'"
 allow_character = set(digits + ascii_letters + whitespace + pun)
+es_allow_character = set(digits + ascii_letters + whitespace + "ñáéíóúü")
 
 
 class TokenWithPrePostFlag:
@@ -23,6 +23,9 @@ def __init__(self, word, should_put_to_pre_post):
 
 class WordTokenizer(AbstractTokenizer):
 
+    def __init__(self, lang="en"):
+        self.lang = lang
+
     def tokenize(self, token_string, brackets_list=None, to_lower=False, remove_punctuation=False, use_alternative_spelling=False):
         """
         :param brackets_list,
@@ -41,8 +44,12 @@ def clean_words_dont_have_brackets(s):
             # punctuation = r"""!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""
             # s = s.translate(str.maketrans('', '', punctuation))
             new_s = ''
+            if self.lang == "en":
+                ac = allow_character
+            else:
+                ac = es_allow_character
             for one_character in s:
-                if one_character in allow_character:
+                if one_character in ac:
                     new_s += one_character
                 else:
                     new_s += " "
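The practical effect of the change is easiest to see in isolation: with lang="es", accented vowels and "ñ" survive the character whitelist instead of being replaced by spaces. Below is a minimal standalone sketch of that filtering step, reusing the whitelists from the diff; the clean() helper is illustrative only and is not part of the module's API.

    import string

    digits = '0123456789'
    whitespace = ' \t\n\r\v\f'
    allow_character = set(digits + string.ascii_letters + whitespace + "'")
    es_allow_character = set(digits + string.ascii_letters + whitespace + "ñáéíóúü")

    def clean(s, lang="en"):
        # Keep whitelisted characters and replace everything else with a space,
        # mirroring clean_words_dont_have_brackets in the diff above.
        ac = allow_character if lang == "en" else es_allow_character
        return ''.join(c if c in ac else ' ' for c in s)

    print(clean("¿Qué señal?", lang="en"))  # ' Qu  se al '
    print(clean("¿Qué señal?", lang="es"))  # ' Qué señal '

Note that, as committed, es_allow_character omits the apostrophe and lists only lowercase accented letters, so uppercase accented input would need to be lowered first (tokenize already exposes a to_lower flag).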

0 comments on commit 58c6b73