Commit

support es in word_tokenizer
kuo committed Feb 9, 2021
1 parent a96f610 commit 58c6b73
Showing 1 changed file with 11 additions and 4 deletions.
transcription_compare/tokenizer/word_tokenizer.py (11 additions, 4 deletions)
@@ -2,17 +2,17 @@
 from nltk.tokenize import word_tokenize
 from ..tokens import Token
 import re
+import string
 # brackets_allowed = ['[', ']', ')', ">", '(', "<"]
 
 FILL_WORD_LIST = {"um", "mhmm", "hmm", "uh", "huh"}
 
 whitespace = ' \t\n\r\v\f'
-ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
-ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-ascii_letters = ascii_lowercase + ascii_uppercase
+ascii_letters = string.ascii_letters
 digits = '0123456789'
 pun = "'"
 allow_character = set(digits + ascii_letters + whitespace + pun)
+es_allow_character = set(digits + ascii_letters + whitespace + "ñáéíóúü")
 
 
 class TokenWithPrePostFlag:
@@ -23,6 +23,9 @@ def __init__(self, word, should_put_to_pre_post):
 
 class WordTokenizer(AbstractTokenizer):
 
+    def __init__(self, lang="en"):
+        self.lang = lang
+
     def tokenize(self, token_string, brackets_list=None, to_lower=False, remove_punctuation=False, use_alternative_spelling=False):
         """
         :param brackets_list,
@@ -41,8 +44,12 @@ def clean_words_dont_have_brackets(s):
             # punctuation = r"""!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""
             # s = s.translate(str.maketrans('', '', punctuation))
             new_s = ''
+            if self.lang == "en":
+                ac = allow_character
+            else:
+                ac = es_allow_character
             for one_character in s:
-                if one_character in allow_character:
+                if one_character in ac:
                     new_s += one_character
                 else:
                     new_s += " "
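The practical effect of the change is easiest to see in isolation: with lang="es", accented vowels and "ñ" survive the character whitelist instead of being replaced by spaces. Below is a minimal standalone sketch of that filtering step, reusing the whitelists from the diff; the clean() helper is illustrative only and is not part of the module's API.

    import string

    digits = '0123456789'
    whitespace = ' \t\n\r\v\f'
    allow_character = set(digits + string.ascii_letters + whitespace + "'")
    es_allow_character = set(digits + string.ascii_letters + whitespace + "ñáéíóúü")

    def clean(s, lang="en"):
        # Keep whitelisted characters and replace everything else with a space,
        # mirroring clean_words_dont_have_brackets in the diff above.
        ac = allow_character if lang == "en" else es_allow_character
        return ''.join(c if c in ac else ' ' for c in s)

    print(clean("¿Qué señal?", lang="en"))  # ' Qu  se al '
    print(clean("¿Qué señal?", lang="es"))  # ' Qué señal '

Note that, as committed, es_allow_character omits the apostrophe and lists only lowercase accented letters, so uppercase accented input would need to be lowered first (tokenize already exposes a to_lower flag).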

0 comments on commit 58c6b73