keras tokenizer import error caused by the deprecation of keras_preprocessing is resolved
meliksahturker committed Mar 2, 2023
1 parent f4c826d commit a7ad94d
Showing 24 changed files with 55 additions and 61 deletions.
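The fix replaces pickled Keras `Tokenizer` objects (which fail to unpickle once `keras_preprocessing` is no longer importable) with JSON resources loaded through a new `load_keras_tokenizer` helper in `vnlp/utils`. The helper itself is not part of the hunks shown below; a minimal sketch, assuming it simply wraps Keras' `tokenizer_from_json`, could look like this:

```python
# Hypothetical sketch of the load_keras_tokenizer helper referenced throughout
# this commit; the actual implementation in vnlp/utils is not shown here.
from tensorflow.keras.preprocessing.text import tokenizer_from_json


def load_keras_tokenizer(tokenizer_json_path):
    """Load a Keras Tokenizer that was serialized with Tokenizer.to_json()."""
    with open(tokenizer_json_path, "r", encoding="utf-8") as handle:
        return tokenizer_from_json(handle.read())
```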
5 changes: 1 addition & 4 deletions .readthedocs.yaml
@@ -10,10 +10,7 @@ build:
os: ubuntu-22.04
tools:
python: "3.9"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"


# Build documentation in the docs/ directory with Sphinx
sphinx:
Binary file not shown.
8 changes: 4 additions & 4 deletions vnlp/dependency_parser/spu_context_dp.py
@@ -3,11 +3,12 @@
import pickle

import numpy as np
import tensorflow as tf

import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import dp_pos_to_displacy_format, decode_arc_label_vector
from ._spu_context_utils import create_spucontext_dp_model, process_single_word_input

@@ -30,7 +31,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8 # 0.9995 quantile is 8 for 16k_vocab, 7 for 32k_vocab
@@ -39,8 +40,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
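The diff only shows the loader side; the one-off conversion that produced the new `.json` resources from the old `.pickle` files is not part of this commit. Something along these lines would do it, assuming the old pickles are opened in an environment where `keras_preprocessing` still imports (paths are illustrative):

```python
# Hypothetical one-off migration script (not included in this commit):
# re-serialize an old pickled Keras tokenizer as JSON.
import pickle

with open("DP_label_tokenizer.pickle", "rb") as handle:    # old resource, path illustrative
    tokenizer_label = pickle.load(handle)

with open("DP_label_tokenizer.json", "w", encoding="utf-8") as handle:
    handle.write(tokenizer_label.to_json())                # Tokenizer.to_json() ships with Keras
```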
25 changes: 9 additions & 16 deletions vnlp/dependency_parser/treestack_dp.py
@@ -8,7 +8,7 @@
from ..stemmer_morph_analyzer import StemmerAnalyzer
from ..part_of_speech_tagger import PoSTagger
from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import dp_pos_to_displacy_format, decode_arc_label_vector
from ._treestack_utils import (create_dependency_parser_model, process_single_word_input)

@@ -32,10 +32,10 @@
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/DP_TreeStack_eval.weights"
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/TBWTokenized_word_embedding.matrix"

TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.pickle'))
TOKENIZER_POS_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'part_of_speech_tagger/resources/PoS_label_tokenizer.pickle')) # using the tokenizer of part_of_speech_tagger
TOKENIZER_TAG_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.pickle')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.pickle"
TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.json'))
TOKENIZER_POS_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'part_of_speech_tagger/resources/PoS_label_tokenizer.json')) # using the tokenizer of part_of_speech_tagger
TOKENIZER_TAG_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.json')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.json"

# Data Preprocessing Config
SENTENCE_MAX_LEN = 40
@@ -45,17 +45,10 @@

# Loading Tokenizers
# Have to load tokenizers here because model config depends on them
with open(TOKENIZER_WORD_LOC, 'rb') as handle:
tokenizer_word = pickle.load(handle)

with open(TOKENIZER_POS_LOC, 'rb') as handle:
tokenizer_pos = pickle.load(handle)

with open(TOKENIZER_TAG_LOC, 'rb') as handle: # This is transferred from StemmerAnalyzer
tokenizer_tag = pickle.load(handle)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_word = load_keras_tokenizer(TOKENIZER_WORD_LOC)
tokenizer_pos = load_keras_tokenizer(TOKENIZER_POS_LOC)
tokenizer_tag = load_keras_tokenizer(TOKENIZER_TAG_LOC)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

LABEL_VOCAB_SIZE = len(tokenizer_label.word_index)
POS_VOCAB_SIZE = len(tokenizer_pos.word_index)
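A Tokenizer reloaded from JSON exposes the same attributes and methods as the pickled one, which is why downstream code such as the vocabulary-size lookups above stays untouched. A small self-contained illustration with toy labels (not the project's real data):

```python
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

# Toy round trip: fit on pre-tokenized label sequences, serialize to JSON, reload.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts([["nsubj", "obj", "root"], ["root", "obj"]])
reloaded = tokenizer_from_json(tokenizer.to_json())

assert reloaded.word_index == tokenizer.word_index
print(len(reloaded.word_index))                 # vocabulary size, as used above
print(reloaded.texts_to_sequences([["root"]]))  # the integer index assigned to "root"
```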
13 changes: 5 additions & 8 deletions vnlp/named_entity_recognizer/charner.py
@@ -6,7 +6,7 @@
import tensorflow as tf

from ..tokenizer import WordPunctTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._charner_utils import create_charner_model

@@ -27,8 +27,8 @@
PROD_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_prod.weights"
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_eval.weights"

TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.pickle"
TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.json"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"

CHAR_VOCAB_SIZE = 150
SEQ_LEN_MAX = 256
@@ -74,11 +74,8 @@ def __init__(self, evaluate):
# Set model weights
self.model.set_weights(model_weights)

with open(TOKENIZER_CHAR_LOC, 'rb') as handle:
tokenizer_char = pickle.load(handle)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_char = load_keras_tokenizer(TOKENIZER_CHAR_LOC)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

self.tokenizer_char = tokenizer_char
self.tokenizer_label = tokenizer_label
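CharNER's actual preprocessing lives in the parts of `charner.py` not shown in this hunk. Purely as a hedged sketch of how a character-level Keras tokenizer is typically combined with the `SEQ_LEN_MAX` constant above (import path, resource path, and padding side are all assumptions):

```python
from tensorflow.keras.preprocessing.sequence import pad_sequences
from vnlp.utils import load_keras_tokenizer  # assumed import path for the new helper

# Assumed resource location; the tokenizer is presumed to be char_level=True.
tokenizer_char = load_keras_tokenizer("resources/CharNER_char_tokenizer.json")

char_ids = tokenizer_char.texts_to_sequences(["Ankara Türkiye'nin başkentidir."])
padded = pad_sequences(char_ids, maxlen=256, padding="post")  # SEQ_LEN_MAX = 256
print(padded.shape)  # (1, 256)
```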

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
{"class_name": "Tokenizer", "config": {"num_words": null, "filters": null, "lower": false, "split": " ", "char_level": false, "oov_token": null, "document_count": 2696802, "word_counts": "{\"O\": 2019178, \"ORG\": 251913, \"LOC\": 180990, \"PER\": 244721}", "word_docs": "{\"O\": 2019178, \"ORG\": 251913, \"LOC\": 180990, \"PER\": 244721}", "index_docs": "{\"1\": 2019178, \"2\": 251913, \"4\": 180990, \"3\": 244721}", "index_word": "{\"1\": \"O\", \"2\": \"ORG\", \"3\": \"PER\", \"4\": \"LOC\"}", "word_index": "{\"O\": 1, \"ORG\": 2, \"PER\": 3, \"LOC\": 4}"}}
Binary file not shown.
7 changes: 3 additions & 4 deletions vnlp/named_entity_recognizer/spu_context_ner.py
@@ -7,7 +7,7 @@
import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._spu_context_utils import create_spucontext_ner_model, process_single_word_input

@@ -30,7 +30,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8
@@ -39,8 +39,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
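The SentencePiece word tokenizer is untouched by this commit; only the Keras label tokenizer next to it changes format. For context, a sketch of how the piece/index maps built above are typically exercised (repo-relative model path taken from `SPU_TOKENIZER_WORD_LOC`):

```python
import sentencepiece as spm

spu_tokenizer_word = spm.SentencePieceProcessor("vnlp/resources/SPU_word_tokenizer_16k.model")

# Sub-word pieces for a single word and their vocabulary indices,
# i.e. the rows of the pretrained word-embedding matrix.
pieces = spu_tokenizer_word.encode_as_pieces("başkentidir")
ids = [spu_tokenizer_word.piece_to_id(piece) for piece in pieces]
print(pieces, ids)
```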
2 changes: 1 addition & 1 deletion vnlp/normalizer/normalizer.py
@@ -1,7 +1,7 @@
from typing import List
from pathlib import Path

from hunspell import Hunspell
from spylls.hunspell import Dictionary

from ._deasciifier import Deasciifier
from ..stemmer_morph_analyzer import StemmerAnalyzer
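The spell-checking backend moves from the C-extension `hunspell` package to the pure-Python `spylls`. The rest of `normalizer.py` is not shown here; as a rough sketch, spylls' `Dictionary` is used along these lines (the dictionary files and their name are placeholders, since VNLP ships its own Turkish resources):

```python
from spylls.hunspell import Dictionary

# Placeholder dictionary base name; point this at real .aff/.dic files.
dictionary = Dictionary.from_files("tr_TR")

print(dictionary.lookup("kitap"))          # True if the word is known/correct
print(list(dictionary.suggest("kitapp")))  # candidate corrections
```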
@@ -0,0 +1 @@
{"class_name": "Tokenizer", "config": {"num_words": null, "filters": null, "lower": false, "split": " ", "char_level": false, "oov_token": null, "document_count": 733342, "word_counts": "{\"NOUN\": 243249, \"ADV\": 41668, \"VERB\": 98518, \"PUNCT\": 115705, \"PROPN\": 45376, \"PRON\": 17308, \"ADJ\": 78449, \"ADP\": 16359, \"DET\": 28435, \"CCONJ\": 22146, \"NUM\": 16908, \"AUX\": 7191, \"INTJ\": 644, \"SCONJ\": 729, \"X\": 650, \"SYM\": 6, \"PART\": 1}", "word_docs": "{\"NOUN\": 243249, \"ADV\": 41668, \"VERB\": 98518, \"PUNCT\": 115705, \"PROPN\": 45376, \"PRON\": 17308, \"ADJ\": 78449, \"ADP\": 16359, \"DET\": 28435, \"CCONJ\": 22146, \"NUM\": 16908, \"AUX\": 7191, \"INTJ\": 644, \"SCONJ\": 729, \"X\": 650, \"SYM\": 6, \"PART\": 1}", "index_docs": "{\"1\": 243249, \"6\": 41668, \"3\": 98518, \"2\": 115705, \"5\": 45376, \"9\": 17308, \"4\": 78449, \"11\": 16359, \"7\": 28435, \"8\": 22146, \"10\": 16908, \"12\": 7191, \"15\": 644, \"13\": 729, \"14\": 650, \"16\": 6, \"17\": 1}", "index_word": "{\"1\": \"NOUN\", \"2\": \"PUNCT\", \"3\": \"VERB\", \"4\": \"ADJ\", \"5\": \"PROPN\", \"6\": \"ADV\", \"7\": \"DET\", \"8\": \"CCONJ\", \"9\": \"PRON\", \"10\": \"NUM\", \"11\": \"ADP\", \"12\": \"AUX\", \"13\": \"SCONJ\", \"14\": \"X\", \"15\": \"INTJ\", \"16\": \"SYM\", \"17\": \"PART\"}", "word_index": "{\"NOUN\": 1, \"PUNCT\": 2, \"VERB\": 3, \"ADJ\": 4, \"PROPN\": 5, \"ADV\": 6, \"DET\": 7, \"CCONJ\": 8, \"PRON\": 9, \"NUM\": 10, \"ADP\": 11, \"AUX\": 12, \"SCONJ\": 13, \"X\": 14, \"INTJ\": 15, \"SYM\": 16, \"PART\": 17}"}}
Binary file not shown.
7 changes: 3 additions & 4 deletions vnlp/part_of_speech_tagger/spu_context_pos.py
@@ -7,7 +7,7 @@
import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from ._spu_context_utils import create_spucontext_pos_model, process_single_word_input

# Resolving parent dependencies
@@ -29,7 +29,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8
@@ -38,8 +38,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
20 changes: 8 additions & 12 deletions vnlp/part_of_speech_tagger/treestack_pos.py
@@ -7,7 +7,7 @@

from ..stemmer_morph_analyzer import StemmerAnalyzer
from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from ._treestack_utils import (create_pos_tagger_model, process_single_word_input)

# Resolving parent dependencies
@@ -29,9 +29,9 @@
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/PoS_TreeStack_eval.weights"
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/TBWTokenized_word_embedding.matrix"

TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.pickle'))
TOKENIZER_MORPH_TAG_LOC_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.pickle')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_POS_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.pickle"
TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.json'))
TOKENIZER_MORPH_TAG_LOC_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.json')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_POS_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.json"

# Data Preprocessing Config
SENTENCE_MAX_LEN = 40
@@ -41,14 +41,10 @@

# Loading Tokenizers
# Have to load tokenizers here because model config depends on them
with open(TOKENIZER_WORD_LOC, 'rb') as handle:
tokenizer_word = pickle.load(handle)

with open(TOKENIZER_MORPH_TAG_LOC_LOC, 'rb') as handle: # This is transferred from StemmerAnalyzer
tokenizer_morph_tag = pickle.load(handle)

with open(TOKENIZER_POS_LABEL_LOC, 'rb') as handle:
tokenizer_pos_label = pickle.load(handle)
tokenizer_word = load_keras_tokenizer(TOKENIZER_WORD_LOC)
# This is transferred from StemmerAnalyzer
tokenizer_morph_tag = load_keras_tokenizer(TOKENIZER_MORPH_TAG_LOC_LOC)
tokenizer_pos_label = load_keras_tokenizer(TOKENIZER_POS_LABEL_LOC)

POS_VOCAB_SIZE = len(tokenizer_pos_label.word_index)
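Since the old `.pickle` resources are deleted in this commit (e.g. `vnlp/resources/TB_word_tokenizer.pickle` below), a sanity check one could have run before the switch, not part of the commit itself, is to confirm the JSON re-serialization preserves the vocabulary:

```python
# Hypothetical pre-migration check: old pickle vs. new JSON of the same tokenizer.
import pickle

from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open("vnlp/resources/TB_word_tokenizer.pickle", "rb") as handle:
    old_tokenizer = pickle.load(handle)

with open("vnlp/resources/TB_word_tokenizer.json", encoding="utf-8") as handle:
    new_tokenizer = tokenizer_from_json(handle.read())

assert old_tokenizer.word_index == new_tokenizer.word_index
assert old_tokenizer.index_word == new_tokenizer.index_word
```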

1 change: 1 addition & 0 deletions vnlp/resources/TB_word_tokenizer.json

Large diffs are not rendered by default.

Binary file removed vnlp/resources/TB_word_tokenizer.pickle
Binary file not shown.