keras tokenizer import error caused by the deprecation of keras_preprocessing is resolved
meliksahturker committed Mar 2, 2023
1 parent f4c826d commit a7ad94d
Showing 24 changed files with 55 additions and 61 deletions.
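The fix replaces pickled Keras `Tokenizer` objects (which fail to unpickle once `keras_preprocessing` is no longer importable) with JSON resources loaded through a new `load_keras_tokenizer` helper in `vnlp/utils`. The helper itself is not part of the hunks shown below; a minimal sketch, assuming it simply wraps Keras' `tokenizer_from_json`, could look like this:

```python
# Hypothetical sketch of the load_keras_tokenizer helper referenced throughout
# this commit; the actual implementation in vnlp/utils is not shown here.
from tensorflow.keras.preprocessing.text import tokenizer_from_json


def load_keras_tokenizer(tokenizer_json_path):
    """Load a Keras Tokenizer that was serialized with Tokenizer.to_json()."""
    with open(tokenizer_json_path, "r", encoding="utf-8") as handle:
        return tokenizer_from_json(handle.read())
```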
5 changes: 1 addition & 4 deletions .readthedocs.yaml
@@ -10,10 +10,7 @@ build:
os: ubuntu-22.04
tools:
python: "3.9"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"


# Build documentation in the docs/ directory with Sphinx
sphinx:
Binary file not shown.
8 changes: 4 additions & 4 deletions vnlp/dependency_parser/spu_context_dp.py
@@ -3,11 +3,12 @@
import pickle

import numpy as np
import tensorflow as tf

import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import dp_pos_to_displacy_format, decode_arc_label_vector
from ._spu_context_utils import create_spucontext_dp_model, process_single_word_input

@@ -30,7 +31,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8 # 0.9995 quantile is 8 for 16k_vocab, 7 for 32k_vocab
@@ -39,8 +40,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
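The diff only shows the loader side; the one-off conversion that produced the new `.json` resources from the old `.pickle` files is not part of this commit. Something along these lines would do it, assuming the old pickles are opened in an environment where `keras_preprocessing` still imports (paths are illustrative):

```python
# Hypothetical one-off migration script (not included in this commit):
# re-serialize an old pickled Keras tokenizer as JSON.
import pickle

with open("DP_label_tokenizer.pickle", "rb") as handle:    # old resource, path illustrative
    tokenizer_label = pickle.load(handle)

with open("DP_label_tokenizer.json", "w", encoding="utf-8") as handle:
    handle.write(tokenizer_label.to_json())                # Tokenizer.to_json() ships with Keras
```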
25 changes: 9 additions & 16 deletions vnlp/dependency_parser/treestack_dp.py
@@ -8,7 +8,7 @@
from ..stemmer_morph_analyzer import StemmerAnalyzer
from ..part_of_speech_tagger import PoSTagger
from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import dp_pos_to_displacy_format, decode_arc_label_vector
from ._treestack_utils import (create_dependency_parser_model, process_single_word_input)

@@ -32,10 +32,10 @@
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/DP_TreeStack_eval.weights"
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/TBWTokenized_word_embedding.matrix"

TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.pickle'))
TOKENIZER_POS_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'part_of_speech_tagger/resources/PoS_label_tokenizer.pickle')) # using the tokenizer of part_of_speech_tagger
TOKENIZER_TAG_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.pickle')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.pickle"
TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.json'))
TOKENIZER_POS_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'part_of_speech_tagger/resources/PoS_label_tokenizer.json')) # using the tokenizer of part_of_speech_tagger
TOKENIZER_TAG_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.json')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "DP_label_tokenizer.json"

# Data Preprocessing Config
SENTENCE_MAX_LEN = 40
@@ -45,17 +45,10 @@

# Loading Tokenizers
# Have to load tokenizers here because model config depends on them
with open(TOKENIZER_WORD_LOC, 'rb') as handle:
tokenizer_word = pickle.load(handle)

with open(TOKENIZER_POS_LOC, 'rb') as handle:
tokenizer_pos = pickle.load(handle)

with open(TOKENIZER_TAG_LOC, 'rb') as handle: # This is transferred from StemmerAnalyzer
tokenizer_tag = pickle.load(handle)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_word = load_keras_tokenizer(TOKENIZER_WORD_LOC)
tokenizer_pos = load_keras_tokenizer(TOKENIZER_POS_LOC)
tokenizer_tag = load_keras_tokenizer(TOKENIZER_TAG_LOC)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

LABEL_VOCAB_SIZE = len(tokenizer_label.word_index)
POS_VOCAB_SIZE = len(tokenizer_pos.word_index)
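A Tokenizer reloaded from JSON exposes the same attributes and methods as the pickled one, which is why downstream code such as the vocabulary-size lookups above stays untouched. A small self-contained illustration with toy labels (not the project's real data):

```python
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

# Toy round trip: fit on pre-tokenized label sequences, serialize to JSON, reload.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts([["nsubj", "obj", "root"], ["root", "obj"]])
reloaded = tokenizer_from_json(tokenizer.to_json())

assert reloaded.word_index == tokenizer.word_index
print(len(reloaded.word_index))                 # vocabulary size, as used above
print(reloaded.texts_to_sequences([["root"]]))  # the integer index assigned to "root"
```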
13 changes: 5 additions & 8 deletions vnlp/named_entity_recognizer/charner.py
@@ -6,7 +6,7 @@
import tensorflow as tf

from ..tokenizer import WordPunctTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._charner_utils import create_charner_model

@@ -27,8 +27,8 @@
PROD_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_prod.weights"
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_eval.weights"

TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.pickle"
TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.json"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"

CHAR_VOCAB_SIZE = 150
SEQ_LEN_MAX = 256
@@ -74,11 +74,8 @@ def __init__(self, evaluate):
# Set model weights
self.model.set_weights(model_weights)

with open(TOKENIZER_CHAR_LOC, 'rb') as handle:
tokenizer_char = pickle.load(handle)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_char = load_keras_tokenizer(TOKENIZER_CHAR_LOC)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

self.tokenizer_char = tokenizer_char
self.tokenizer_label = tokenizer_label
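CharNER's actual preprocessing lives in the parts of `charner.py` not shown in this hunk. Purely as a hedged sketch of how a character-level Keras tokenizer is typically combined with the `SEQ_LEN_MAX` constant above (import path, resource path, and padding side are all assumptions):

```python
from tensorflow.keras.preprocessing.sequence import pad_sequences
from vnlp.utils import load_keras_tokenizer  # assumed import path for the new helper

# Assumed resource location; the tokenizer is presumed to be char_level=True.
tokenizer_char = load_keras_tokenizer("resources/CharNER_char_tokenizer.json")

char_ids = tokenizer_char.texts_to_sequences(["Ankara Türkiye'nin başkentidir."])
padded = pad_sequences(char_ids, maxlen=256, padding="post")  # SEQ_LEN_MAX = 256
print(padded.shape)  # (1, 256)
```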

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
{"class_name": "Tokenizer", "config": {"num_words": null, "filters": null, "lower": false, "split": " ", "char_level": false, "oov_token": null, "document_count": 2696802, "word_counts": "{\"O\": 2019178, \"ORG\": 251913, \"LOC\": 180990, \"PER\": 244721}", "word_docs": "{\"O\": 2019178, \"ORG\": 251913, \"LOC\": 180990, \"PER\": 244721}", "index_docs": "{\"1\": 2019178, \"2\": 251913, \"4\": 180990, \"3\": 244721}", "index_word": "{\"1\": \"O\", \"2\": \"ORG\", \"3\": \"PER\", \"4\": \"LOC\"}", "word_index": "{\"O\": 1, \"ORG\": 2, \"PER\": 3, \"LOC\": 4}"}}
Binary file not shown.
7 changes: 3 additions & 4 deletions vnlp/named_entity_recognizer/spu_context_ner.py
@@ -7,7 +7,7 @@
import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._spu_context_utils import create_spucontext_ner_model, process_single_word_input

@@ -30,7 +30,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8
@@ -39,8 +39,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
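The SentencePiece word tokenizer is untouched by this commit; only the Keras label tokenizer next to it changes format. For context, a sketch of how the piece/index maps built above are typically exercised (repo-relative model path taken from `SPU_TOKENIZER_WORD_LOC`):

```python
import sentencepiece as spm

spu_tokenizer_word = spm.SentencePieceProcessor("vnlp/resources/SPU_word_tokenizer_16k.model")

# Sub-word pieces for a single word and their vocabulary indices,
# i.e. the rows of the pretrained word-embedding matrix.
pieces = spu_tokenizer_word.encode_as_pieces("başkentidir")
ids = [spu_tokenizer_word.piece_to_id(piece) for piece in pieces]
print(pieces, ids)
```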
2 changes: 1 addition & 1 deletion vnlp/normalizer/normalizer.py
@@ -1,7 +1,7 @@
from typing import List
from pathlib import Path

from hunspell import Hunspell
from spylls.hunspell import Dictionary

from ._deasciifier import Deasciifier
from ..stemmer_morph_analyzer import StemmerAnalyzer
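The spell-checking backend moves from the C-extension `hunspell` package to the pure-Python `spylls`. The rest of `normalizer.py` is not shown here; as a rough sketch, spylls' `Dictionary` is used along these lines (the dictionary files and their name are placeholders, since VNLP ships its own Turkish resources):

```python
from spylls.hunspell import Dictionary

# Placeholder dictionary base name; point this at real .aff/.dic files.
dictionary = Dictionary.from_files("tr_TR")

print(dictionary.lookup("kitap"))          # True if the word is known/correct
print(list(dictionary.suggest("kitapp")))  # candidate corrections
```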
@@ -0,0 +1 @@
{"class_name": "Tokenizer", "config": {"num_words": null, "filters": null, "lower": false, "split": " ", "char_level": false, "oov_token": null, "document_count": 733342, "word_counts": "{\"NOUN\": 243249, \"ADV\": 41668, \"VERB\": 98518, \"PUNCT\": 115705, \"PROPN\": 45376, \"PRON\": 17308, \"ADJ\": 78449, \"ADP\": 16359, \"DET\": 28435, \"CCONJ\": 22146, \"NUM\": 16908, \"AUX\": 7191, \"INTJ\": 644, \"SCONJ\": 729, \"X\": 650, \"SYM\": 6, \"PART\": 1}", "word_docs": "{\"NOUN\": 243249, \"ADV\": 41668, \"VERB\": 98518, \"PUNCT\": 115705, \"PROPN\": 45376, \"PRON\": 17308, \"ADJ\": 78449, \"ADP\": 16359, \"DET\": 28435, \"CCONJ\": 22146, \"NUM\": 16908, \"AUX\": 7191, \"INTJ\": 644, \"SCONJ\": 729, \"X\": 650, \"SYM\": 6, \"PART\": 1}", "index_docs": "{\"1\": 243249, \"6\": 41668, \"3\": 98518, \"2\": 115705, \"5\": 45376, \"9\": 17308, \"4\": 78449, \"11\": 16359, \"7\": 28435, \"8\": 22146, \"10\": 16908, \"12\": 7191, \"15\": 644, \"13\": 729, \"14\": 650, \"16\": 6, \"17\": 1}", "index_word": "{\"1\": \"NOUN\", \"2\": \"PUNCT\", \"3\": \"VERB\", \"4\": \"ADJ\", \"5\": \"PROPN\", \"6\": \"ADV\", \"7\": \"DET\", \"8\": \"CCONJ\", \"9\": \"PRON\", \"10\": \"NUM\", \"11\": \"ADP\", \"12\": \"AUX\", \"13\": \"SCONJ\", \"14\": \"X\", \"15\": \"INTJ\", \"16\": \"SYM\", \"17\": \"PART\"}", "word_index": "{\"NOUN\": 1, \"PUNCT\": 2, \"VERB\": 3, \"ADJ\": 4, \"PROPN\": 5, \"ADV\": 6, \"DET\": 7, \"CCONJ\": 8, \"PRON\": 9, \"NUM\": 10, \"ADP\": 11, \"AUX\": 12, \"SCONJ\": 13, \"X\": 14, \"INTJ\": 15, \"SYM\": 16, \"PART\": 17}"}}
Binary file not shown.
7 changes: 3 additions & 4 deletions vnlp/part_of_speech_tagger/spu_context_pos.py
@@ -7,7 +7,7 @@
import sentencepiece as spm

from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from ._spu_context_utils import create_spucontext_pos_model, process_single_word_input

# Resolving parent dependencies
@@ -29,7 +29,7 @@
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/SPUTokenized_word_embedding_16k.matrix"

SPU_TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/SPU_word_tokenizer_16k.model'))
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.pickle"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.json"

# Data Preprocessing Config
TOKEN_PIECE_MAX_LEN = 8
@@ -38,8 +38,7 @@
# Loading Tokenizers
spu_tokenizer_word = spm.SentencePieceProcessor(SPU_TOKENIZER_WORD_LOC)

with open(TOKENIZER_LABEL_LOC, 'rb') as handle:
tokenizer_label = pickle.load(handle)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

sp_key_to_index = {spu_tokenizer_word.id_to_piece(id): id for id in range(spu_tokenizer_word.get_piece_size())}
sp_index_to_key = {id: spu_tokenizer_word.id_to_piece(id) for id in range(spu_tokenizer_word.get_piece_size())}
20 changes: 8 additions & 12 deletions vnlp/part_of_speech_tagger/treestack_pos.py
@@ -7,7 +7,7 @@

from ..stemmer_morph_analyzer import StemmerAnalyzer
from ..tokenizer import TreebankWordTokenize
from ..utils import check_and_download
from ..utils import check_and_download, load_keras_tokenizer
from ._treestack_utils import (create_pos_tagger_model, process_single_word_input)

# Resolving parent dependencies
@@ -29,9 +29,9 @@
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/PoS_TreeStack_eval.weights"
WORD_EMBEDDING_MATRIX_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/TBWTokenized_word_embedding.matrix"

TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.pickle'))
TOKENIZER_MORPH_TAG_LOC_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.pickle')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_POS_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.pickle"
TOKENIZER_WORD_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'resources/TB_word_tokenizer.json'))
TOKENIZER_MORPH_TAG_LOC_LOC = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'stemmer_morph_analyzer/resources/Stemmer_morph_tag_tokenizer.json')) # using the tokenizer of stemmer_morph_analyzer
TOKENIZER_POS_LABEL_LOC = RESOURCES_PATH + "PoS_label_tokenizer.json"

# Data Preprocessing Config
SENTENCE_MAX_LEN = 40
@@ -41,14 +41,10 @@

# Loading Tokenizers
# Have to load tokenizers here because model config depends on them
with open(TOKENIZER_WORD_LOC, 'rb') as handle:
tokenizer_word = pickle.load(handle)

with open(TOKENIZER_MORPH_TAG_LOC_LOC, 'rb') as handle: # This is transferred from StemmerAnalyzer
tokenizer_morph_tag = pickle.load(handle)

with open(TOKENIZER_POS_LABEL_LOC, 'rb') as handle:
tokenizer_pos_label = pickle.load(handle)
tokenizer_word = load_keras_tokenizer(TOKENIZER_WORD_LOC)
# This is transferred from StemmerAnalyzer
tokenizer_morph_tag = load_keras_tokenizer(TOKENIZER_MORPH_TAG_LOC_LOC)
tokenizer_pos_label = load_keras_tokenizer(TOKENIZER_POS_LABEL_LOC)

POS_VOCAB_SIZE = len(tokenizer_pos_label.word_index)
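Since the old `.pickle` resources are deleted in this commit (e.g. `vnlp/resources/TB_word_tokenizer.pickle` below), a sanity check one could have run before the switch, not part of the commit itself, is to confirm the JSON re-serialization preserves the vocabulary:

```python
# Hypothetical pre-migration check: old pickle vs. new JSON of the same tokenizer.
import pickle

from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open("vnlp/resources/TB_word_tokenizer.pickle", "rb") as handle:
    old_tokenizer = pickle.load(handle)

with open("vnlp/resources/TB_word_tokenizer.json", encoding="utf-8") as handle:
    new_tokenizer = tokenizer_from_json(handle.read())

assert old_tokenizer.word_index == new_tokenizer.word_index
assert old_tokenizer.index_word == new_tokenizer.index_word
```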

1 change: 1 addition & 0 deletions vnlp/resources/TB_word_tokenizer.json

Large diffs are not rendered by default.

Binary file removed vnlp/resources/TB_word_tokenizer.pickle
Binary file not shown.