-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_unseeen_dict.py
47 lines (38 loc) · 1.32 KB
/
create_unseeen_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gensim
import global_options
import pandas as pd
import pickle
from collections import OrderedDict, Counter
import itertools
from pprint import pprint
from culture import unseen_dictionary, culture_dictionary
from pathlib import Path
## Build the expanded seed-word dictionary and save it as a CSV file.
# Load the previously trained FastText embedding model from disk.
model_path = "models/w2v/w2v_ar_all"
ft_model = gensim.models.fasttext.FastText.load(model_path)

# The same seed words are passed to every stage of the pipeline below.
seed_words = global_options.SEED_WORDS

# Stage 1: expand the seed words using the embedding model; vocabulary
# restriction and words-per-dimension both come from global_options.
candidate_words = unseen_dictionary.expand_words_dimension_mean(
    word2vec_model=ft_model,
    seed_words=seed_words,
    restrict=global_options.DICT_RESTRICT_VOCAB,
    n=global_options.N_WORDS_DIM,
)

# Stage 2: deduplicate the expanded keywords
# (presumably across dimensions -- confirm in unseen_dictionary).
unique_words = unseen_dictionary.deduplicate_keywords(
    word2vec_model=ft_model,
    expanded_words=candidate_words,
    seed_words=seed_words,
)

# Stage 3: rank the remaining words by similarity, with limit=100.
expanded_words = unseen_dictionary.rank_by_sim(
    unique_words, seed_words, ft_model, limit=100
)
# Debug aid: print(expanded_words["data analytics"])

# Build the destination path once so the write and the log message agree.
filename = "expanded_dict_us_ESGAndFintech.csv"
output_path = str(Path(global_options.OUTPUT_FOLDER) / "dict" / filename)

culture_dictionary.write_dict_to_csv(
    culture_dict=expanded_words,
    file_name=output_path,
)
print("Dictionary saved at {}".format(output_path))
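## Similarity operations (scratch notes only; `model_wrapper` is not defined in this script)
# model_wrapper.similarity("night", "nights")  # similarity between two words
# model_wrapper.most_similar("nights")  # nearest neighbours of a word
## TODO: compute distance from manual words (not implemented here)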