# local_change.py
# Script to compute the local semantic change of words.
# The main function is local_measure(word, knn, bins).
# INPUT
#   word  (str)  the word to analyze
#   knn   (int)  the number of nearest neighbours to consider
#   bins  (list) the sequence of years whose consecutive pairs define the time periods
# OUTPUT (written as TSV files)
#   new_words_*         neighbours present at t+1 but not at t
#   lost_words_*        neighbours present at t but not at t+1
#   changes_*           neighbours present at both t and t+1, with their change in similarity
#   Local_Similarity_*  matrix storing the overall local distance for each pair of periods
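# For example, with the settings below, local_measure('juif', 100, bins) writes files such as
#   local/new_words_1789-1830_1830-1841_juif_100.tsv
#   local/lost_words_1789-1830_1830-1841_juif_100.tsv
#   local/changes_1789-1830_1830-1841_juif_100.tsv
# (one set for each pair of compared periods), plus the final distance matrix
#   local/Local_Similarity_WV_juif_100.tsv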
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_distances
import csv
#### ---- FILE PATHS ---- ####
### you will probably need to change these
# where the embeddings are located
base_all = 'emb/'
# where the results are saved
base = 'local/'
#### ---- EMBEDDING PARAMETERS ---- ####
### you will probably need to change these
### used to construct the embedding file names and to load them
win = 5 # window size
size = 300 # vector dimensions
min_count = 25 # minimal number of occurrences
name = 'WV' # word2vec skip-gram
# ----------------
# time bins: consecutive entries define the time periods (1789-1830, 1830-1841, ..., 1913-1915);
# they are used to build the file names of the corresponding embeddings and to load them
bins = [1789, 1830, 1841, 1848, 1855, 1861, 1866, 1870, 1874, 1877, 1880, 1883, 1886, 1889, 1891, 1893, 1895, 1897, 1899, 1901, 1903, 1905, 1907, 1909, 1911, 1913, 1915]

def writeMat(base, name, Mat):
    # write a matrix (list of rows) to base/name.tsv
    # Mat = Mat.todense()
    with open(base + name + '.tsv', 'w') as f:
        w = csv.writer(f, delimiter='\t')
        for row in Mat:
            w.writerow(row)

def getSim(embed, word, words):
    # second-order similarity vector: cosine similarity of `word` to each word in `words`
    # (0 for words that are not in the embedding's vocabulary)
    sim = []
    for w in words:
        if w in embed.vocab:
            s = embed.similarity(word, w)
        else:
            s = 0
        sim.append(s)
    return sim
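
# For example (hypothetical similarity values):
#   getSim(embed, 'juif', ['roi', 'peuple']) -> [0.61, 0.02]
# where 0 would be returned for any neighbour missing from embed's vocabulary.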

def comp_changes(knn_t, knn_t1):
    # compare two (word, similarity) neighbour lists and return:
    #   new_words  - neighbours present at t+1 but not at t, with their similarity at t+1
    #   lost_words - neighbours present at t but not at t+1, with their similarity at t
    #   diffs      - neighbours present at both times, with the absolute change in similarity
    t_words = [w[0] for w in knn_t]
    t1_words = [w[0] for w in knn_t1]
    new_words = [(w, [w1[1] for w1 in knn_t1 if w1[0] == w][0]) for w in t1_words if w not in t_words]
    lost_words = [(w, [w1[1] for w1 in knn_t if w1[0] == w][0]) for w in t_words if w not in t1_words]
    same_words = [w for w in t_words if w in t1_words]
    diffs = [(w, abs([w1[1] for w1 in knn_t if w1[0] == w][0] - [w1[1] for w1 in knn_t1 if w1[0] == w][0])) for w in same_words]
    # sort each list by decreasing similarity / change
    new_words.sort(key=lambda tup: tup[1], reverse=True)
    lost_words.sort(key=lambda tup: tup[1], reverse=True)
    diffs.sort(key=lambda tup: tup[1], reverse=True)
    return new_words, lost_words, diffs
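
# For example (hypothetical neighbour lists):
#   comp_changes([('roi', 0.6), ('loi', 0.5)], [('roi', 0.7), ('peuple', 0.4)])
#   -> new_words  = [('peuple', 0.4)]
#      lost_words = [('loi', 0.5)]
#      diffs      = [('roi', 0.1)]   # abs(0.6 - 0.7), up to floating-point rounding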

def local_measure(word, knn, bins):
    # S is the (upper-triangular) matrix in which the cosine distance
    # between each pair of time periods is stored
    S = []
    print(word, knn)
    for xx in range(len(bins) - 2):
        bin1 = bins[xx]
        bin2 = bins[xx+1]
        time1 = str(bin1)
        time2 = str(bin2)
        # path to the embeddings of the period bin1-bin2
        path = base_all + '3EMB-' + name + '-' + 'win_' + str(win) + '-size_' + str(size) + '-min_count_' + str(min_count) + '-iter_' + str(time1) + '-' + str(time2)
        # load the embeddings at time t using gensim KeyedVectors
        embed = KeyedVectors.load(path)
        # check whether the word is in the embedding vocabulary (gensim 3.x API, as in getSim)
        if word not in embed.vocab:
            print(word + ' not in base_embed\'s vocabulary')
            continue
        else:
            knn_t = embed.most_similar(word, topn=knn)
            knn_t_words = [k[0] for k in knn_t]
            knn_t_sims = [k[1] for k in knn_t]
        if xx > 0:
            S.append([0]*xx)
        else:
            S.append([])
            # keep the neighbours of the first period, to compare it with every later period
            knn_t0 = knn_t
            knn_t0_words = knn_t_words
            time0 = str(bin1) + '_' + str(bin2)
        # only the values of S above the main diagonal are computed,
        # because cosine distance is symmetric and the diagonal values are 0
        for x in range(xx+1, len(bins)-1):
            time11 = str(bins[x])
            time22 = str(bins[x+1])
            time = time1 + '-' + time2 + '_' + time11 + '-' + time22
            time00 = time0 + '_' + time11 + '-' + time22
            print(time)
            path_t1 = base_all + '3EMB-' + name + '-' + 'win_' + str(win) + '-size_' + str(size) + '-min_count_' + str(min_count) + '-iter_' + time11 + '-' + time22
            # load the embeddings at time t+1 using gensim KeyedVectors
            embed_t1 = KeyedVectors.load(path_t1)
            if word not in embed_t1.vocab:
                print(word + ' not in embed\'s vocabulary')
                continue
            else:
                knn_t1 = embed_t1.most_similar(word, topn=knn)
                knn_t1_words = [k[0] for k in knn_t1]
                knn_t1_sims = [k[1] for k in knn_t1]
            # create the second-order vectors as in Equation 2 of:
            # Hamilton, William L., Jure Leskovec, and Dan Jurafsky. "Cultural shift or
            # linguistic drift? Comparing two computational measures of semantic change."
            # Proceedings of EMNLP 2016.
            # each vector holds the similarity of `word` to the union of its neighbours
            # at t and t+1, computed in the respective embedding space
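            # For instance (hypothetical values), if the union of neighbours were
            # ['roi', 'peuple', 'loi'], one might get
            #   s_t  = [0.61, 0.02, 0.55]   (similarities in the t embedding space)
            #   s_t1 = [0.58, 0.43, 0.07]   (similarities in the t+1 embedding space)
            # and the local change score is the cosine distance between s_t and s_t1.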
            s_t = getSim(embed, word, knn_t_words+knn_t1_words)
            s_t1 = getSim(embed_t1, word, knn_t_words+knn_t1_words)
            dist = cosine_distances([s_t, s_t1]).tolist()[0][1]
            new_words, lost_words, diffs = comp_changes(knn_t, knn_t1)
            new_words0, lost_words0, diffs0 = comp_changes(knn_t0, knn_t1)
            writeMat(base, 'new_words_' + time + '_' + word + '_' + str(knn), new_words)
            writeMat(base, 'lost_words_' + time + '_' + word + '_' + str(knn), lost_words)
            writeMat(base, 'changes_' + time + '_' + word + '_' + str(knn), diffs)
            writeMat(base, 'new_words_' + time00 + '_' + word + '_' + str(knn), new_words0)
            writeMat(base, 'lost_words_' + time00 + '_' + word + '_' + str(knn), lost_words0)
            writeMat(base, 'changes_' + time00 + '_' + word + '_' + str(knn), diffs0)
            S[xx].append(dist)
    writeMat(base, 'Local_Similarity_' + name + '_' + word + '_' + str(knn), S)
# EXAMPLE
local_measure('juif',100,bins)
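
# Optional: a minimal sketch for reading the resulting distance matrix back in
# (assumes the call above completed and the embedding files were found, so that
# 'local/Local_Similarity_WV_juif_100.tsv' exists).
with open(base + 'Local_Similarity_' + name + '_juif_100.tsv') as f:
    S_read = [[float(v) for v in row if v != ''] for row in csv.reader(f, delimiter='\t')]
print(S_read)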