-
Notifications
You must be signed in to change notification settings - Fork 50
/
Copy pathsensegram.py
139 lines (113 loc) · 5.91 KB
/
sensegram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os.path
import codecs
import numpy as np
from gensim.models import word2vec
from collections import defaultdict
from traceback import format_exc
import gensim
# Separator between a word and its sense identifier, e.g. "python#2".
SEP_SENSE = "#"

# Suffix appended to a model path to obtain the path of its sense inventory file.
INVENTORY_EXT = ".inventory.csv"

# Count given to every vocabulary entry created for a sense.
# Should be larger than min_count of vec object, which is 5 by default.
DEFAULT_COUNT = 100
class SenseGram(word2vec.Word2Vec):
    """ A word2vec model whose vocabulary entries are word senses ('word#sense_id').

    In addition to the vectors, the model keeps a sense inventory:
    ``inventory[word][sense_id]`` is the a priori probability of the sense. """

    def __init__(self, *args, **kwargs):
        super(SenseGram, self).__init__(*args, **kwargs)
        # word -> sense_id -> a priori probability of the sense
        self.inventory = defaultdict(lambda: defaultdict(float))

    @staticmethod
    def _split_sense(sense):
        """ Splits 'word#sense_id' at the LAST separator into (word, sense_id).
        Either part is an empty string if it is missing; a string without any
        separator yields an empty word. """

        # some words can contain the separator, hence split at the last one
        word, _, sense_id = sense.rpartition(SEP_SENSE)
        return word, sense_id

    def max_pairwise_sim(self, word_i, word_j, ignore_case=False):
        """ Calculates maximal pairwise similarity between all senses of two words.
        Returns 0.0 if at least one of the words has no senses in the model. """

        senses_word_i = [s for s, _ in self.get_senses(word_i, ignore_case=ignore_case)]
        senses_word_j = [s for s, _ in self.get_senses(word_j, ignore_case=ignore_case)]

        sims_ij = [self.similarity(s_i, s_j)
                   for s_i in senses_word_i
                   for s_j in senses_word_j]

        return max(sims_ij) if sims_ij else 0.0

    def create_zero_vectors(self, senses_num, vector_dim):
        """ Replaces the vector matrix with a float32 zero matrix of the shape
        (senses_num, vector_dim). This is useful if you try to create a model
        from scratch. The vocabulary and the inventory are not modified here. """

        self.wv.syn0 = np.zeros((senses_num, vector_dim), dtype=np.float32)

    def get_senses(self, word, ignore_case=False):
        """ Returns a list of all available senses for a given word.
        example: 'mouse' -> [('mouse#0', 0.33), ('mouse#1', 0.66)]

        If ignore_case is set, the title-cased and the lower-cased variants of
        the word are looked up as well. Senses that are listed in the inventory
        but have no vector in the vocabulary are skipped. """

        # An ordered, de-duplicated list (not a set) keeps the result order stable
        candidates = [word]
        if ignore_case:
            for variant in (word.title(), word.lower()):
                if variant not in candidates:
                    candidates.append(variant)

        senses = []
        for candidate in candidates:
            # explicit membership test: indexing the defaultdict would create an entry
            if candidate not in self.inventory:
                continue
            for sense_id, prob in self.inventory[candidate].items():
                sense = candidate + SEP_SENSE + str(sense_id)
                if sense in self.wv.vocab:
                    senses.append((sense, prob))

        return senses

    def get_most_probable_sense(self, word, ignore_case=True):
        """ Returns the tuple (sense, prob) of the sense with the highest a priori
        probability. Raises IndexError if the word has no senses in the model. """

        senses = self.get_senses(word, ignore_case=ignore_case)
        if not senses:
            # same exception type as before, but with an explicit message
            raise IndexError("No senses found for the word '%s'" % word)

        most_probable_sense, prob = max(senses, key=lambda s: s[1])
        return most_probable_sense, prob

    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """ Saves SenseGram model in the word2vec format. In addition a CSV
        file with word sense inventory is saved containing a priori probabilities.
        The inventory is written to fname + INVENTORY_EXT. """

        # Save the word2vec format model
        self.wv.save_word2vec_format(fname, fvocab=fvocab, binary=binary)

        # Save the extra file 'word#sense_id<TAB>prob-of-the-sense' with sense inventory info
        prob_file = fname + INVENTORY_EXT
        with codecs.open(prob_file, 'w', encoding='utf-8') as out:
            for word, senses in self.inventory.items():
                for sense_id, prob in senses.items():
                    out.write("%s%s%s\t%.6f\n" % (word, SEP_SENSE, sense_id, prob))

    @classmethod
    def load_word2vec_format(cls, model_fpath, fvocab=None, binary=False, norm_only=True,
                             encoding='utf8', unicode_errors='strict'):
        """ Load the model from word2vec format (the vectors) and optionally loads word sense inventory
        from a CSV file located next to the word vectors (model_fpath + INVENTORY_EXT).

        If the inventory file does not exist, the inventory is derived from the
        vocabulary and each sense gets the probability 1.0. The norm_only
        parameter is accepted but not used by this method. """

        # Load word vectors
        wv_obj = gensim.models.KeyedVectors.load_word2vec_format(
            model_fpath, fvocab=fvocab, binary=binary,
            encoding=encoding, unicode_errors=unicode_errors)
        result = cls(size=wv_obj.syn0.shape[1])
        result.wv.syn0 = wv_obj.syn0
        result.wv.vocab = wv_obj.vocab
        result.wv.index2word = wv_obj.index2word

        # Load the inventory
        inventory_fpath = model_fpath + INVENTORY_EXT
        if os.path.isfile(inventory_fpath):
            with codecs.open(inventory_fpath, 'r', encoding='utf-8') as inventory_file:
                for line in inventory_file:
                    try:
                        sense, prob = line.split('\t')
                        word, sense_id = cls._split_sense(sense)
                        if not word or not sense_id:
                            continue
                        result.inventory[word][sense_id] = float(prob)
                    except ValueError:
                        # wrong number of columns or a non-numeric probability:
                        # report the line and keep loading the remaining ones
                        print("Bad line '%s'" % line)
                        print(format_exc())
        else:
            # No inventory file: derive the inventory from the vocabulary
            for sense in result.wv.index2word:
                word, sense_id = cls._split_sense(sense)
                # entries that are not of the form 'word#sense_id' are skipped,
                # the same way as in the inventory file branch
                if not word or not sense_id:
                    continue
                result.inventory[word][sense_id] = 1.0

        return result

    def add_sense(self, word, sense_id, vector, prob):
        """ Add a new sense to the model, where sense is an
        identifier composed of a word and an integer sense id, e.g. 'python#2'.
        The vector is a regular word2vec vector in the form of ndarray.
        The prob is a priori probability of the word sense among all senses of the word,
        e.g. "python#1" is 0.33 and "python#2" is 0.67.

        Whitespaces in the word are replaced with underscores. Raises RuntimeError
        if the vector matrix was not initialized (see create_zero_vectors). """

        if not hasattr(self.wv, 'syn0'):
            raise RuntimeError("Error: you should initialize syn0 matrix before adding words")

        # w2v format accepts no whitespaces; the same normalized word is used for
        # the inventory, otherwise get_senses could never match the two entries
        word = word.replace(" ", "_")
        sense = word + SEP_SENSE + str(sense_id)
        word_id = len(self.wv.vocab)

        # Update the word2vec model: vector and word2vec vocabulary. The vector is
        # written first, so that a failure (e.g. the matrix is full) leaves the
        # vocabulary untouched.
        self.wv.syn0[word_id] = vector
        self.wv.vocab[sense] = word2vec.Vocab(index=word_id, count=DEFAULT_COUNT)
        self.wv.index2word.append(sense)
        assert sense == self.wv.index2word[self.wv.vocab[sense].index]

        # Update the custom word sense inventory
        self.inventory[word][sense_id] = prob