# build_graph.py
import scipy.sparse as sp
from math import log
import numpy as np

def ordered_word_pair(a, b):
    """Return the pair (a, b) in ascending order, for use as a canonical dict key."""
    return (b, a) if a > b else (a, b)

def get_adj(tokenize_sentences, train_size, word_id_map, word_list, args):
    window_size = 20
    total_W = 0  # total number of sliding windows over the training corpus
    word_occurrence = {}  # word id -> number of windows containing the word
    word_pair_occurrence = {}  # (word id, word id) -> number of windows containing both words
    # nodes 0 .. train_size-1 are documents; nodes train_size .. train_size+vocab-1 are words
    node_size = train_size + len(word_list)
    vocab_length = len(word_list)

    def update_word_and_word_pair_occurrence(q):
        # count each word and each word pair at most once per window
        unique_q = list(set(q))
        for i in unique_q:
            word_occurrence[i] = word_occurrence.get(i, 0) + 1
        for i in range(len(unique_q)):
            for j in range(i + 1, len(unique_q)):
                word1, word2 = ordered_word_pair(unique_q[i], unique_q[j])
                word_pair_occurrence[(word1, word2)] = word_pair_occurrence.get((word1, word2), 0) + 1

    if not args.easy_copy:
        print("Calculating PMI")
    for ind in range(train_size):
        words = tokenize_sentences[ind]
        q = []
        # push the first window_size words into a queue
        for i in range(min(window_size, len(words))):
            q.append(word_id_map[words[i]])
        # update the total number of sliding windows
        total_W += 1
        # update the number of sliding windows that contain each word and word pair
        update_word_and_word_pair_occurrence(q)

        # pop the first word out and push the next word in; repeat until the end of the document
        now_next_word_index = window_size
        while now_next_word_index < len(words):
            q.pop(0)
            q.append(word_id_map[words[now_next_word_index]])
            now_next_word_index += 1
            total_W += 1
            update_word_and_word_pair_occurrence(q)

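    # With #W the total window count, #W(i) the number of windows containing word i,
    # and #W(i, j) the number containing both i and j, the pointwise mutual information is
    #
    #   PMI(i, j) = log( p(i, j) / (p(i) * p(j)) )
    #             = log( (#W(i, j) / #W) / ((#W(i) / #W) * (#W(j) / #W)) )
    #             = log( #W(i, j) * #W / (#W(i) * #W(j)) )
    #
    # which is the expression computed below; only positive-PMI pairs become edges.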
    # calculate PMI for word-word edges
    row = []
    col = []
    weight = []
    for word_pair in word_pair_occurrence:
        i, j = word_pair
        count = word_pair_occurrence[word_pair]
        word_freq_i = word_occurrence[i]
        word_freq_j = word_occurrence[j]
        pmi = log((count * total_W) / (word_freq_i * word_freq_j))
        if pmi <= 0:
            continue
        # word nodes are offset by train_size; add the edge in both directions
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)
        row.append(train_size + j)
        col.append(train_size + i)
        weight.append(pmi)
    if not args.easy_copy:
        print("PMI finished.")

    # record which training documents each word appears in
    word_doc_list = {}
    for word in word_list:
        word_doc_list[word] = []
    for i in range(train_size):
        doc_words = tokenize_sentences[i]
        unique_words = set(doc_words)
        for word in unique_words:
            word_doc_list[word].append(i)

    # document frequency: number of training documents containing each word
    word_doc_freq = {}
    for word, doc_list in word_doc_list.items():
        word_doc_freq[word] = len(doc_list)

    # term frequency: count of each word in each document, keyed by "doc_id,word_id"
    doc_word_freq = {}
    for doc_id in range(train_size):
        words = tokenize_sentences[doc_id]
        for word in words:
            word_id = word_id_map[word]
            doc_word_str = str(doc_id) + ',' + str(word_id)
            if doc_word_str in doc_word_freq:
                doc_word_freq[doc_word_str] += 1
            else:
                doc_word_freq[doc_word_str] = 1

    # TF-IDF weights for document-word edges, plus a dense TF-IDF document embedding
    doc_emb = np.zeros((train_size, vocab_length))
    for i in range(train_size):
        words = tokenize_sentences[i]
        doc_word_set = set()
        for word in words:
            if word in doc_word_set:
                continue
            j = word_id_map[word]
            key = str(i) + ',' + str(j)
            freq = doc_word_freq[key]
            row.append(i)
            col.append(train_size + j)
            idf = log(1.0 * train_size / word_doc_freq[word_list[j]])
            w = freq * idf
            weight.append(w)
            doc_word_set.add(word)
            doc_emb[i][j] = w / len(words)

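    # Each document-word edge carries the standard TF-IDF weight
    #
    #   w(d, t) = tf(d, t) * log( train_size / df(t) )
    #
    # where tf(d, t) is the raw count of word t in document d and df(t) is the number
    # of training documents containing t; doc_emb stores the same weight divided by
    # the document length.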
    adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))
    # build a symmetric adjacency matrix: where adj.T > adj this takes adj.T, else adj,
    # i.e. the element-wise maximum of adj and its transpose
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    return adj, doc_emb, word_doc_freq
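
# A minimal usage sketch, not part of the original file. The toy corpus, the
# whitespace tokenisation, and the SimpleNamespace standing in for the project's
# argparse `args` (with its `easy_copy` flag) are all assumptions for illustration.
if __name__ == "__main__":
    from types import SimpleNamespace

    docs = [
        "the cat sat on the mat".split(),
        "the dog chased the cat".split(),
        "dogs and cats are pets".split(),
    ]
    vocab = sorted({w for doc in docs for w in doc})
    word_id_map = {w: i for i, w in enumerate(vocab)}
    args = SimpleNamespace(easy_copy=True)  # hypothetical stand-in for the real CLI args

    adj, doc_emb, word_doc_freq = get_adj(docs, len(docs), word_id_map, vocab, args)
    # adj is (n_docs + n_words) x (n_docs + n_words); doc_emb is n_docs x n_words
    print(adj.shape, doc_emb.shape, len(word_doc_freq))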