# dmn_plus.py
# Forked from barronalex/Dynamic-Memory-Networks-in-TensorFlow
from __future__ import print_function
from __future__ import division
import sys
import time
import numpy as np
from copy import deepcopy
import tensorflow as tf
from attention_gru_cell import AttentionGRUCell
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
import babi_input


class Config(object):
    """Holds model hyperparameters and data information."""
    batch_size = 100
    embed_size = 80
    hidden_size = 80

    max_epochs = 256
    early_stopping = 20

    dropout = 0.9
    lr = 0.001
    l2 = 0.001

    cap_grads = False
    max_grad_val = 10
    noisy_grads = False

    word2vec_init = False
    embedding_init = np.sqrt(3)

    # NOTE: annealing is not currently used, hence the nonsensical anneal_threshold value
    anneal_threshold = 1000
    anneal_by = 1.5

    num_hops = 3
    num_attention_features = 4

    max_allowed_inputs = 130
    num_train = 9000

    floatX = np.float32

    babi_id = "1"
    babi_test_id = ""

    train_mode = True
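

# Illustrative example: the Config fields are plain class attributes, so a run can
# override them on an instance before building the model. The task id and batch size
# below are arbitrary, illustrative values:
#
#     config = Config()
#     config.babi_id = "5"
#     config.batch_size = 50
#     model = DMN_PLUS(config)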


def _add_gradient_noise(t, stddev=1e-3, name=None):
    """Adds gradient noise as described in http://arxiv.org/abs/1511.06807
    The input Tensor `t` should be a gradient.
    The output will be `t` + gaussian noise.
    0.001 was said to be a good fixed value for memory networks."""
    with tf.variable_scope('gradient_noise'):
        gn = tf.random_normal(tf.shape(t), stddev=stddev)
        return tf.add(t, gn)
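

# The paper above also describes annealing the noise variance over training steps,
# roughly stddev_t = sqrt(eta / (1 + t) ** 0.55), rather than the fixed stddev used here.
# A minimal sketch of that variant, assuming a scalar global step `t` is available
# (eta and gamma below are illustrative values, not taken from this repository):
#
#     def _add_annealed_gradient_noise(grad, t, eta=0.01, gamma=0.55):
#         stddev = tf.sqrt(eta / tf.pow(1.0 + tf.cast(t, tf.float32), gamma))
#         return grad + tf.random_normal(tf.shape(grad), stddev=stddev)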


# from https://github.com/domluna/memn2n
def _position_encoding(sentence_size, embedding_size):
    """Position encoding described in section 4.1 of "End-To-End Memory Networks"
    (http://arxiv.org/pdf/1503.08895v5.pdf).

    We could have used an RNN to encode each sentence, but that tends to overfit.
    Simply summing the word embeddings would lose positional information, so each
    word embedding is weighted by its position in the sentence before summing."""
    encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
    ls = sentence_size + 1
    le = embedding_size + 1
    for i in range(1, le):
        for j in range(1, ls):
            encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
    encoding = 1 + 4 * encoding / embedding_size / sentence_size
    return np.transpose(encoding)
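

# Shape note: the returned array has shape (sentence_size, embedding_size), so it
# broadcasts against word embeddings of shape (batch, max_sentences, max_sen_len,
# embed_size) in get_input_representation, where the weighted embeddings are summed
# over the word axis to give one vector per sentence. A quick standalone check with
# small illustrative sizes:
#
#     enc = _position_encoding(sentence_size=6, embedding_size=4)
#     assert enc.shape == (6, 4)
#     words = np.random.rand(2, 3, 6, 4).astype(np.float32)   # (batch, sents, words, embed)
#     sents = np.sum(words * enc, axis=2)                     # (batch, sents, embed)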


class DMN_PLUS(object):

    def load_data(self, debug=False):
        """Loads train/valid/test data and sentence encoding"""
        if self.config.train_mode:
            self.train, self.valid, self.word_embedding, self.max_q_len, self.max_sentences, self.max_sen_len, self.vocab_size = babi_input.load_babi(self.config, split_sentences=True)
        else:
            self.test, self.word_embedding, self.max_q_len, self.max_sentences, self.max_sen_len, self.vocab_size = babi_input.load_babi(self.config, split_sentences=True)
        self.encoding = _position_encoding(self.max_sen_len, self.config.embed_size)

    def add_placeholders(self):
        """Adds data placeholders to the graph"""
        self.question_placeholder = tf.placeholder(tf.int32, shape=(self.config.batch_size, self.max_q_len))
        self.input_placeholder = tf.placeholder(tf.int32, shape=(self.config.batch_size, self.max_sentences, self.max_sen_len))

        self.question_len_placeholder = tf.placeholder(tf.int32, shape=(self.config.batch_size,))
        self.input_len_placeholder = tf.placeholder(tf.int32, shape=(self.config.batch_size,))

        self.answer_placeholder = tf.placeholder(tf.int64, shape=(self.config.batch_size,))

        self.dropout_placeholder = tf.placeholder(tf.float32)

    def get_predictions(self, output):
        preds = tf.nn.softmax(output)
        pred = tf.argmax(preds, 1)
        return pred

    def add_loss_op(self, output):
        """Calculate loss"""
        loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, labels=self.answer_placeholder))

        # add l2 regularization for all variables except biases
        for v in tf.trainable_variables():
            if 'bias' not in v.name.lower():
                loss += self.config.l2 * tf.nn.l2_loss(v)

        tf.summary.scalar('loss', loss)

        return loss

    def add_training_op(self, loss):
        """Calculate and apply gradients"""
        opt = tf.train.AdamOptimizer(learning_rate=self.config.lr)
        gvs = opt.compute_gradients(loss)

        # optionally cap and noise gradients to regularize
        if self.config.cap_grads:
            gvs = [(tf.clip_by_norm(grad, self.config.max_grad_val), var) for grad, var in gvs]
        if self.config.noisy_grads:
            gvs = [(_add_gradient_noise(grad), var) for grad, var in gvs]

        train_op = opt.apply_gradients(gvs)
        return train_op

    def get_question_representation(self):
        """Get question vectors via embedding and GRU"""
        questions = tf.nn.embedding_lookup(self.embeddings, self.question_placeholder)

        gru_cell = tf.contrib.rnn.GRUCell(self.config.hidden_size)
        _, q_vec = tf.nn.dynamic_rnn(gru_cell,
                                     questions,
                                     dtype=np.float32,
                                     sequence_length=self.question_len_placeholder
                                     )

        return q_vec
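
    # Shape note: in get_question_representation, `questions` is
    # (batch_size, max_q_len, embed_size) after the embedding lookup, and `q_vec`
    # is the final GRU state with shape (batch_size, hidden_size).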

    def get_input_representation(self):
        """Get fact (sentence) vectors via embedding, positional encoding and bi-directional GRU"""
        # get word vectors from embedding
        inputs = tf.nn.embedding_lookup(self.embeddings, self.input_placeholder)

        # use encoding to get sentence representation
        inputs = tf.reduce_sum(inputs * self.encoding, 2)

        forward_gru_cell = tf.contrib.rnn.GRUCell(self.config.hidden_size)
        backward_gru_cell = tf.contrib.rnn.GRUCell(self.config.hidden_size)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_gru_cell,
            backward_gru_cell,
            inputs,
            dtype=np.float32,
            sequence_length=self.input_len_placeholder
        )

        # sum forward and backward output vectors
        fact_vecs = tf.reduce_sum(tf.stack(outputs), axis=0)

        # apply dropout
        fact_vecs = tf.nn.dropout(fact_vecs, self.dropout_placeholder)

        return fact_vecs
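
    # Shape note: `fact_vecs` has shape (batch_size, max_sentences, hidden_size).
    # Summing the forward and backward GRU outputs is the "input fusion layer" of the
    # DMN+ paper, letting each fact vector carry context from neighbouring sentences.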

    def get_attention(self, q_vec, prev_memory, fact_vec, reuse):
        """Use question vector and previous memory to create scalar attention for current fact"""
        with tf.variable_scope("attention", reuse=reuse):

            features = [fact_vec * q_vec,
                        fact_vec * prev_memory,
                        tf.abs(fact_vec - q_vec),
                        tf.abs(fact_vec - prev_memory)]

            feature_vec = tf.concat(features, 1)

            attention = tf.contrib.layers.fully_connected(feature_vec,
                                                          self.config.embed_size,
                                                          activation_fn=tf.nn.tanh,
                                                          reuse=reuse, scope="fc1")

            attention = tf.contrib.layers.fully_connected(attention,
                                                          1,
                                                          activation_fn=None,
                                                          reuse=reuse, scope="fc2")

        return attention
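
    # Note: the feature list above is the interaction vector of the DMN+ paper,
    # z = [f * q ; f * m ; |f - q| ; |f - m|], fed through a two-layer network to give
    # one unnormalised attention score per fact; the softmax over facts is applied
    # later in generate_episode.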

    def generate_episode(self, memory, q_vec, fact_vecs, hop_index):
        """Generate episode by applying attention to current fact vectors through a modified GRU"""

        attentions = [tf.squeeze(
            self.get_attention(q_vec, memory, fv, bool(hop_index) or bool(i)), axis=1)
            for i, fv in enumerate(tf.unstack(fact_vecs, axis=1))]

        attentions = tf.transpose(tf.stack(attentions))
        self.attentions.append(attentions)
        attentions = tf.nn.softmax(attentions)
        attentions = tf.expand_dims(attentions, axis=-1)

        reuse = hop_index > 0

        # concatenate fact vectors and attentions for input into attGRU
        gru_inputs = tf.concat([fact_vecs, attentions], 2)

        with tf.variable_scope('attention_gru', reuse=reuse):
            _, episode = tf.nn.dynamic_rnn(AttentionGRUCell(self.config.hidden_size),
                                           gru_inputs,
                                           dtype=np.float32,
                                           sequence_length=self.input_len_placeholder
                                           )

        return episode
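
    # Note: the scalar attention appended to each GRU input is used by AttentionGRUCell
    # in place of the usual update gate, i.e. h_i = g_i * h~_i + (1 - g_i) * h_{i-1},
    # so the final state `episode` summarises the facts this hop attends to.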

    def add_answer_module(self, rnn_output, q_vec):
        """Linear softmax answer module"""

        rnn_output = tf.nn.dropout(rnn_output, self.dropout_placeholder)

        output = tf.layers.dense(tf.concat([rnn_output, q_vec], 1),
                                 self.vocab_size,
                                 activation=None)

        return output
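
    # Note: despite the docstring, no softmax is applied here; the method returns
    # vocabulary-sized logits, which add_loss_op feeds to
    # sparse_softmax_cross_entropy_with_logits and get_predictions normalises separately.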

    def inference(self):
        """Performs inference on the DMN model"""

        # input fusion module
        with tf.variable_scope("question", initializer=tf.contrib.layers.xavier_initializer()):
            print('==> get question representation')
            q_vec = self.get_question_representation()

        with tf.variable_scope("input", initializer=tf.contrib.layers.xavier_initializer()):
            print('==> get input representation')
            fact_vecs = self.get_input_representation()

        # keep track of attentions for possible strong supervision
        self.attentions = []

        # memory module
        with tf.variable_scope("memory", initializer=tf.contrib.layers.xavier_initializer()):
            print('==> build episodic memory')

            # generate n_hops episodes
            prev_memory = q_vec

            for i in range(self.config.num_hops):
                # get a new episode
                print('==> generating episode', i)
                episode = self.generate_episode(prev_memory, q_vec, fact_vecs, i)

                # untied weights for memory update
                with tf.variable_scope("hop_%d" % i):
                    prev_memory = tf.layers.dense(tf.concat([prev_memory, episode, q_vec], 1),
                                                  self.config.hidden_size,
                                                  activation=tf.nn.relu)

            output = prev_memory

        # pass memory module output through linear answer module
        with tf.variable_scope("answer", initializer=tf.contrib.layers.xavier_initializer()):
            output = self.add_answer_module(output, q_vec)

        return output
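
    # Note: the per-hop dense layer implements the untied memory update of the DMN+
    # paper, m^t = ReLU(W^t [m^(t-1) ; e^t ; q] + b^t), with a separate weight matrix
    # for each of the num_hops passes.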

    def run_epoch(self, session, data, num_epoch=0, train_writer=None, train_op=None, verbose=2, train=False):
        config = self.config
        dp = config.dropout
        if train_op is None:
            train_op = tf.no_op()
            dp = 1
        total_steps = len(data[0]) // config.batch_size
        total_loss = []
        accuracy = 0

        # shuffle data
        p = np.random.permutation(len(data[0]))
        qp, ip, ql, il, im, a = data
        qp, ip, ql, il, im, a = qp[p], ip[p], ql[p], il[p], im[p], a[p]

        for step in range(total_steps):
            index = range(step * config.batch_size, (step + 1) * config.batch_size)
            feed = {self.question_placeholder: qp[index],
                    self.input_placeholder: ip[index],
                    self.question_len_placeholder: ql[index],
                    self.input_len_placeholder: il[index],
                    self.answer_placeholder: a[index],
                    self.dropout_placeholder: dp}
            loss, pred, summary, _ = session.run(
                [self.calculate_loss, self.pred, self.merged, train_op], feed_dict=feed)

            if train_writer is not None:
                train_writer.add_summary(summary, num_epoch * total_steps + step)

            answers = a[step * config.batch_size:(step + 1) * config.batch_size]
            accuracy += np.sum(pred == answers) / float(len(answers))

            total_loss.append(loss)
            if verbose and step % verbose == 0:
                sys.stdout.write('\r{} / {} : loss = {}'.format(
                    step, total_steps, np.mean(total_loss)))
                sys.stdout.flush()

        if verbose:
            sys.stdout.write('\r')

        return np.mean(total_loss), accuracy / float(total_steps)

    def __init__(self, config):
        self.config = config
        self.variables_to_save = {}
        self.load_data(debug=False)
        self.add_placeholders()

        # set up embedding
        self.embeddings = tf.Variable(self.word_embedding.astype(np.float32), name="Embedding")

        self.output = self.inference()
        self.pred = self.get_predictions(self.output)
        self.calculate_loss = self.add_loss_op(self.output)
        self.train_step = self.add_training_op(self.calculate_loss)
        self.merged = tf.summary.merge_all()
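

# Usage sketch (illustrative; the model is normally driven by a separate training
# script in the repository, whose exact name and flags are not shown here):
#
#     config = Config()
#     model = DMN_PLUS(config)
#     with tf.Session() as session:
#         session.run(tf.global_variables_initializer())
#         for epoch in range(config.max_epochs):
#             train_loss, train_acc = model.run_epoch(
#                 session, model.train, num_epoch=epoch,
#                 train_op=model.train_step, train=True)
#             valid_loss, valid_acc = model.run_epoch(session, model.valid)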