#!/usr/bin/env python
"""
Preprocess a train/test pair of interim JSON data files for VQA.
Questions are tokenized with NLTK, the MCB tokenizer, or a naive regex split.
"""
import sys
import os
import argparse
import json
import re
import pickle
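# Example invocation (a sketch, assuming the interim JSON files produced by the
# earlier extraction step already sit under <dir>/interim/):
#   python extract_annotations_processed.py --dir data/vqa --trainsplit train \
#       --nans 2000 --maxlength 26 --nlp mcb --pad left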
def get_top_answers(examples, nans=3000):
counts = {}
for ex in examples:
ans = ex['answer']
counts[ans] = counts.get(ans, 0) + 1
cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    print('Top answers and their counts:')
print('\n'.join(map(str, cw[:20])))
    # Keep at most 'nans' answers (guards against fewer distinct answers than requested).
    vocab = [w for _, w in cw[:nans]]
    return vocab
def remove_examples(examples, ans_to_aid):
new_examples = []
for i, ex in enumerate(examples):
if ex['answer'] in ans_to_aid:
new_examples.append(ex)
print('Number of examples reduced from %d to %d ' % (len(examples), len(new_examples)))
return new_examples
def tokenize(sentence):
    # Naive tokenizer: split on punctuation/whitespace and drop empty tokens.
    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence)
            if i != '' and i != ' ' and i != '\n']
def tokenize_mcb(s):
    # MCB-style tokenizer: lowercase, strip punctuation, turn '-' and '/' into spaces.
    t_str = s.lower()
    for i in [r'\?', r'\!', r'\'', r'\"', r'\$', r'\:', r'\@', r'\(', r'\)', r'\,', r'\.', r'\;']:
        t_str = re.sub(i, '', t_str)
    for i in [r'\-', r'\/']:
        t_str = re.sub(i, ' ', t_str)
    q_list = list(filter(lambda x: len(x) > 0, t_str.split(' ')))
    return q_list
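# Illustrative example of the MCB tokenizer above (hypothetical input):
#   tokenize_mcb("What's on the man's head?") -> ['whats', 'on', 'the', 'mans', 'head']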
def preprocess_questions(examples, nlp='nltk'):
if nlp == 'nltk':
from nltk.tokenize import word_tokenize
print('Example of generated tokens after preprocessing some questions:')
for i, ex in enumerate(examples):
s = ex['question']
if nlp == 'nltk':
ex['question_words'] = word_tokenize(str(s).lower())
elif nlp == 'mcb':
ex['question_words'] = tokenize_mcb(s)
else:
ex['question_words'] = tokenize(s)
if i < 10:
print(ex['question_words'])
if i % 1000 == 0:
sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(examples), i * 100.0 / len(examples)))
sys.stdout.flush()
return examples
def remove_long_tail_train(examples, minwcount=0):
# Replace words which are in the long tail (counted less than 'minwcount' times) by the UNK token.
# Also create vocab, a list of the final words.
# count up the number of words
counts = {}
for ex in examples:
for w in ex['question_words']:
counts[w] = counts.get(w, 0) + 1
cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
print('Top words and their counts:')
print('\n'.join(map(str, cw[:20])))
total_words = sum(counts.values())
print('Total words:', total_words)
bad_words = [w for w, n in counts.items() if n <= minwcount]
vocab = [w for w, n in counts.items() if n > minwcount]
bad_count = sum(counts[w] for w in bad_words)
print('Number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
print('Number of words in vocab would be %d' % (len(vocab),))
print('Number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count * 100.0 / total_words))
print('Insert the special UNK token')
vocab.append('UNK')
for ex in examples:
words = ex['question_words']
question = [w if counts.get(w, 0) > minwcount else 'UNK' for w in words]
ex['question_words_UNK'] = question
return examples, vocab
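# Illustrative example (hypothetical counts, minwcount=1): a word such as 'zebra'
# that occurs only once in the training questions is replaced by 'UNK' in
# 'question_words_UNK', while more frequent words are kept verbatim.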
def remove_long_tail_test(examples, word_to_wid):
for ex in examples:
ex['question_words_UNK'] = [w if w in word_to_wid else 'UNK' for w in ex['question_words']]
return examples
def encode_question(examples, word_to_wid, maxlength=15, pad='left'):
    # Add 'question_wids' (a fixed-length, zero-padded id sequence) and 'question_length' to each example.
for i, ex in enumerate(examples):
ex['question_length'] = min(maxlength, len(ex['question_words_UNK'])) # record the length of this sequence
ex['question_wids'] = [0] * maxlength
for k, w in enumerate(ex['question_words_UNK']):
if k < maxlength:
                if pad == 'right':
                    ex['question_wids'][k] = word_to_wid[w]
                else:  # pad == 'left'
                    # Use the clipped length so the offset never goes negative for
                    # questions longer than maxlength.
                    new_k = k + maxlength - ex['question_length']
                    ex['question_wids'][new_k] = word_to_wid[w]
ex['seq_length'] = len(ex['question_words_UNK'])
return examples
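# Illustrative sketch of the padding above (hypothetical word ids):
#   with maxlength=5, pad='left' and question_words_UNK=['is', 'it', 'red'],
#   question_wids ends up as [0, 0, wid('is'), wid('it'), wid('red')];
#   with pad='right' it would be [wid('is'), wid('it'), wid('red'), 0, 0].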
def encode_answer(examples, ans_to_aid):
print('Warning: aid of answer not in vocab is 1999')
for i, ex in enumerate(examples):
        ex['answer_aid'] = ans_to_aid.get(ex['answer'], 1999)  # answers outside the top-answer vocab fall back to aid 1999
return examples
def encode_answers_occurence(examples, ans_to_aid):
for i, ex in enumerate(examples):
answers = []
answers_aid = []
answers_count = []
for ans in ex['answers_occurence']:
aid = ans_to_aid.get(ans[0], -1) # -1 means answer not in vocab
if aid != -1:
answers.append(ans[0])
answers_aid.append(aid)
answers_count.append(ans[1])
ex['answers'] = answers
ex['answers_aid'] = answers_aid
ex['answers_count'] = answers_count
return examples
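# Illustrative sketch (assuming each entry of 'answers_occurence' is an [answer, count] pair):
#   [['red', 7], ['maroon', 2]] with 'maroon' outside the top answers yields
#   answers=['red'], answers_aid=[ans_to_aid['red']], answers_count=[7].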
def vqa_processed(params):
#####################################################
## Read input files
#####################################################
path_train = os.path.join(params['dir'], 'interim', params['trainsplit'] + '_questions_annotations.json')
if params['trainsplit'] == 'train':
path_val = os.path.join(params['dir'], 'interim', 'val_questions_annotations.json')
path_test = os.path.join(params['dir'], 'interim', 'test_questions.json')
path_testdev = os.path.join(params['dir'], 'interim', 'testdev_questions.json')
    # Each example is a dict with question, image and answer fields
# /!\ test and test-dev have no answer
trainset = json.load(open(path_train, 'r'))
if params['trainsplit'] == 'train':
valset = json.load(open(path_val, 'r'))
testset = json.load(open(path_test, 'r'))
testdevset = json.load(open(path_testdev, 'r'))
#####################################################
## Preprocess examples (questions and answers)
#####################################################
top_answers = get_top_answers(trainset, params['nans'])
    aid_to_ans = list(top_answers)                           # aid -> answer string
    ans_to_aid = {a: i for i, a in enumerate(top_answers)}   # answer string -> aid
# Remove examples if answer is not in top answers
trainset = remove_examples(trainset, ans_to_aid)
# Add 'question_words' to the initial tuple
trainset = preprocess_questions(trainset, params['nlp'])
if params['trainsplit'] == 'train':
valset = preprocess_questions(valset, params['nlp'])
testset = preprocess_questions(testset, params['nlp'])
testdevset = preprocess_questions(testdevset, params['nlp'])
    # Also build top_words, which includes the special UNK token
trainset, top_words = remove_long_tail_train(trainset, params['minwcount'])
wid_to_word = {i + 1: w for i, w in enumerate(top_words)}
word_to_wid = {w: i + 1 for i, w in enumerate(top_words)}
if params['trainsplit'] == 'train':
valset = remove_long_tail_test(valset, word_to_wid)
testset = remove_long_tail_test(testset, word_to_wid)
testdevset = remove_long_tail_test(testdevset, word_to_wid)
trainset = encode_question(trainset, word_to_wid, params['maxlength'], params['pad'])
if params['trainsplit'] == 'train':
valset = encode_question(valset, word_to_wid, params['maxlength'], params['pad'])
testset = encode_question(testset, word_to_wid, params['maxlength'], params['pad'])
testdevset = encode_question(testdevset, word_to_wid, params['maxlength'], params['pad'])
trainset = encode_answer(trainset, ans_to_aid)
trainset = encode_answers_occurence(trainset, ans_to_aid)
if params['trainsplit'] == 'train':
valset = encode_answer(valset, ans_to_aid)
valset = encode_answers_occurence(valset, ans_to_aid)
#####################################################
## Write output files
#####################################################
# Paths to output files
    # Ex (with --dir data/vqa): data/vqa/processed/nans,2000_maxlength,26_minwcount,0_nlp,mcb_pad,left_trainsplit,train/wid_to_word.pickle
subdirname = 'nans,' + str(params['nans'])
for param in ['maxlength', 'minwcount', 'nlp', 'pad', 'trainsplit']:
subdirname += '_' + param + ',' + str(params[param])
    os.makedirs(os.path.join(params['dir'], 'processed', subdirname), exist_ok=True)
path_wid_to_word = os.path.join(params['dir'], 'processed', subdirname, 'wid_to_word.pickle')
path_word_to_wid = os.path.join(params['dir'], 'processed', subdirname, 'word_to_wid.pickle')
path_aid_to_ans = os.path.join(params['dir'], 'processed', subdirname, 'aid_to_ans.pickle')
path_ans_to_aid = os.path.join(params['dir'], 'processed', subdirname, 'ans_to_aid.pickle')
if params['trainsplit'] == 'train':
path_trainset = os.path.join(params['dir'], 'processed', subdirname, 'trainset.pickle')
path_valset = os.path.join(params['dir'], 'processed', subdirname, 'valset.pickle')
elif params['trainsplit'] == 'trainval':
path_trainset = os.path.join(params['dir'], 'processed', subdirname, 'trainvalset.pickle')
path_testset = os.path.join(params['dir'], 'processed', subdirname, 'testset.pickle')
path_testdevset = os.path.join(params['dir'], 'processed', subdirname, 'testdevset.pickle')
print('Write wid_to_word to', path_wid_to_word)
with open(path_wid_to_word, 'wb') as handle:
pickle.dump(wid_to_word, handle)
print('Write word_to_wid to', path_word_to_wid)
with open(path_word_to_wid, 'wb') as handle:
pickle.dump(word_to_wid, handle)
print('Write aid_to_ans to', path_aid_to_ans)
with open(path_aid_to_ans, 'wb') as handle:
pickle.dump(aid_to_ans, handle)
print('Write ans_to_aid to', path_ans_to_aid)
with open(path_ans_to_aid, 'wb') as handle:
pickle.dump(ans_to_aid, handle)
print('Write trainset to', path_trainset)
with open(path_trainset, 'wb') as handle:
pickle.dump(trainset, handle)
if params['trainsplit'] == 'train':
print('Write valset to', path_valset)
with open(path_valset, 'wb') as handle:
pickle.dump(valset, handle)
print('Write testset to', path_testset)
with open(path_testset, 'wb') as handle:
pickle.dump(testset, handle)
print('Write testdevset to', path_testdevset)
with open(path_testdevset, 'wb') as handle:
pickle.dump(testdevset, handle)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--dir', default='data', type=str,
help='Root directory containing raw, interim and processed directories')
    parser.add_argument('--trainsplit', default='train', type=str, choices=['train', 'trainval'],
                        help='Split used for training; options: train | trainval')
    parser.add_argument('--nans', default=2000, type=int,
                        help='Number of top answers kept for the final classification')
    parser.add_argument('--maxlength', default=26, type=int,
                        help='Max number of words in a question; longer questions get clipped')
    parser.add_argument('--minwcount', default=0, type=int,
                        help='Words occurring this many times or fewer are replaced by the UNK token')
    parser.add_argument('--nlp', default='mcb', type=str, choices=['nltk', 'mcb', 'naive'],
                        help='Tokenization method; options: nltk | mcb | naive')
    parser.add_argument('--pad', default='left', type=str, choices=['right', 'left'],
                        help='Padding side; right (zeros at the end) | left (zeros at the beginning)')
args = parser.parse_args()
opt_vqa = vars(args)
vqa_processed(opt_vqa)