-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrunWorstCaseICSMTests.py
305 lines (246 loc) · 11.6 KB
/
runWorstCaseICSMTests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import os
from unicodeMagic import UnicodeReader, UnicodeWriter
from lsaAlgorithm import LSAAlgo
from dictUtils import MyDict
from filters import *
import os
import csv
class ICSMTests():
    """
    Training/testing harness around the LSA algorithm for the "worse" ICSM data sets.

    runTraining() runs the algorithm for every parameter combination,
    runTesting() re-runs it with the best combination found during training,
    and computeResults() scores one result file against the oracle.

    NOTE: this module targets Python 2 (csv files are opened in 'rb'/'wb' mode).
    """

    # Header row shared by the per-iteration training and testing summary files.
    _SUMMARY_HEADER = ['levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f']

    @staticmethod
    def _ensureDirectory(path):
        """Create the directory `path` (including parents) when it does not exist yet."""
        if not os.path.exists(path):
            os.makedirs(path)

    @staticmethod
    def _safeRatio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        try:
            return numerator / denominator
        except ZeroDivisionError:
            return 0.0

    def _loadNameEmailData(self, csvPath):
        """
        Read `csvPath` into a MyDict mapping the row number to (alias, unspammed email).

        Rows that cannot be processed are printed and skipped (best effort).
        """
        nameEmailData = MyDict()
        with open(csvPath, 'rb') as f:
            # NOTE(review): the row number advances even for skipped rows, so a bad row
            # leaves a gap in the keys while computeResults() assumes the contiguous
            # indices 0..len-1 -- confirm that the input never contains bad rows.
            for idx, row in enumerate(UnicodeReader(f)):
                try:
                    nameEmailData[idx] = (row[0], unspam(row[1]))
                except Exception:
                    print(row)
        return nameEmailData

    def _runLsa(self, nameEmailData, parameters, resultsPath, saveDir, resultsFileName, nrProcesses):
        """Run the LSA algorithm once with the fixed (non-verbose, non-profiled) settings."""
        lsaAlgo = LSAAlgo(nameEmailData, parameters, dataDir=resultsPath, dataSaveDir=saveDir, resultsFileName=resultsFileName, resultsHumanReadable=False, numberOfProcesses=nrProcesses, runProfiler=False, profilerOutputDir='profilerOutput', gensimLogging=False, progressLogging=False)
        lsaAlgo.run()

    def computeResults(self, dataPath, filename, outfilename, oracle, nameEmailData):
        """
        Grabs the output file of the LSA algorithm and compares it to the oracle.

        Every row of dataPath/filename is copied to dataPath/outfilename (';' delimited)
        with its kind ('tp' or 'fp') appended; the header row gets a 'kind' column.
        Pairs the oracle marks as matching but that are absent from the input are
        appended as 'fn' rows. The input file is removed afterwards.

        dataPath      -- directory holding the input file, 'documents.dict',
                         'directLookup.dict' and the gensim similarity 'index'
        filename      -- result file written by the LSA algorithm
        outfilename   -- name of the augmented file to write
        oracle        -- indexable by (idx1, idx2) with idx1 <= idx2; the value 1 means
                         that the two entries should be matched
        nameEmailData -- mapping from input index to the input entry

        Returns [tp, fp, fn, tn, precision, recall, fmeasure]; tp also counts one
        reflexive match per input entry. Returns seven zeros when the input file
        does not exist.
        """
        # The file never imports itertools at module level, so import it here.
        import itertools

        inPath = os.path.join(dataPath, filename)
        if not os.path.exists(inPath):
            return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

        # Imported only once there is something to score; the import is not needed
        # for the early return above.
        from gensim import similarities

        indices = range(len(nameEmailData.keys()))
        documents = MyDict(os.path.join(dataPath, 'documents.dict'))
        directLookup = MyDict(os.path.join(dataPath, 'directLookup.dict')) #input data index -> document index
        index = similarities.docsim.Similarity.load(os.path.join(dataPath, 'index'))

        # Initial values (floats, so the ratios below use true division)
        tp = fp = fn = tn = 0.0
        matchedTuples = set()

        # NOTE(review): cells coming from UnicodeReader are presumably unicode objects;
        # a plain csv.writer may fail on non-ASCII text under Python 2 -- confirm
        # whether UnicodeWriter should be used here.
        with open(os.path.join(dataPath, outfilename), 'wb') as g:
            writer = csv.writer(g, delimiter=';')

            with open(inPath, 'rb') as f:
                rows = iter(UnicodeReader(f))
                header = next(rows, None)
                if header is not None:
                    header.append('kind')
                    writer.writerow(header)
                for row in rows:
                    # Put the smallest value first.
                    first, second = int(row[0]), int(row[1])
                    pair = (first, second) if first <= second else (second, first)
                    matchedTuples.add(pair)
                    row.append('tp' if oracle[pair] == 1 else 'fp')
                    writer.writerow(row)

            #Iterate all combinations to compute all tp, fp, fn, tn
            for pair in itertools.combinations(indices, 2):
                shouldMatch = oracle[pair] == 1
                if pair in matchedTuples: #The tuple was matched by the algorithm
                    if shouldMatch: #Correctly matched
                        tp += 1
                    else: #Incorrectly matched
                        fp += 1
                elif shouldMatch: #It should have been matched
                    fn += 1
                    # Add the fn to the results file
                    idx1, idx2 = pair
                    docId1 = directLookup[idx1]
                    docId2 = directLookup[idx2]
                    writer.writerow([str(idx1), str(idx2), nameEmailData[idx1], nameEmailData[idx2], documents[docId1], documents[docId2], index.similarity_by_id(docId1)[docId2], 'fn'])
                else: #Correctly left unmatched
                    tn += 1

        os.remove(inPath)

        #Add reflexive results to the tp's :-)
        tp += len(indices)

        precision = self._safeRatio(tp, tp + fp)
        recall = self._safeRatio(tp, tp + fn)
        fmeasure = self._safeRatio(2 * precision * recall, precision + recall)
        return [tp, fp, fn, tn, precision, recall, fmeasure]

    def runTraining(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', levThrRange=[0.7, 0.8, 0.9], minLenRange=[2, 3, 4], kRange=[0.9, 0.95, 1.0], cosThrRange=[0.7, 0.75, 0.8, 0.85], nrIterations=10, nrProcesses=1):
        """
        Loads the data set for each iteration, running all parameter combinations on the LSA algorithm.

        The output file of each parameter combination is augmented with the type of
        result (tp, fp, fn) using self.computeResults(). For each iteration a separate
        summary file is written with one row per parameter combination, including the
        precision, recall and f-measure.

        In:  for 0 <= i < nrIterations :: dataPath/trainingWorse_i.csv
             dataPath/aliasToIdNameUTF8.dict
        Out: for 0 <= i < nrIterations, per parameter combination ::
                 resultsPath/trainingWorse/resultsWorse_i_<levThr>_<minLen>_<k>_<cosThr>.csv
             for 0 <= i < nrIterations ::
                 resultsPath/trainingWorse/resultsTrainingWorse_i.csv

        The list defaults are only read, never mutated.
        """
        # The file never imports itertools at module level, so import it here.
        import itertools

        #Make sure the resultsPath exists. If not, create it.
        self._ensureDirectory(resultsPath)
        #Similar for the trainingWorse subdirectory, as training and testing is separated.
        trainingDir = os.path.join(resultsPath, 'trainingWorse')
        self._ensureDirectory(trainingDir)

        nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(cosThrRange)

        #Loop all iterations
        for itIdx in range(nrIterations):
            #Load the data
            nameEmailData = self._loadNameEmailData(os.path.join(dataPath, 'trainingWorse_%d.csv' % itIdx))

            #Load the oracle for the data
            aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
            oracle = computeOracle(nameEmailData, aliasToIdName)

            with open(os.path.join(trainingDir, 'resultsTrainingWorse_%d.csv' % itIdx), 'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow(self._SUMMARY_HEADER)

                # product() varies the last range fastest, i.e. the same order as
                # nested loops over levThr, minLen, k, cosThr.
                combinations = itertools.product(levThrRange, minLenRange, kRange, cosThrRange)
                for run, (levThr, minLen, k, cosThr) in enumerate(combinations, 1):
                    parameters = {
                        "levenshteinSimRatio": levThr,
                        "minLength": minLen,
                        "cosineSimRatio": cosThr,
                        "rankReductionRatio": k,
                    }
                    runName = 'resultsWorse_%d_%.2f_%d_%.2f_%.2f' % (itIdx, levThr, minLen, k, cosThr)
                    preOracleName = runName + '_preoracle.csv'

                    #Run the LSA algorithm on these parameters
                    self._runLsa(nameEmailData, parameters, resultsPath, 'trainingWorse', preOracleName, nrProcesses)

                    #Now check the results using the oracle
                    results = self.computeResults(trainingDir, preOracleName, runName + '.csv', oracle, nameEmailData)
                    writer.writerow([levThr, minLen, k, cosThr] + list(results))
                    print('Run %d out of %d in iteration %d...' % (run, nrRuns, itIdx))

    def runTesting(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', nrIterations=10, nrProcesses=1):
        """
        Loads the results of the training for each iteration, grabbing the parameter combination with the best f-measure.

        This parameter combination is used for the LSA algorithm on the worst case
        testing set. Similar to training, the output file is augmented with the type
        of result (tp, fp, fn) using self.computeResults(). For each iteration a
        separate summary file is written, containing the results on the testing set
        with the best parameter combination, including the precision, recall and
        f-measure.

        In:  for 0 <= i < nrIterations :: resultsPath/trainingWorse/resultsTrainingWorse_i.csv
             for 0 <= i < nrIterations :: dataPath/testWorse_i.csv
             dataPath/aliasToIdNameUTF8.dict
        Out: for 0 <= i < nrIterations :: resultsPath/testingWorse/resultsWorse_i.csv
             for 0 <= i < nrIterations :: resultsPath/testingWorse/resultsTestingWorse_i.csv

        Raises IndexError when a training summary contains no result rows.
        """
        #Make sure the resultsPath exists. If not, create it.
        self._ensureDirectory(resultsPath)
        #Similar for the testingWorse subdirectory, as training and testing is separated.
        testingDir = os.path.join(resultsPath, 'testingWorse')
        self._ensureDirectory(testingDir)

        #Load the alias dictionary used to compute the oracle
        aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

        #Loop all iterations
        for itIdx in range(nrIterations):
            #Load the training results: levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
            trainingSummary = os.path.join(resultsPath, 'trainingWorse', 'resultsTrainingWorse_%d.csv' % itIdx)
            with open(trainingSummary, 'rb') as f:
                reader = csv.reader(f, delimiter=';')
                next(reader, None) #skip header
                resultList = list(reader)

            if not resultList:
                # IndexError is kept on purpose: it is what the former resultList[0]
                # lookup raised for an empty summary.
                raise IndexError('No training results found in %s' % trainingSummary)

            #Grab the record with the highest f-measure (the first one on ties).
            best = max(resultList, key=lambda record: float(record[10]))
            bestLevThr = float(best[0])
            bestMinLen = int(best[1])
            bestK = float(best[2])
            bestCosThr = float(best[3])

            with open(os.path.join(testingDir, 'resultsTestingWorse_%d.csv' % itIdx), 'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow(self._SUMMARY_HEADER)

                parameters = {
                    "levenshteinSimRatio": bestLevThr,
                    "minLength": bestMinLen,
                    "rankReductionRatio": bestK,
                    "cosineSimRatio": bestCosThr,
                }

                #Read the data from the testing subset
                nameEmailData = self._loadNameEmailData(os.path.join(dataPath, 'testWorse_%d.csv' % (itIdx)))

                preOracleName = 'resultsWorse_%d_preoracle.csv' % (itIdx)
                self._runLsa(nameEmailData, parameters, resultsPath, 'testingWorse', preOracleName, nrProcesses)

                #Compute the oracle to verify results
                oracle = computeOracle(nameEmailData, aliasToIdName)

                #Now check the results using the oracle
                results = self.computeResults(testingDir, preOracleName, 'resultsWorse_%d.csv' % (itIdx), oracle, nameEmailData)
                writer.writerow([bestLevThr, bestMinLen, bestK, bestCosThr] + list(results))
                print('Done computing results on iteration %d' % (itIdx))
if __name__ == "__main__":
    # Experiment set-up: how many data-set iterations to process (out of 10)
    # and how many CPU cores the LSA algorithm may use.
    iterationCount = 10
    coreCount = 2

    # Parameter grid explored during training.
    parameterGrid = dict(
        levThrRange=[0.5, 0.6, 0.7, 0.8, 0.9],
        minLenRange=[2, 3, 4],
        kRange=[0.8, 0.85, 0.9, 0.95, 1.0],
        cosThrRange=[0.6, 0.65, 0.7, 0.75, 0.8, 0.85],
    )

    # Train on every parameter combination first, then test with the best one.
    tester = ICSMTests()
    tester.runTraining(nrIterations=iterationCount, nrProcesses=coreCount, **parameterGrid)
    tester.runTesting(nrIterations=iterationCount, nrProcesses=coreCount)