-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutil.py
135 lines (120 loc) · 4.94 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os, random, operator, sys
from collections import Counter
def dotProduct(d1, d2):
    """
    Compute the dot product of two sparse feature vectors.
    @param dict d1: a feature vector represented by a mapping from a feature (string) to a weight (float).
    @param dict d2: same as d1
    @return float: the dot product between d1 and d2
    """
    # Walk the shorter vector and probe the longer one, so the number of
    # lookups is bounded by the smaller of the two sizes.
    if len(d1) < len(d2):
        shorter, longer = d1, d2
    else:
        shorter, longer = d2, d1
    return sum(longer.get(feature, 0) * value for feature, value in shorter.items())
def increment(d1, scale, d2):
    """
    In-place sparse update: d1 += scale * d2.
    @param dict d1: the feature vector which is mutated.
    @param float scale: multiplier applied to every weight of d2.
    @param dict d2: a feature vector; features missing from d1 are treated as 0.
    """
    # Snapshot d2's entries before writing so the update is safe even if
    # d1 and d2 are the same object.
    for feature, value in tuple(d2.items()):
        delta = value * scale
        d1[feature] = d1.get(feature, 0) + delta
def readExamples(path):
    '''
    Reads a set of training examples.
    Each non-blank line has the format: <output label (+1 or -1)> <input sentence>
    Blank (whitespace-only) lines are skipped.
    @param str path: file to read; decoded as ISO-8859-1.
    @return list: (sentence, label) pairs, sentence stripped and label as int.
    Raises ValueError if a non-blank line has no space separating the label
    from the sentence, or if the label is not an integer.
    '''
    examples = []
    # The context manager closes the file even if a malformed line raises.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            # A trailing empty line would otherwise fail to unpack below.
            if not line.strip():
                continue
            # Format of each line: <output label (+1 or -1)> <input sentence>
            y, x = line.split(' ', 1)
            examples.append((x.strip(), int(y)))
    print('Read %d examples from %s' % (len(examples), path))
    return examples
def evaluatePredictor(examples, predictor):
    '''
    Measure how often |predictor| disagrees with the labelled examples.
    examples: a list of (x, y) pairs.
    predictor: a function that takes an x and returns a predicted y.
    Returns the fraction of misclassified examples as a float.
    Dividing by len(examples) means an empty list raises ZeroDivisionError.
    '''
    mistakes = sum(1 for x, y in examples if predictor(x) != y)
    return 1.0 * mistakes / len(examples)
def outputWeights(weights, path):
    '''
    Write a weight vector to |path|, one "<feature>\\t<weight>" line per
    feature, ordered from the largest weight to the smallest.
    @param dict weights: mapping from feature to numeric weight.
    @param str path: output file, written as UTF-8 (overwritten if present).
    '''
    print("%d weights" % len(weights))
    # Sort on the negated weight so the heaviest features come first.
    ranked = sorted(weights.items(), key=lambda f_v: -f_v[1])
    # The context manager closes the file even if a write fails part-way.
    with open(path, 'w', encoding='utf-8') as out:
        for f, v in ranked:
            print('\t'.join([str(f), str(v)]), file=out)
def verbosePredict(phi, y, weights, out):
    '''
    Predict a label for one feature vector and write an explanation to |out|.
    phi: feature vector (feature -> value).
    y: the true label, or a falsy value (e.g. None) when it is unknown.
    weights: weight vector (feature -> weight); missing features count as 0.
    out: writable text stream that receives the report.
    Returns the predicted label: 1 if the score is positive, otherwise -1.
    '''
    score = dotProduct(phi, weights)
    yy = 1 if score > 0 else -1

    if not y:
        print('Prediction:', yy, file=out)
    else:
        verdict = 'CORRECT' if y == yy else 'WRONG'
        print('Truth: %s, Prediction: %s [%s]' % (y, yy, verdict), file=out)

    def negatedContribution(item):
        # Negate so that the largest value * weight products sort first.
        return -item[1] * weights.get(item[0], 0)

    for f, v in sorted(phi.items(), key=negatedContribution):
        w = weights.get(f, 0)
        print("%-30s%s * %s = %s" % (f, v, w, v * w), file=out)
    return yy
def outputErrorAnalysis(examples, featureExtractor, weights, path):
    '''
    Write a per-example prediction report to |path|.
    For every (x, y) in |examples| this writes a "=== x" header followed by
    the output of verbosePredict on the extracted features.
    examples: iterable of (x, y) pairs.
    featureExtractor: function mapping x to a feature vector.
    weights: weight vector passed through to verbosePredict.
    path: output file, written as UTF-8 (overwritten if present).
    '''
    # The context manager closes the file even if feature extraction or
    # prediction raises on some example.
    with open(path, 'w', encoding='utf-8') as out:
        for x, y in examples:
            print('===', x, file=out)
            verbosePredict(featureExtractor(x), y, weights, out)
def interactivePrompt(featureExtractor, weights):
    '''
    Read lines from standard input until end-of-file and, for each one,
    print the verbose prediction to standard output.
    featureExtractor: function mapping the raw input line (trailing newline
        included) to a feature vector.
    weights: weight vector passed through to verbosePredict.
    '''
    while True:
        # The prompt has no trailing newline, so flush explicitly; otherwise
        # a line-buffered stdout may not show it before readline() blocks.
        print('> ', end=' ', flush=True)
        x = sys.stdin.readline()
        # readline() returns an empty string only at end-of-file.
        if not x:
            break
        phi = featureExtractor(x)
        verbosePredict(phi, None, weights, sys.stdout)
############################################################
def generateClusteringExamples(numExamples, numWordsPerTopic, numFillerWords):
    '''
    Generate artificial examples inspired by sentiment for clustering.
    Each review has a hidden sentiment (positive or negative) and a topic (plot, acting, or music).
    A review is built from 2 sentiment-word draws, 4 topic-word draws and
    1 filler-word draw; repeated draws of the same word add to its count, e.g.:
    good:1 great:1 plot1:2 plot7:1 plot9:1 filler0:1
    numExamples: Number of examples to generate
    numWordsPerTopic: Number of words per topic (e.g., plot0, plot1, ...)
    numFillerWords: Number of distinct filler words (e.g., filler0, filler1, ...)
    Returns a list of Counter objects (word -> count).
    Note: the global random generator is re-seeded with 42 on every call, so
    the output is reproducible for a given set of arguments.
    '''
    sentiments = [['bad', 'awful', 'worst', 'terrible'], ['good', 'great', 'fantastic', 'excellent']]
    topics = ['plot', 'acting', 'music']

    def numberedWord(prefix, vocabularySize):
        # e.g. 'plot' -> 'plot3'; the index is uniform over 0..vocabularySize-1.
        return prefix + str(random.randint(0, vocabularySize - 1))

    def generateExample():
        bag = Counter()
        # Two sentiment words, both drawn from the same polarity.
        polarityWords = random.choice(sentiments)
        for _ in range(2):
            bag[random.choice(polarityWords)] += 1
        # Four topic words, all drawn from one fixed topic.
        theme = random.choice(topics)
        for _ in range(4):
            bag[numberedWord(theme, numWordsPerTopic)] += 1
        # One filler word.
        bag[numberedWord('filler', numFillerWords)] += 1
        return bag

    random.seed(42)
    return [generateExample() for _ in range(numExamples)]
def outputClusters(path, examples, centers, assignments):
    '''
    Output the clusters to the given path.
    For each cluster j the file lists its non-zero center weights (largest
    first) followed by the feature names of every example assigned to it.
    path: output file, written as UTF-8 (overwritten if present).
    examples: list of feature vectors (feature -> value).
    centers: list of cluster centers, each a feature vector.
    assignments: assignments[i] is the index in |centers| of the cluster
        that examples[i] belongs to.
    '''
    print('Outputting clusters to %s' % path)
    # The context manager closes the file even if a write fails; the explicit
    # encoding matches the other writers in this module.
    with open(path, 'w', encoding='utf-8') as out:
        for j, center in enumerate(centers):
            print('====== Cluster %s' % j, file=out)
            print('--- Centers:', file=out)
            # Sort on the negated weight so the heaviest features come first.
            for k, v in sorted(center.items(), key=lambda k_v: -k_v[1]):
                if v != 0:
                    print('%s\t%s' % (k, v), file=out)
            print('--- Assigned points:', file=out)
            for i, z in enumerate(assignments):
                if z == j:
                    print(' '.join(examples[i].keys()), file=out)