forked from crwong/cs224u-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf_parse.py
153 lines (134 loc) · 4.79 KB
/
tfidf_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import buildwd
import shallownn
import random
import numpy as np
from sklearn import linear_model
from sklearn import neighbors
# Corpus variant tag; it is substituted into both the directory name and the
# file name of TRAIN_FILE below.
SUFFIX = 'micro'
# Path of the corpus file that is passed to tfidf_logreg when this module is
# run as a script.
TRAIN_FILE = 'data/topics_%s/ALL_CLEAN_%s.txt' % (SUFFIX, SUFFIX)
def tfidf(mat=None, rownames=None):
    """Reweight every row of mat with TF-IDF and return (weights, rownames).

    Term frequencies are normalised by the column sums of mat, and the
    document count is the number of columns of mat. rownames plays no part
    in the computation; it is accepted and handed back unchanged so that the
    signature lines up with the other reweighting methods used here.
    """
    col_totals = np.sum(mat, axis=0)
    n_docs = mat.shape[1]
    weighted_rows = []
    for term_row in mat:
        weighted_rows.append(_tfidf_row_func(term_row, col_totals, n_docs))
    return (np.array(weighted_rows), rownames)
def _tfidf_row_func(row, colsums, doccount):
df = float(len([x for x in row if x > 0]))
idf = 0.0
# This ensures a defined IDF value >= 0.0:
if df > 0.0 and df != doccount:
idf = np.log(doccount / df)
tfs = row/colsums
return tfs * idf
def get_tfidf_logreg(train_file):
    """Fit a logistic-regression model on averaged TF-IDF document features.

    The matrix returned by buildwd.buildWD is reweighted with tfidf(). Each
    line of train_file whose first token is one of the known column names
    becomes one feature row: the mean of the TF-IDF rows of its known words
    (tokens from the third one on, after buildwd.processWord). Rows are then
    shuffled with a fixed seed and the first 70% are used for fitting.

    Args:
        train_file: path of the corpus file; it is handed to buildwd.buildWD
            and also read line by line here.

    Returns:
        (logreg, train, labels, cutoff): the fitted model, the shuffled
        feature rows, the matching labels, and the index that separates the
        fitted part [0:cutoff] from the held-out part [cutoff:].
    """
    wd = buildwd.buildWD(train_file)
    mat = wd[0]
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    # tfidf() returns (weights, rownames); only the weight matrix is needed.
    weights = tfidf(mat, rownames)[0]
    n_features = mat.shape[1]

    # Build the lookups once instead of scanning the lists for every token.
    # setdefault keeps the first index of a name, like list.index does.
    # Assumes column/row names and processed words are hashable (strings).
    doc_ids = set(colnames)
    row_index = {}
    for i, name in enumerate(rownames):
        row_index.setdefault(name, i)

    trainMat = np.zeros((len(colnames), n_features))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token; skip them and unknown documents.
            if not words or words[0] not in doc_ids:
                continue
            trainRow = np.zeros(n_features)
            numWords = 0
            for word in words[2:]:
                idx = row_index.get(buildwd.processWord(word))
                if idx is not None:
                    numWords += 1
                    trainRow = trainRow + weights[idx]
            # Leave the all-zero row untouched when no word was recognised.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE with a fixed seed so the split is reproducible. list() is
    # required because random.shuffle cannot shuffle a Python 3 range.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]

    cutoff = int(len(order) * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
def tfidf_logreg(train_file):
    """Return the logistic-regression score on the held-out part of the data.

    The model, rows, labels and split index all come from get_tfidf_logreg;
    the rows and labels from the split index onward are the ones scored.
    """
    model, rows, targets, split = get_tfidf_logreg(train_file)
    held_out_rows = rows[split:]
    held_out_targets = targets[split:]
    return model.score(held_out_rows, held_out_targets)
def tfidf_knn(train_file):
    """Score a 10-nearest-neighbour classifier on averaged TF-IDF features.

    Feature rows are built as follows: every line of train_file whose first
    token is a known column name becomes the mean of the TF-IDF rows of its
    known words (tokens from the third one on, after buildwd.processWord).
    The first 70% of the rows, in file order, are used for fitting and the
    rest for scoring.

    Args:
        train_file: path of the corpus file; it is handed to buildwd.buildWD
            and also read line by line here.

    Returns:
        The value of knn.score on the held-out rows.
    """
    wd = buildwd.buildWD(train_file)
    mat = wd[0]
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    # tfidf() returns (weights, rownames); only the weight matrix is needed.
    weights = tfidf(mat, rownames)[0]
    n_features = mat.shape[1]

    # Build the lookups once instead of scanning the lists for every token.
    # setdefault keeps the first index of a name, like list.index does.
    # Assumes column/row names and processed words are hashable (strings).
    doc_ids = set(colnames)
    row_index = {}
    for i, name in enumerate(rownames):
        row_index.setdefault(name, i)

    trainMat = np.zeros((len(colnames), n_features))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token; skip them and unknown documents.
            if not words or words[0] not in doc_ids:
                continue
            trainRow = np.zeros(n_features)
            numWords = 0
            for word in words[2:]:
                idx = row_index.get(buildwd.processWord(word))
                if idx is not None:
                    numWords += 1
                    trainRow = trainRow + weights[idx]
            # Guard the mean: dividing by zero would turn the row into NaN.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Slice bounds must be integers; a float bound raises TypeError on
    # current NumPy. int() truncates, matching the legacy float behaviour.
    # NOTE(review): unlike get_tfidf_logreg, the rows are not shuffled
    # before the split - confirm that this is intended.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
"""
Doesn't really work
"""
def tfidf_shallownn(train_file):
    """Score a shallow neural network on averaged TF-IDF features.

    NOTE: the original author marked this experiment as not really working.

    Feature rows are built as follows: every line of train_file whose first
    token is a known column name becomes the mean of the TF-IDF rows of its
    known words (tokens from the third one on, after buildwd.processWord).
    Labels are two-column indicator rows: column 0 is set for the subject
    'Sports', column 1 for 'Politics'; any other subject stays all zero.
    The first 70% of the rows, in file order, are used for training and the
    rest for scoring.

    Args:
        train_file: path of the corpus file; it is handed to buildwd.buildWD
            and also read line by line here.

    Returns:
        The value of snn.score on the held-out rows.
    """
    wd = buildwd.buildWD(train_file)
    mat = wd[0]
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    # tfidf() returns (weights, rownames); only the weight matrix is needed.
    weights = tfidf(mat, rownames)[0]
    n_features = mat.shape[1]

    # Build the lookups once instead of scanning the lists for every token.
    # setdefault keeps the first index of a name, like list.index does.
    # Assumes column/row names and processed words are hashable (strings).
    doc_ids = set(colnames)
    row_index = {}
    for i, name in enumerate(rownames):
        row_index.setdefault(name, i)

    trainMat = np.zeros((len(colnames), n_features))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token; skip them and unknown documents.
            if not words or words[0] not in doc_ids:
                continue
            trainRow = np.zeros(n_features)
            numWords = 0
            for word in words[2:]:
                idx = row_index.get(buildwd.processWord(word))
                if idx is not None:
                    numWords += 1
                    trainRow = trainRow + weights[idx]
            # Guard the mean: dividing by zero would turn the row into NaN.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = np.zeros((len(subjects), 2))
    for i, subject in enumerate(subjects):
        if subject == 'Sports':
            trainVals[i, 0] = 1
        elif subject == 'Politics':
            trainVals[i, 1] = 1

    # Slice bounds must be integers; a float bound raises TypeError on
    # current NumPy. int() truncates, matching the legacy float behaviour.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(trainMat[:cutoff, :], trainVals[:cutoff, :], display_progress=True, maxiter=10)
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
if __name__ == "__main__":
    # Script entry point: run the logistic-regression experiment on the
    # default corpus. The other two experiments are kept disabled below.
    # score_shallownn = tfidf_shallownn(TRAIN_FILE)
    # print('ShallowNN: %s' % (score_shallownn,))
    # score_knn = tfidf_knn(TRAIN_FILE)
    # print('KNN: %s' % (score_knn,))
    score_logreg = tfidf_logreg(TRAIN_FILE)
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('LogReg:  %s' % (score_logreg,))