-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathmultibagged.py
46 lines (40 loc) · 1.85 KB
/
multibagged.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import multiprocess
import random
import numpy as np
from sklearn import linear_model
TRAIN_FILES = ['android_small.txt', 'basic_small.txt', 'coffee_small.txt',
'dontjudgeme_small.txt', 'earthquake_small.txt', 'egypt_small.txt',
'election_small.txt', 'freedom_small.txt', 'god_small.txt',
'haiti_small.txt', 'happy_small.txt', 'harrypotter_small.txt',
'healthcare_small.txt', 'immigration_small.txt', 'indonesia_small.txt',
'ipod_small.txt', 'love_small.txt', 'mubarak_small.txt', 'obama_small.txt',
'obamacare_small.txt', 'question_small.txt', 'sotu_small.txt',
'teaparty_small.txt', 'tsunami_small.txt', 'usa_small.txt', 'win_small.txt',
'wiunion_small.txt']
def bag_logreg():
mat, vals = multiprocess.buildDataset(TRAIN_FILES)
row_sums = mat.sum(axis=1)
mat = mat / row_sums[:, np.newaxis]
trainSize = 0.7*len(vals)
trainIndexes = random.sample(range(len(vals)), len(vals))
trainMat = np.zeros((trainSize, mat.shape[1]))
trainVals = np.zeros(trainSize)
testMat = np.zeros((len(vals)-trainSize, mat.shape[1]))
testVals = np.zeros(len(vals)-trainSize)
print trainMat.shape
index = 0
for ti in trainIndexes:
if index < trainMat.shape[0]:
trainMat[index, :] = mat[ti, :]
trainVals[index] = vals[ti]
elif index - trainMat.shape[0] < testMat.shape[0]:
testMat[index - trainMat.shape[0], :] = mat[ti, :]
testVals[index - trainMat.shape[0]] = vals[ti]
index += 1
logreg = linear_model.LogisticRegression()
print "Beginning training of model"
logreg.fit(trainMat, trainVals)
print "Finished training of model"
print logreg.score(trainMat, trainVals)
return logreg.score(testMat, testVals)
score = bag_logreg()