-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path035_UNIVARIATE_VARIABLES_SELECTION.py
67 lines (44 loc) · 1.82 KB
/
035_UNIVARIATE_VARIABLES_SELECTION.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Univariate variable selection -- setup and data loading.
#
# Load the shared project helpers into this script's namespace. Names used
# below without a local definition (pd, np, create_dir, ...) are presumably
# provided by Utils.py -- confirm against that file.
# NOTE: exec() runs Utils.py verbatim, so that file must stay trusted.
with open("Utils.py") as utils_source:
    exec(utils_source.read(), globals())

# Seed identifying the data split; it selects both the input and the output
# directories below.
SEED = 741

# NOTE(review): the splitting step is disabled here; pre_training_set.csv is
# presumably produced by running it beforehand -- confirm.
#exec(open("8015_SPLITTING_DATA.py").read(), globals())

# Output directory for this seed's variable-selection results.
dir_var_sel = 'results/VARIABLE_SELECTION/' + str(SEED) + '/'
create_dir(dir_var_sel)

# Input data for this seed.
dir_data = 'DATA/CLASSIFICATION/' + str(SEED) + '/'
variable_sub_dataset = pd.read_csv(dir_data + "pre_training_set.csv")

# Not referenced in this script's visible code; kept because other scripts
# sharing this namespace may read it.
njobs = 2

# Single-argument call form prints the same text on Python 2 and Python 3.
print('The dimension of dataset for variable selection is ' + str(variable_sub_dataset.shape))

target_variable = 'Y'
col_energy = 'ENERGY'

# Every column except the target and the energy column is a candidate predictor.
predictors = variable_sub_dataset.columns.drop([target_variable, col_energy])
X = variable_sub_dataset[predictors]

# Forward-fill missing values (equivalent to fillna(method='ffill'), which is
# deprecated in recent pandas). Forward filling cannot fill a gap that has no
# earlier valid row, so NaNs at the very top of a column remain.
X = X.ffill()

Y = variable_sub_dataset[target_variable]
# Table collecting one row per predictor and one column per univariate score.
variable_score = pd.DataFrame()

# Classifier instance reused later for the per-variable accuracy score.
log = LogisticRegression()

variable_score['VARIABLE'] = predictors

# F-test of each predictor against the target (presumably scikit-learn's
# f_classif -- confirm in Utils.py); only the p-value is stored.
F_value, p_value = f_classif(X, Y)
variable_score['ANOVA_pvalue'] = p_value

# Mutual information between each predictor and the target. The estimator is
# randomised, so it is seeded with SEED to make the written scores repeatable
# from run to run (assumes scikit-learn's mutual_info_classif signature).
IG = mutual_info_classif(X, Y, random_state=SEED)
variable_score['INFORMATION_GAIN'] = IG

# 90th percentile of the information-gain scores; not used further in this
# script's visible code, kept for any code sharing this namespace.
indexes_var = np.percentile(IG, 90)
# Score each predictor by the accuracy of a logistic regression fitted on
# that predictor alone. The model is evaluated on the same rows it was fitted
# on, so this is an in-sample accuracy.
accuracy = []
for var in predictors:
    # Keep the single column two-dimensional, as the estimator expects.
    x = X[[var]]

    # Second column of predict_proba for every row, thresholded at 0.5 to
    # obtain 0/1 predictions.
    # NOTE(review): comparing these 0/1 values with Y assumes Y is coded 0/1
    # and that column 1 is the probability of label 1 -- confirm.
    pred = log.fit(x, Y).predict_proba(x)
    prediction_log = (pred[:, 1] > 0.5) * 1

    current_accuracy = skl.metrics.accuracy_score(Y, prediction_log)
    accuracy.append(current_accuracy)

# Same order as `predictors`, hence aligned with the VARIABLE column.
variable_score['LR_ACCURACY'] = accuracy
# Work on a copy so that variable_score keeps the raw scores.
univariate_var_sel = variable_score.copy()

# Replace the raw information-gain and accuracy scores by their ranks.
# pandas' default ranking is ascending with ties averaged, so the lowest
# score gets rank 1. ANOVA_pvalue is left as the raw p-value.
univariate_var_sel['INFORMATION_GAIN'] = univariate_var_sel['INFORMATION_GAIN'].rank()
univariate_var_sel['LR_ACCURACY'] = univariate_var_sel['LR_ACCURACY'].rank()

# Write the table into this seed's results directory, without the row index.
univariate_var_sel.to_csv(dir_var_sel + 'univariate_var_sel.csv', index=False)