-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5_petAppeal_GridSearchCV_RandForest.py
executable file
·116 lines (87 loc) · 4.3 KB
/
5_petAppeal_GridSearchCV_RandForest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
import petAppeal
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# ---------------------------------------------------------------------------
# Load the cleaned Petfinder CSV, reduce it to the modelling columns, encode
# it, and split it into train/test arrays.
# ---------------------------------------------------------------------------
local_file_path = '/'
petfinder_file = local_file_path + 'petfinder_data_clean.csv'
cats_dogs = pd.read_csv(petfinder_file)

# Keep only the rows whose `animal` column is 'Cat' or 'Dog'.
cats_dogs = cats_dogs[(cats_dogs.animal == 'Cat') | (cats_dogs.animal == 'Dog')]

# Columns that are dropped before modelling.
drop_cols = ['Unnamed: 0', 'address1', 'address2', 'email', 'pet_id', 'phone',
             'breed', 'lastUpdate', 'name', 'photos', 'description', 'zip',
             'city', 'state', 'shelter_id', 'fax', 'id']
cats_dogs = cats_dogs.drop(drop_cols, axis=1)

## Check for class imbalance; downsample if necessary
cats_dogs = petAppeal.balance_check(cats_dogs, 'status')
cats_dogs_encoded = petAppeal.encode_data(cats_dogs)

# Split the target (`status`) from the feature columns.
y = cats_dogs_encoded.status
drop_cols = ['status']
cats_dogs_encoded = cats_dogs_encoded.drop(drop_cols, axis=1)
x = np.array(cats_dogs_encoded)

## encode labels
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(y)
reversed_labels = le.inverse_transform(encoded_labels)

# Class names must be listed in the encoder's (sorted) order so that index i
# of `Classes` names the class encoded as integer i.  `y.unique()` returns
# values in order of first appearance, which can differ from that order and
# would mislabel the confusion-matrix axes plotted later.
Classes = le.classes_
y = encoded_labels

# Hold out 20% of the rows for the final evaluation; fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Hyper-parameter search space handed to GridSearchCV for RandomForestClassifier.
#
# NOTE(review): the Cartesian product of these lists is extremely large, and
# `n_jobs` / `warm_start` are listed as search dimensions even though they are
# execution settings rather than model hyper-parameters -- consider trimming.
# NOTE(review): integer `max_features` up to 17 presumably matches the number
# of encoded feature columns -- confirm against the encoded data.
_shared_grid = {
    'n_estimators': list(range(1, 110, 10)),
    'criterion': ["gini", "entropy"],
    # A range object cannot be concatenated with a list on Python 3, so it is
    # materialised first (this form also works on Python 2).
    'max_features': list(range(1, 18)) + ["sqrt", "log2"],
    'max_depth': list(range(1, 55, 5)),
    'min_samples_split': list(range(10, 110, 10)),
    'min_samples_leaf': list(range(10, 110, 10)),
    # scikit-learn only accepts values in [0.0, 0.5] for this parameter; the
    # previous 0.75 and 1.0 entries could only produce failed fits.
    'min_weight_fraction_leaf': [0.0, 0.25, 0.50],
    'n_jobs': [-1, 1],
    'random_state': [1, 3, 5, None],
    'warm_start': [True, False],
    # Duplicate {0: 1, 1: 1} entries removed; they only repeated identical candidates.
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 5}],
}

# Out-of-bag scoring is only available when bootstrap sampling is enabled, so
# the grid is split into two sub-grids that contain valid combinations only.
param_grid = [
    dict(_shared_grid, bootstrap=[True], oob_score=[True, False]),
    dict(_shared_grid, bootstrap=[False], oob_score=[False]),
]
# Run one full grid search per scoring metric (macro-averaged precision, then
# macro-averaged recall), reporting the cross-validation results and the
# held-out test performance for each.
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)

    # Three stratified shuffle splits of the training data, 20% validation each.
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
    clf = GridSearchCV(
        RandomForestClassifier(),
        param_grid=param_grid,
        cv=sss,
        verbose=5,
        scoring='%s_macro' % score,
    )
    clf.fit(x_train, y_train)

    print("Best parameters set found on development set:", clf.best_params_)

    # One line per candidate: mean CV score, two standard deviations, settings.
    print("Grid scores on development set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    candidates = clf.cv_results_['params']
    for avg, spread, candidate in zip(means, stds, candidates):
        print("%0.3f (+/-%0.03f) for %r" % (avg, spread * 2, candidate))

    # Score the winning candidate on the held-out test split.
    y_true = y_test
    y_pred = clf.predict(x_test)
    print("Detailed classification report:", classification_report(y_true, y_pred))

# Persist the cross-validation table.  `clf` here is the search from the last
# loop iteration, so the CSV holds the results of the 'recall' search.
rForest_GridSearch_results = pd.DataFrame(clf.cv_results_)
rForest_GridSearch_results.to_csv(local_file_path + 'rForest_GridSearch_results.csv')
# Use best estimator found in GridSearch for initial modeling.
# `clf` is the search object left over from the last grid search that ran.
model_rForest = clf.best_estimator_
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# explicit fit is presumably redundant -- kept to preserve existing behaviour.
model_rForest.fit(x_train, y_train)
# Accuracy on the training data (previously computed and thrown away).
print('Training accuracy: %s' % model_rForest.score(x_train, y_train))

# Predictions on the held-out test split.
y_pred = model_rForest.predict(x_test)
# Second column of predict_proba: probability of the class encoded as 1.
y_pred_prob = model_rForest.predict_proba(x_test)[:, 1]

# Confusion matrix, plotted both as raw counts and normalised.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
petAppeal.plot_confusion_matrix(cnf_matrix, classes=Classes, title='Random Forest - Default Model')
petAppeal.plot_confusion_matrix(cnf_matrix, classes=Classes, normalize=True, title='Random Forest - Default Model')

# Single-argument print calls behave identically on Python 2 and Python 3;
# the former `print 'x', y` statements were a SyntaxError on Python 3.
print('Accuracy: %s' % accuracy_score(y_test, y_pred))
print('Precision: %s' % precision_score(y_test, y_pred))
print('Recall: %s' % recall_score(y_test, y_pred))
print('F1: %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature importances of the forest, plus their spread across individual trees.
importances = model_rForest.feature_importances_
std = np.std([tree.feature_importances_ for tree in model_rForest.estimators_], axis=0)
featureHeaders = list(cats_dogs_encoded)  # column names of the feature frame
petAppeal.plot_feature_importance(x_train, importances, featureHeaders, 'black')

# Persist the trained classifier through the project helper.
model_name = 'petfinder_trained_RF_classifier'
petAppeal.saveVar(model_rForest, model_name)