data_classification_only.py
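"""
Train and evaluate classifiers on features extracted from a MySQL database, using the
manually assigned labels only: a neural network (train_nn), an SVM (train_svm) and a
Naive Bayes model (train_nb). The data are split into a fixed test set and a
train/validation pool that is re-split over 100 runs; results are written to results.txt
and accuracy/loss plots are saved in the results folder.
"""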
from utils.utils_ml import features_list, normalise
import mysql.connector
import argparse
from pathlib import Path
import tensorflow as tf
import random
import numpy as np
from utils.utils_hyperparameters import get_parameter_sets, get_parameters
from collections import Counter
from datetime import datetime
import os
from utils.utils_train import train_nn, train_svm, train_nb
from sklearn.model_selection import train_test_split
from utils.utils_ml import train_val_split, categoric
from tensorflow.keras import optimizers
import _io
from utils.utils_print import print_file_test, print_file_val
import matplotlib.pyplot as plt
def labels_and_features(mycursor: mysql.connector.cursor, length: int):
"""
Extract from database features that will then be used to train machine learning algorithms. For the
always working case, extract features form feat_working2 database and keep only the first 15 of them.
Associate the workingworkingworkingworking label to them
@param mycursor: mysql.connector.cursor
@param length: number of sets of features
@return: arrays of true labels, nlp predicted labels and corresponding features
"""
Xa = []
sqldata_id = "select * from an"
mycursor.execute(sqldata_id)
recorddata_id = mycursor.fetchall() # all features from all documents
y = [] # manual labels
for results in recorddata_id:
ID = results[0] # get ID of each recorded failure
# create a string corresponding to the failure by adding the failure type and working words together
label = results[3] + results[1] + results[4] + results[2]
y.append(label)
features = features_list(mycursor, ID)
Xa.append(features[1:])
X_svm = normalise(Xa, length)
return y, X_svm
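
# Illustrative use of labels_and_features (a sketch assuming a local MySQL server whose
# database contains the 'an' table queried above; credentials are placeholders):
#   cnx = mysql.connector.connect(user='root', password='...', host='127.0.0.1', database='final')
#   y, X_normalised = labels_and_features(cnx.cursor(), length=117)
#   # y: one concatenated label string per record; X_normalised: normalised feature matrix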
def train_val_split_stratify(counter: int, inc: int, X_train_tot: np.ndarray, y_train_tot: np.ndarray,
X_test: np.ndarray, y_test: np.ndarray):
"""
Function verifies if both the NLP and the actual labels split contains
examples of each class. It increases inc until the random state selected
splits the data correctly. The function then outputs the split data
Inputs:
counter - how many times the script was run and the split was performed
inc - increment varies such that the split is correct
X_traintot - train and validation features
y_traintot - train and validation labels/predicted labels
X_test/NLP - test features
y_test/NLP - test labels/predicted labels
Outputs:
dictXy - containing the train/val/test features split and labels split
for both actual labels and NLP infered ones and the categorical
number associated with the labels
counter - does not change
inc - increment used for the random split
"""
# set random state to be the same for both predicted and actual labels
randomState = counter + inc
dictXy = {}
# split remaining data in train and validation data for actual labels
[X_train, X_val, y_train, y_val] = train_val_split(X_train_tot, y_train_tot, randomState)
    # in case the selected random state does not distribute the data evenly (some labels are not represented in
    # the train/validation set), increase randomState until the split is adequate
while len(Counter(y_train)) != len(Counter(y_val)):
inc += 1
randomState = counter + inc
# split remaining data in train and validation data for actual labels
[X_train, X_val, y_train, y_val] = train_val_split(X_train_tot, y_train_tot, randomState)
# transform the actual labels into numbers
[y_train_nn, y_val_nn, y_test_nn] = categoric(y_train, y_val, y_test)
dictXy['X_train'] = X_train
dictXy['X_val'] = X_val
dictXy['X_test'] = X_test
dictXy['y_train'] = y_train
dictXy['y_val'] = y_val
dictXy['y_test'] = y_test
dictXy['y_train_cat'] = y_train_nn
dictXy['y_val_cat'] = y_val_nn
dictXy['y_test_cat'] = y_test_nn
return dictXy, counter, inc
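
# Illustrative use of train_val_split_stratify (a sketch; X_train_tot, y_train_tot, X_test
# and y_test come from an earlier train_test_split, as in main() below):
#   dictXy, counter, inc = train_val_split_stratify(counter, inc, X_train_tot, y_train_tot,
#                                                   X_test, y_test)
#   # dictXy keys: 'X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test',
#   #              'y_train_cat', 'y_val_cat', 'y_test_cat'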
def get_parameters_2(random_bool: bool, list_of_params: list, list_of_params_svm: list, number_of_tests: int,
file: _io.TextIOWrapper):
"""
Function returning two dictionaries with the chosen hyperparameters for NN
and for the SVM. It also prints to the file the values
Inputs:
random - True if random search hyperparameter values wanted
False to use best hyperparameters found so far
list_of_params [number_of_tests x 6] - list of lists containing NN
hyperparameters
list_of_params_svm [number_of_tests x 4] - list of lists containing SVM
hyperparameters
number_of_tests - number of sets of randomly chosen parameters
corresponding to the number of times the scrips will be run
file - file to which hyperparameters are written
Outputs:
dict_nn - dictionary with hyperparameters for NN
dict_svm - dictionary with hyperparameters for SVM
"""
dict_nn = {}
dict_svm = {}
if random_bool is False:
dict_nn['no_hidden'] = 180
dict_nn['no_layers'] = 2
dict_nn['activation_fct'] = 'relu'
dict_nn['regularizer'] = 0.01
dict_nn['learning_rate'] = 0.01
dict_nn['number_of_epochs'] = 130
dict_svm['kernel'] = 'linear'
dict_svm['C'] = 1
dict_svm['gamma'] = 1
dict_svm['decision_function'] = 'ovo' # as in report
else:
dict_nn['no_hidden'] = list_of_params[number_of_tests][0]
dict_nn['no_layers'] = list_of_params[number_of_tests][1]
dict_nn['activation_fct'] = list_of_params[number_of_tests][2]
dict_nn['regularizer'] = list_of_params[number_of_tests][3]
dict_nn['learning_rate'] = list_of_params[number_of_tests][4]
dict_nn['number_of_epochs'] = list_of_params[number_of_tests][5]
dict_svm['kernel'] = list_of_params_svm[number_of_tests][0]
dict_svm['C'] = list_of_params_svm[number_of_tests][1]
dict_svm['gamma'] = list_of_params_svm[number_of_tests][2]
dict_svm['decision_function'] = list_of_params_svm[number_of_tests][3]
dict_nn['loss_fct'] = 'categorical_crossentropy' # used to be categorical crossentropy
dict_nn['decay_set'] = 1e-2 / dict_nn['number_of_epochs'] # used to be 1e-2/number_of_epochs
print(file=file)
print("No of hidden nodes: " + str(dict_nn['no_hidden']), file=file)
print(file=file)
print("Value of regularizer term " + str(dict_nn['regularizer']), file=file)
print(file=file)
print("Activation function is " + dict_nn['activation_fct'], file=file)
print(file=file)
print("Learning rate is " + str(dict_nn['learning_rate']), file=file)
print(file=file)
print("Momentum not used", file=file)
print(file=file)
print("Number of epochs " + str(dict_nn['number_of_epochs']), file=file)
print(file=file)
print("Number of hidden layers" + str(dict_nn['no_layers']), file=file)
print(file=file)
print("Loss function used is " + dict_nn['loss_fct'], file=file)
print(file=file)
print("Decay is " + str(dict_nn['decay_set']), file=file)
print("------------------------------", file=file)
print("SVM C is " + str(dict_svm['C']), file=file)
print(file=file)
print("SVM kernel is " + dict_svm['kernel'], file=file)
print(file=file)
print("SVM gamma is " + str(dict_svm['gamma']), file=file)
print(file=file)
print("SVM decision function is " + dict_svm['decision_function'], file=file)
return dict_nn, dict_svm
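
# Illustrative call of get_parameters_2 (a sketch; with random_bool=False the fixed
# hyperparameters above are used, so the list arguments and number_of_tests are ignored):
#   with open('results.txt', 'w') as f:
#       dict_nn, dict_svm = get_parameters_2(random_bool=False, list_of_params=[],
#                                            list_of_params_svm=[], number_of_tests=0, file=f)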
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--provided_labels",
type=int,
default=0,
)
parser.add_argument(
"--sql_password",
type=str,
)
parser.add_argument(
"--results_folder",
type=Path,
default="C:/Users/oncescu/data/4yp",
)
parser.add_argument(
"--database_name",
type=str,
default='final',
)
args = parser.parse_args()
# connect to database
cnx = mysql.connector.connect(user='root', password=args.sql_password,
host='127.0.0.1',
database=args.database_name)
    # length of the feature vector: the number of words for which tf-idf scores were calculated
    # (their names are stored in NameOfColumns of the tfidf table)
    length = 117
list_of_params, list_of_params_svm = get_parameter_sets(0)
tf.compat.v1.reset_default_graph()
tf.random.set_seed(0)
np.random.seed(0)
random.seed(0)
mycursor = cnx.cursor()
y, X_normalised = labels_and_features(mycursor,
length)
classnames, indices = np.unique(y, return_inverse=True)
counter = 0
X_train_tot, X_test, y_train_tot, y_test = train_test_split(X_normalised,
indices,
test_size=0.2,
random_state=32)
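    # hold out 20% of the data as a fixed test set; the remaining 80% (X_train_tot,
    # y_train_tot) is re-split into train/validation sets on every run of the loop below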
newdir = str(args.results_folder) + "/Results_only_classification_4"
os.mkdir(newdir)
f = open(newdir + "/results.txt", 'w')
print(f'y_test_real is {y_test}', file=f)
print(f'y_test_real count is {Counter(y_test)}', file=f)
print(f'y_trainval_real count is {Counter(y_train_tot)}', file=f)
accuracy_nn_test_list = [] # list of NN test accuracies
accuracy_nn_val_list = [] # list of NN validation accuracies
accuracy_svm_test_list = [] # list of SVM accuracy on test data
accuracy_svm_val_list = [] # list of SVM accuracy values on validation data
minmax_nn = {'max': 0, 'min': 1, 'maxv': 0, 'minv': 1}
conf_matrix_nn = {}
minmax_svm = {'max': 0, 'min': 1, 'maxv': 0, 'minv': 1}
conf_matrix_svm = {}
dict_nn, dict_svm = get_parameters_2(random_bool=False,
list_of_params=list_of_params,
list_of_params_svm=list_of_params_svm,
number_of_tests=0,
file=f)
inc = 1 # increment for setting random seed value and for making sure the split is correct
noclasses = len(classnames)
while counter < 100:
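        # early stopping: halt NN training once the validation loss has not improved for
        # 8 epochs and restore the best weights seen so far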
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
patience=8,
restore_best_weights=True)
[dictXy, counter, inc] = train_val_split_stratify(counter, inc,
X_train_tot,
y_train_tot,
X_test,
y_test)
sgd = optimizers.SGD(lr=dict_nn['learning_rate'],
decay=dict_nn['decay_set'],
nesterov=False) # used to be lr 0.01
minmax_nn, conf_matrix_nn, history = train_nn(noclasses, dict_nn, sgd,
dictXy,
accuracy_nn_test_list,
callback,
accuracy_nn_val_list,
minmax_nn,
conf_matrix_nn,
label_type='NN')
        # skip this run if train_nn returned the same value for all three outputs
        # (presumably signalling that training did not produce usable results)
        if minmax_nn == conf_matrix_nn == history:
            continue
# =============================================================================
minmax_svm, conf_matrix_svm = train_svm(dict_svm, dictXy,
accuracy_svm_test_list,
accuracy_svm_val_list,
minmax_svm,
conf_matrix_svm,
label_type='NN')
# =============================================================================
counter += 1
# =============================================================================
train_nb(X_train_tot, y_train_tot, X_test, y_test, f)
print(".............................", file=f)
# =============================================================================
print_file_test('NN', 'real', f, minmax_nn, conf_matrix_nn,
accuracy_nn_test_list)
print(".............................", file=f)
print_file_val('NN', 'real', f, minmax_nn, conf_matrix_nn,
accuracy_nn_val_list)
print(".............................", file=f)
# =============================================================================
print_file_test('SVM', 'real', f, minmax_svm, conf_matrix_svm,
accuracy_svm_test_list)
print(".............................", file=f)
print_file_val('SVM', 'real', f, minmax_svm, conf_matrix_svm,
accuracy_svm_val_list)
f.close()
# =============================================================================
# plot accuracies for train and validation
# =============================================================================
plt.figure(1) # added line compared to previous laptop
# =============================================================================
plt.plot(history.history['accuracy']) # before it was accuracy/acc in between quotes
plt.plot(history.history['val_accuracy']) # before it was val_accuracy in between quotes)
plt.title('Accuracy on train and validation data')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig(newdir + "/NN_Accuracy_train_val.png", dpi=1200)
# plt.show()
# plot loss for train and validation
# =============================================================================
plt.figure(2) # added line compared to previous laptop
# =============================================================================
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss function value for train and validation data')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.savefig(newdir + "/NN_Loss_train_val.png", dpi=1200)
# plt.show()
# =============================================================================
plt.figure(3) # added line compared to previous laptop
# =============================================================================
plt.plot(accuracy_nn_val_list)
plt.plot(accuracy_nn_test_list)
plt.title('NN accuracy on validation and test data given real labels')
plt.ylabel('Accuracy')
plt.xlabel('Number of runs')
plt.legend(['Validation', 'Test'], loc='upper left')
plt.savefig(newdir + "/NN_Validation_vs_test.png", dpi=1200)
# plt.show()
    plt.figure(4)  # new figure so the SVM curves are not drawn on top of the NN ones in figure 3
    plt.plot(accuracy_svm_val_list)
plt.plot(accuracy_svm_test_list)
plt.title('SVM accuracy on validation and test data')
plt.ylabel('Accuracy')
plt.xlabel('Number of runs')
plt.legend(['Validation', 'Test'], loc='upper left')
plt.savefig(newdir + "/SVM_Validation_vs_test.png", dpi=1200)
# plt.show()
if __name__ == "__main__":
main()