# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('tagsets')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
import nlpaug.augmenter.word as naw
import contractions
from wordcloud import WordCloud
from collections import Counter
import os
from sklearn.metrics import classification_report  # explicit import instead of a wildcard
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import time
#from xgboost import XGBClassifier
#from xgboost import plot_importance
import scikitplot as skplt
def read_data(x='pseudo_data.xlsx'):
    # the pseudo training workbook keeps its records on a named sheet;
    # every other workbook is read from its first sheet
    if x == 'pseudo_data.xlsx':
        pseudo_data = pd.read_excel("./data/" + x, sheet_name="Defect Data")
    else:
        pseudo_data = pd.read_excel("./data/" + x)
    pseudo_df = pd.DataFrame(pseudo_data)
    return pseudo_df
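# Illustrative usage of read_data (a sketch; assumes the workbooks live in
# ./data as the paths above expect):
#   train_df = read_data()                        # ./data/pseudo_data.xlsx, "Defect Data" sheet
#   test_df = read_data('sample_test_data.xlsx')  # ./data/sample_test_data.xlsx, first sheet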
# part-of-speech (POS) tagging for lemmatisation: map a Penn Treebank tag
# (as returned by nltk.pos_tag) to the matching WordNet constant,
# defaulting to noun
def get_pos(tag):
    if tag[0] == 'N':
        return wordnet.NOUN
    elif tag[0] == 'V':
        return wordnet.VERB
    elif tag[0] == 'R':
        return wordnet.ADV
    elif tag[0] == 'J':
        return wordnet.ADJ
    return wordnet.NOUN
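# Example mappings (standard Penn Treebank tag prefixes, so these follow
# directly from the branches above):
#   get_pos('NNS') -> wordnet.NOUN    get_pos('VBG') -> wordnet.VERB
#   get_pos('RB')  -> wordnet.ADV     get_pos('JJ')  -> wordnet.ADJ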
def processText(sentence):
    # tokenise: keep word characters and dollar amounts such as $12.50
    # (raw string avoids the invalid-escape-sequence warning)
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+')
    tokens = tokenizer.tokenize(sentence.lower())
    # expand contractions
    fixed_tokens = [contractions.fix(word) for word in tokens]
    # remove stopwords
    useful_words = [word for word in fixed_tokens if word not in stopwords.words('english')]
    # POS tagging
    pos_tuple = pos_tag(useful_words)
    # lemmatisation, using the WordNet POS mapped from each Treebank tag
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_pos(tag)) for word, tag in pos_tuple]
    return " ".join(lemmatized_tokens)
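# A rough sketch of what the pipeline produces (the exact output depends on
# the NLTK stopword list and tagger downloaded above, so treat this as an
# assumption rather than a guaranteed result):
#   processText("The pumps were leaking badly")
#   -> "pump leak badly"   # stopwords dropped, tokens lemmatised by POS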
def analyse_data(start_alphabet, end_alphabet):
    # import training dataset
    df = read_data()
    df['Combined Text'] = df[df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    df['Processed Text'] = df['Combined Text'].apply(processText)
    # import test dataset
    # assumption: column letters are upper case and do not go beyond ZZ
    if len(start_alphabet) == 1:
        start_index = ord(start_alphabet) - 65  # ord('A') = 65
    else:  # two characters, e.g. 'AA'
        start_index = ((ord(start_alphabet[0]) - 64) * 26) + ord(start_alphabet[1]) - 65
    if len(end_alphabet) == 1:
        end_index = ord(end_alphabet) - 65
    else:  # two characters, e.g. 'AA'
        end_index = ((ord(end_alphabet[0]) - 64) * 26) + ord(end_alphabet[1]) - 65
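    # Worked examples of the letter-to-index arithmetic above:
    #   'A'  -> 0                       'C'  -> 2
    #   'AA' -> (1 * 26) + 0 = 26       'AZ' -> (1 * 26) + 25 = 51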
    original_test_df = read_data('sample_test_data.xlsx')
    test_df = original_test_df.tail(-1)  # drop the first data row (a second header row)
    # note: iloc's end bound is exclusive, so the end_alphabet column itself
    # is not included; .copy() avoids pandas' SettingWithCopyWarning below
    test_df = test_df.iloc[:, start_index:end_index].copy()
    test_df['Combined Text'] = test_df[test_df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    test_df['Processed Text'] = test_df['Combined Text'].apply(processText)
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(df["Processed Text"], df["Primary Root Cause Classification #3"], test_size=0.2, shuffle=True)
    # TF-IDF vectorisation: fit the vocabulary on the training split only,
    # then reuse it to transform the held-out split and the unseen text
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    to_predict = tfidf_vectorizer.transform(test_df['Processed Text'])
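    # Minimal sketch of the fit/transform asymmetry (toy corpus, not project
    # data; any term the vectoriser never saw during fit is ignored later):
    #   vec = TfidfVectorizer(use_idf=True)
    #   train_mat = vec.fit_transform(["pump leak", "valve stuck"])  # learns 4 terms
    #   test_mat = vec.transform(["pump stuck badly"])               # "badly" is dropped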
    # train SVC model
    print("Training SVC model")
    # parameter tuning using grid search and stratified k-fold cross validation for SVC
    parameters = {'kernel': ('linear', 'poly', 'rbf'), 'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1], 'degree': [1, 2, 3, 4]}
    grid_svc = GridSearchCV(SVC(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_svc.fit(X_train_tfidf, y_train)
    #print(grid_svc.best_estimator_)
    # refit the best SVC model and predict
    print("Predicting root causes using SVC model")
    param_dict = grid_svc.best_params_
    svc = SVC(kernel=param_dict['kernel'], C=param_dict['C'], gamma=param_dict['gamma'], degree=param_dict['degree'], decision_function_shape='ovr')
    svc.fit(X_train_tfidf, y_train)
    svc_y_pred = svc.predict(X_test_tfidf)
    svc_pred_on_test = svc.predict(to_predict)
    svc_cr = classification_report(y_test, svc_y_pred, output_dict=True)
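    # With output_dict=True, classification_report returns a nested dict; the
    # 'weighted avg' entry used below holds 'precision', 'recall', 'f1-score'
    # and 'support', e.g.:
    #   svc_cr['weighted avg']
    #   # -> {'precision': ..., 'recall': ..., 'f1-score': ..., 'support': ...}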
    # train Naive Bayes model
    print("Training Naive Bayes model")
    # parameter tuning using grid search and stratified k-fold cross validation for Naive Bayes
    parameters = {'alpha': [0.01, 0.1, 0.5, 1, 2, 5]}  # smoothing parameter
    grid_mnb = GridSearchCV(MultinomialNB(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_mnb.fit(X_train_tfidf, y_train)
    # refit the best model and predict
    print("Predicting root causes using Naive Bayes model")
    param_dict = grid_mnb.best_params_
    mnb = MultinomialNB(alpha=param_dict['alpha'])
    mnb.fit(X_train_tfidf, y_train)
    mnb_y_pred = mnb.predict(X_test_tfidf)
    mnb_pred_on_test = mnb.predict(to_predict)
    mnb_cr = classification_report(y_test, mnb_y_pred, output_dict=True)
    # train Multinomial Logistic Regression model
    print("Training Multinomial Logistic Regression model")
    # parameter tuning using grid search and stratified k-fold cross validation for Multinomial Logistic Regression
    # note: C must be strictly positive for LogisticRegression, so 0.0 is not a valid grid value
    parameters = {'multi_class': ['multinomial'], 'solver': ['lbfgs'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0], 'penalty': ['l2']}  # C: inverse regularisation strength
    grid_mlr = GridSearchCV(LogisticRegression(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_mlr.fit(X_train_tfidf, y_train)
    # refit the best model and predict
    param_dict = grid_mlr.best_params_
    mlr = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=param_dict['C'], penalty=param_dict['penalty'])
    mlr.fit(X_train_tfidf, y_train)
    mlr_y_pred = mlr.predict(X_test_tfidf)
    mlr_pred_on_test = mlr.predict(to_predict)
    mlr_cr = classification_report(y_test, mlr_y_pred, output_dict=True)
    all_values = [list(svc_cr['weighted avg'].values()), list(mnb_cr['weighted avg'].values()), list(mlr_cr['weighted avg'].values())]
    temp = pd.DataFrame(all_values, index=["SVC", "Multinomial Naive Bayes", "Multinomial Logistic Regression"], columns=list(mlr_cr['weighted avg'].keys()))
    print('Results:')
    print(temp)
    # align predictions with the rows that were actually scored (the first row
    # of original_test_df was dropped above, so plain column assignment would
    # fail on a length mismatch)
    original_test_df.loc[test_df.index, 'SVC_Prediction'] = svc_pred_on_test
    original_test_df.loc[test_df.index, 'MNB_Prediction'] = mnb_pred_on_test
    original_test_df.loc[test_df.index, 'MLR_Prediction'] = mlr_pred_on_test
    # export results as CSV to be displayed on Power BI
    original_test_df.to_csv('./data/prediction_results.csv')
    # returns every model's weighted-average metrics as a DataFrame
    return temp
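# Illustrative call (the column letters here are placeholders; pick whichever
# columns hold the free-text fields in sample_test_data.xlsx):
#   results = analyse_data('C', 'K')
#   print(results)  # one row of weighted-avg precision/recall/F1/support per model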
if __name__ == '__main__':
    # import training dataset
    df = read_data()
    df['Combined Text'] = df[df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    df['Processed Text'] = df['Combined Text'].apply(processText)
    # import test dataset
    test_df = read_data('temp_train.xlsx')
    test_df = test_df.tail(-1)  # drop the first data row (a second header row)
    test_df['Combined Text'] = test_df[test_df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    test_df['Processed Text'] = test_df['Combined Text'].apply(processText)
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(df["Processed Text"], df["Primary Root Cause Classification #3"], test_size=0.2, shuffle=True)
    # TF-IDF vectorisation
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    # train SVC model
    print("Training SVC model")
    # parameter tuning using grid search and stratified k-fold cross validation for SVC
    parameters = {'kernel': ('linear', 'poly', 'rbf'), 'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1], 'degree': [1, 2, 3, 4]}
    grid_svc = GridSearchCV(SVC(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_svc.fit(X_train_tfidf, y_train)
    #print(grid_svc.best_estimator_)
    # refit the best SVC model and predict
    print("Predicting root causes using SVC model")
    param_dict = grid_svc.best_params_
    svc = SVC(kernel=param_dict['kernel'], C=param_dict['C'], gamma=param_dict['gamma'], degree=param_dict['degree'], decision_function_shape='ovr')
    svc.fit(X_train_tfidf, y_train)
    svc_y_pred = svc.predict(X_test_tfidf)
    svc_cr = classification_report(y_test, svc_y_pred, output_dict=True)
    # train Naive Bayes model
    print("Training Naive Bayes model")
    # parameter tuning using grid search and stratified k-fold cross validation for Naive Bayes
    parameters = {'alpha': [0.01, 0.1, 0.5, 1, 2, 5]}  # smoothing parameter
    grid_mnb = GridSearchCV(MultinomialNB(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_mnb.fit(X_train_tfidf, y_train)
    # refit the best model and predict
    print("Predicting root causes using Naive Bayes model")
    param_dict = grid_mnb.best_params_
    mnb = MultinomialNB(alpha=param_dict['alpha'])
    mnb.fit(X_train_tfidf, y_train)
    mnb_y_pred = mnb.predict(X_test_tfidf)
    mnb_cr = classification_report(y_test, mnb_y_pred, output_dict=True)
    # train Multinomial Logistic Regression model
    print("Training Multinomial Logistic Regression model")
    # parameter tuning using grid search and stratified k-fold cross validation for Multinomial Logistic Regression
    # note: C must be strictly positive for LogisticRegression, so 0.0 is not a valid grid value
    parameters = {'multi_class': ['multinomial'], 'solver': ['lbfgs'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0], 'penalty': ['l2']}  # C: inverse regularisation strength
    grid_mlr = GridSearchCV(LogisticRegression(), parameters, refit=True, cv=10, verbose=3)  # cv: number of folds in a StratifiedKFold cross validation
    grid_mlr.fit(X_train_tfidf, y_train)
    # refit the best model and predict
    param_dict = grid_mlr.best_params_
    mlr = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=param_dict['C'], penalty=param_dict['penalty'])
    mlr.fit(X_train_tfidf, y_train)
    mlr_y_pred = mlr.predict(X_test_tfidf)
    mlr_cr = classification_report(y_test, mlr_y_pred, output_dict=True)
    all_values = [list(svc_cr['weighted avg'].values()), list(mnb_cr['weighted avg'].values()), list(mlr_cr['weighted avg'].values())]
    temp = pd.DataFrame(all_values, index=["SVC", "Multinomial Naive Bayes", "Multinomial Logistic Regression"], columns=list(mlr_cr['weighted avg'].keys()))
    print(temp)
    print('done with running model.py file')