-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanomaly_detection.py
323 lines (259 loc) · 13.9 KB
/
anomaly_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
'''
Credit-Card-Anomaly- Detection
Date:- 4/Aug/2019
Programming Language/Algorithm: Python/SkLearn Classifier
Author:- Suraj Dakua
'''
#import libraries
import numpy as np #linear algebra library
import pandas as pd #to read the csv/sqlite data as dataframes.
import time
import matplotlib.pyplot as plt
import seaborn as sns #used for data visualization
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE #dimensionality reduction
import warnings
warnings.filterwarnings('ignore')
#import classifier you want to use for the problem statement I choose SVM
from sklearn.model_selection import train_test_split #split train test data into 75% and 25%.
from imblearn.over_sampling import SMOTE #over sampling algorithm
from imblearn.under_sampling import NearMiss # this algorithm is used to split the class in 50/50
from sklearn.metrics import accuracy_score, precision_score, classification_report, roc_auc_score, recall_score, f1_score
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import KFold, StratifiedKFold #KFold used for cross-validation
#import the CSV file for the anomaly detection.
dataframe = pd.read_csv('/home/ml_rejolut/Desktop/Credit-Card-Detection/creditcard.csv')
dataframe.head() #returns the top rows of the dataframe
dataframe.describe() #returns the descriptive statistics of the table
#check for null values in the dataframe
dataframe.isnull().sum().max() #takes array-like object and finds whether the array is empty.
#print the columns in the CSV file
#dataframe.columns()
#print the fraud data items and not-fraud data items.
print('Successfull transactions', round(dataframe['Class'].value_counts()[0]/len(dataframe) * 100, 2), '% of the dataset')
print('Fraud Cases', round(dataframe['Class'].value_counts()[1]/len(dataframe)* 100,2), '% of the dataset') #round upto 2 decimals.
#as most of the transactions are not fraudulent so our model will predict every transaction as successfull transaction
#I dont want my model to work on assumptions I want it to recognize patterns of fraud transactions
#so letss first visualize the graph of fraud vs no fraud transactions with seaborn.
colors = ['#B22222','#0000FF'] #Note here the hash indicates this value is in hexadecimal.
#sns.countplot('Class', data=dataframe, palette=colors)
#plt.title('Visualize the transactions \n (0:Successed Transactions || 1:Fraud Transactions)', fontsize = 14)
#we scale the values of time and amount using sklearn preprocessing.
from sklearn.preprocessing import StandardScaler, RobustScaler
std_scaler = StandardScaler()
rob_scaler = RobustScaler()
#outliers are known as deviations from the rest objects.
#we use robust scaler because it is less prone to outliers.
dataframe['scaled_time'] = rob_scaler.fit_transform(dataframe['Time'].values.reshape(-1,1))
#reshape(-1,1) reshapes the array to single column.
#reshape(1,-1) this will reshapoe the row of the array.
dataframe['scaled_amount'] = rob_scaler.fit_transform(dataframe['Amount'].values.reshape(-1,1))
#axis = 1 or columns, 0 or index
#inplace = True means do the operation orelse return none.
dataframe.drop(['Time','Amount'], axis = 1, inplace = True)
#scaling is a method in machine learning use to normalize the independent variables or feature of the data.
scaled_time = dataframe['scaled_time']
scaled_amount = dataframe['scaled_amount']
dataframe.drop(['scaled_time','scaled_amount'], axis = 1, inplace=True)
dataframe.insert(0,'scaled_time', scaled_time)
dataframe.insert(1,'scaled_amount', scaled_amount)
X = dataframe.drop('Class', axis=1)
y = dataframe['Class']
#split the original dataframe into equal fraud and no fraud data.
stratified_fold = StratifiedKFold(n_splits=5, random_state=None,shuffle=False)
for train_index, test_index in stratified_fold.split(X, y):
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]
#turn the values into array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)
print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))
#suffle the data before creating the subsamples.
dataframe = dataframe.sample(frac=1)
# amount of fraud classes 492 rows.
fraud_df = dataframe.loc[dataframe['Class'] == 1]
non_fraud_df = dataframe.loc[dataframe['Class'] == 0][:492]
normal_distributed_df = pd.concat([non_fraud_df, fraud_df ])
# Shuffle dataframe rows
new_dataframe = normal_distributed_df.sample(frac=1, random_state=42)
new_dataframe.head()
print(new_dataframe['Class'].value_counts()/len(new_dataframe))
#now we have our data equally distributed lets visualise the data using seaborn
sns.countplot('Class', data=new_dataframe, palette=colors)
plt.title('Equally distributed classes.', fontsize = 14)
plt.show()
v14_fraud = new_dataframe['V14'].loc[new_dataframe['Class'] == 1].values
q25, q75 = np.percentile(v14_fraud, 25), np.percentile(v14_fraud, 75)
print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
v14_iqr = q75 - q25
print('iqr: {}'.format(v14_iqr))
v14_cut_off = v14_iqr * 1.5
v14_lower, v14_upper = q25 - v14_cut_off, q75 + v14_cut_off
print('Cut Off: {}'.format(v14_cut_off))
print('V14 Lower: {}'.format(v14_lower))
print('V14 Upper: {}'.format(v14_upper))
outliers = [x for x in v14_fraud if x < v14_lower or x > v14_upper]
print('Feature V14 Outliers for Fraud Cases: {}'.format(len(outliers)))
print('V10 outliers:{}'.format(outliers))
new_dataframe = new_dataframe.drop(new_dataframe[(new_dataframe['V14'] > v14_upper) | (new_dataframe['V14'] < v14_lower)].index)
print('----' * 44)
# -----> V12 removing outliers from fraud transactions
v12_fraud = new_dataframe['V12'].loc[new_dataframe['Class'] == 1].values
q25, q75 = np.percentile(v12_fraud, 25), np.percentile(v12_fraud, 75)
v12_iqr = q75 - q25
v12_cut_off = v12_iqr * 1.5
v12_lower, v12_upper = q25 - v12_cut_off, q75 + v12_cut_off
print('V12 Lower: {}'.format(v12_lower))
print('V12 Upper: {}'.format(v12_upper))
outliers = [x for x in v12_fraud if x < v12_lower or x > v12_upper]
print('V12 outliers: {}'.format(outliers))
print('Feature V12 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_dataframe = new_dataframe.drop(new_dataframe[(new_dataframe['V12'] > v12_upper) | (new_dataframe['V12'] < v12_lower)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_dataframe)))
print('----' * 44)
# Removing outliers V10 Feature
v10_fraud = new_dataframe['V10'].loc[new_dataframe['Class'] == 1].values
q25, q75 = np.percentile(v10_fraud, 25), np.percentile(v10_fraud, 75)
v10_iqr = q75 - q25
v10_cut_off = v10_iqr * 1.5
v10_lower, v10_upper = q25 - v10_cut_off, q75 + v10_cut_off
print('V10 Lower: {}'.format(v10_lower))
print('V10 Upper: {}'.format(v10_upper))
outliers = [x for x in v10_fraud if x < v10_lower or x > v10_upper]
print('V10 outliers: {}'.format(outliers))
print('Feature V10 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_df = new_dataframe.drop(new_dataframe[(new_dataframe['V10'] > v10_upper) | (new_dataframe['V10'] < v10_lower)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_df)))
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color='#0A0AFF', label='No Fraud')
red_patch = mpatches.Patch(color='#AF0000', label='Fraud')
X = new_df.drop('Class', axis=1)
y = new_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values
from sklearn.linear_model import LogisticRegression
classifiers = {
"LogisiticRegression": LogisticRegression(),
}
from sklearn.model_selection import cross_val_score
for key, classifier in classifiers.items():
classifier.fit(X_train, y_train)
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV
# Logistic Regression
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# We automatically get the logistic regression with the best parameters.
log_reg = grid_log_reg.best_estimator_
# Overfitting Case
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
print('Logistic Regression Cross Validation Score: ', round(log_reg_score.mean() * 100, 2).astype(str) + '%')
# Let's Plot LogisticRegression Learning Curve
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Create a DataFrame with all the scores and the classifiers names.
log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5, method="decision_function")
print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
def logistic_roc_curve(log_fpr, log_tpr):
plt.figure(figsize=(12,8))
plt.title('Logistic Regression ROC Curve', fontsize=16)
plt.plot(log_fpr, log_tpr, 'b-', linewidth=2)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.axis([-0.01,1,0,1])
logistic_roc_curve(log_fpr, log_tpr)
plt.show()
y_pred = log_reg.predict(X_train)
undersample_y_score = log_reg.decision_function(original_Xtest)
from sklearn.metrics import average_precision_score
undersample_average_precision = average_precision_score(original_ytest, undersample_y_score)
print('Average precision-recall score: {0:0.2f}'.format(
undersample_average_precision))
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,6))
precision, recall, _ = precision_recall_curve(original_ytest, undersample_y_score)
plt.step(recall, precision, color='#004a93', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='#48a6ff')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('UnderSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(undersample_average_precision), fontsize=16)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))
# List to append the score and then find the average
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []
# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)
# Implementing SMOTE Technique
# Cross Validating the right way
# Parameters
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
for train, test in stratified_fold.split(original_Xtrain, original_ytrain):
pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg) # SMOTE happens during Cross Validation not before..
model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
best_est = rand_log_reg.best_estimator_
prediction = best_est.predict(original_Xtrain[test])
accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
precision_lst.append(precision_score(original_ytrain[test], prediction))
recall_lst.append(recall_score(original_ytrain[test], prediction))
f1_lst.append(f1_score(original_ytrain[test], prediction))
auc_lst.append(roc_auc_score(original_ytrain[test], prediction))
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)
labels = ['No Fraud', 'Fraud']
smote_prediction = best_est.predict(original_Xtest)
print(classification_report(original_ytest, smote_prediction, target_names=labels))
sm = SMOTE(ratio='minority', random_state=42)
# This will be the data were we are going to
Xsm_train, ysm_train = sm.fit_sample(original_Xtrain, original_ytrain)
# We Improve the score by 2% points approximately
# Implement GridSearchCV and the other models
# Logistic Regression
t0 = time.time()
log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm.fit(Xsm_train, ysm_train)
t1 = time.time()
from sklearn.metrics import confusion_matrix
# Logistic Regression fitted using SMOTE technique
y_pred_log_reg = log_reg_sm.predict(X_test)
log_reg_cf = confusion_matrix(y_test, y_pred_log_reg)
fig, ax = plt.subplots(2, 2,figsize=(22,12))
sns.heatmap(log_reg_cf, ax=ax[0][0], annot=True, cmap=plt.cm.copper)
ax[0, 0].set_title("Logistic Regression Confusion Matrix", fontsize=14)
ax[0, 0].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[0, 0].set_yticklabels(['', ''], fontsize=14, rotation=360)
plt.show()
from sklearn.metrics import classification_report
print('Logistic Regression:')
print(classification_report(y_test, y_pred_log_reg))