#!/usr/bin/env python
# coding: utf-8
# # Linear Regression Analysis - Actual vs Predicted Observations
#
# > About the project - Using this car data set, I want to understand the relationship between a vehicle's main specifications and its mileage. We know that engine cc, BHP, and torque each influence mileage individually, but the mileage figure cannot be estimated from engine cc or any other single specification alone (simple regression).
# However, if we treat the main specifications together as the independent variables and mileage as the dependent variable, this analysis lets us estimate the mileage figures.
#
# ## Import DataSet
# In[1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')
# ## Reading DataSet and Data Preprocessing
# In[2]:
df = pd.read_csv("C:\\Users\\Manikanta\\Downloads\\Car details v3.csv")
# In[3]:
df = df[['name','mileage','engine','max_power','torque']] ## choosing columns which are required for analysis.
# In[4]:
df.head() # the name column is not needed for modelling; however, it is helpful for cross-checking during data cleansing.
# In[5]:
df.info() # 8128 rows; shows the data type of each column
# In[6]:
df.isna().sum() #no.of null values per column
# In[7]:
#Note - drop a row only if "all" of the selected columns are missing...
df.dropna(axis='index',how='all',subset=['mileage','engine','max_power','torque'],inplace=True)
# In[8]:
df.isnull().sum()
# In[9]:
df.duplicated().sum() # number of duplicated rows in the dataset
# In[10]:
#note - drop duplicate rows now; duplicates are removed again later, after the name column is dropped.
df.drop_duplicates(ignore_index=True,inplace=True)
# In[11]:
#these replacements fix inconsistent and mistyped torque entries up front; the torque-cleaning cells below rely on them.
df['torque'].replace({'380Nm(38.7kgm)@ 2500rpm':'380Nm@ 1750-2500rpm'},inplace=True)
#________________________________________________________________________________
df.torque.replace({'190@ 21,800(kgm@ rpm)':'223Nm@'},inplace=True)
df.torque.replace({'145@ 4,100(kgm@ rpm)':'145Nm@'},inplace=True)
df.torque.replace({'110@ 3,000(kgm@ rpm)':'110Nm@'},inplace=True)
df.torque.replace({'130@ 2500(kgm@ rpm)':'130Nm@'},inplace=True)
df.torque.replace({'115@ 2,500(kgm@ rpm)':'115Nm@'},inplace=True)
df.torque.replace({'115@ 2500(kgm@ rpm)':'115Nm@'},inplace=True)
#_______________________________________________________________________________
df.torque.replace({'14.9 KGM at 3000 RPM':'14.9kgm'},inplace=True)
df.torque.replace({'24 KGM at 1900-2750 RPM':'24kgm'},inplace=True)
# In[12]:
#Note - 1. the values contain unit strings, so step 1 is to split off and drop the string part.
#Note - 2. then convert the column from object to float.
df['mileage'] = df['mileage'].str.split().str.get(0).astype(float)
# In[13]:
#Note - 1. the values contain unit strings, so step 1 is to split off and drop the string part.
#Note - 2. then convert the column from object to float.
df['engine'] = df['engine'].str.split().str.get(0).astype(float)
# In[14]:
#Note - 1. split off the unit string here; conversion to float happens after fixing the 'bhp' entries below.
df['max_power'] =df['max_power'].str.split().str.get(0)
# In[15]:
#here a data entry mistake shows up: rows where max_power is just 'bhp' with no number
df[df['max_power']=='bhp']
# In[16]:
#step-1 replace with nan values then....
df.max_power.replace('bhp',np.nan,inplace=True)
# In[17]:
#step-2 converting into float type
df['max_power'] = df['max_power'].astype(float)
# In[18]:
#here we can see how the torque values mix numbers with unit strings and special characters.
df['torque'].unique()
# In[19]:
#note 1 - strip all unit strings and the @ special character from the values
#note 2 - convert to float
df['Torque'] = (df.torque.str.split().str.get(0)
                  .str.split('Nm').str.get(0)
                  .str.split('Kgm').str.get(0)
                  .str.split('kgm').str.get(0)
                  .str.split('nm').str.get(0)
                  .str.split('NM').str.get(0)
                  .str.replace(r'[\@\,]', '', regex=True)
                  .replace({'110(11.2)': '110'})
                  .astype(float))
# In[20]:
df.info() #data types are correct now; however, the units still differ....
# # ___
#
# #### The torque column contains values in two different units:
#
# 1. Nm values
# 2. kg-m values
#
# ### Nm = kgm * 9.80665 (for more, see the image below)
#
#
#
# #### Therefore, the kg-m values need to be converted to Nm.
#
# In[21]:
from urllib.request import urlretrieve
urlretrieve('https://th.bing.com/th/id/R.4c321b88fc80b86880bb8662881c13fa?rik=FywKvPuufxs3wA&riu=http%3a%2f%2fwww.unipulse.tokyo%2fen%2fwp-content%2fuploads%2f2018%2f02%2fNm_kgm.gif&ehk=%2bC4wG%2fZtU57DZDu3cqH4XHsclceIyvMOZw564ayZUi4%3d&risl=&pid=ImgRaw&r=0','details.jpg')
# In[22]:
from PIL import Image
torque_det = Image.open('details.jpg')
plt.figure(figsize=(20,10))
plt.grid(False)
plt.axis(False)
plt.imshow(torque_det);
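# As a quick sanity check of the conversion factor, a single kg-m value can be converted by
# hand (a minimal sketch; 14.9 kgm is just one of the values seen in the raw torque strings):
sample_kgm = 14.9                      # torque in kg-m
sample_nm = sample_kgm * 9.80665       # the same torque expressed in Nm
print('{:.1f} kgm ~= {:.1f} Nm'.format(sample_kgm, sample_nm))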
# In[23]:
# step 1 - there are now two torque columns in the data; the unmodified "torque" column is used here.
# The rows are flagged with a boolean: False for Nm values and True for kgm values. Note that the raw
# kgm strings appeared in both upper and lower case, and a few entries were mistakenly typed as kgm
# when they were really Nm - that is what the replacements back at In[11] fixed.
add = pd.DataFrame({'bool':df['torque'].str.contains('kgm')})
# In[24]:
df = pd.concat((df,add),axis=1)
#step 2 - concatenate the flag onto the dataframe: rows whose torque is in Nm show 'False' and
#rows whose torque is in kgm show 'True'.
df.head()
# In[25]:
#then split the data into the True (kgm) and False (Nm) parts; after converting, the two parts are
#concatenated back together. (.copy() avoids pandas' SettingWithCopy warning when n_Torque is added below.)
a = df[df['bool']==True].copy()
# In[26]:
b = df[df['bool']==False]
# In[27]:
#create a column holding the kgm values converted to Nm
a['n_Torque'] = a.Torque* 9.80665
# In[28]:
#then select required columns
a1 = a[['name','mileage','engine','max_power','n_Torque']]
# In[29]:
#keep the same columns as the a1 part so the two can be concatenated again
b1 = b.drop(columns=['torque','bool']).rename({'Torque':'n_Torque'},axis=1)
# In[30]:
car_df =pd.concat((a1,b1),axis=0,ignore_index=True)
# In[31]:
car_df.reset_index(inplace=True)
# In[32]:
#dropping unnecessary columns a second time
car_df.drop(columns=['name','index'],inplace=True)
# In[33]:
#only the numeric figures matter for this analysis, so duplicate rows have to be removed again (in place on car_df)
car_df.drop_duplicates(inplace=True, ignore_index=True)
# In[34]:
#describe() shows the summary statistics - note the minimum of mileage (it is 0)
car_df.describe()
# In[35]:
#a mileage of 0 is a data entry mistake, so replace 0 with NaN
car_df.mileage.replace(0,np.nan,inplace=True)
# In[36]:
car_df.isnull().sum()
# In[37]:
# why fill the null values with the help of the engine column?
# rows were removed only when all columns were null, but here only the mileage column is null.
# instead of filling with the overall mean or median of mileage, use the median within each engine-cc
# group, which gives more meaningful values because mileage is not the same for every engine size.
car_df['mileage'] = car_df.groupby('engine')['mileage'].transform(lambda x: x.fillna(x.median()))
# In[38]:
car_df.isnull().sum()
# In[39]:
car_df.dropna(inplace=True)
# ## Skewness and Outliers
# In[40]:
for i in ['engine','max_power','n_Torque','mileage']:
    print(i)
    print('skewness : {:.2f}'.format(car_df[i].skew()))
    print()
    plt.figure(figsize=(15,5))
    plt.subplot(1,2,1)
    car_df[i].hist(grid=False)
    plt.subplot(1,2,2)
    sns.boxplot(x=car_df[i])
    plt.show()
# ## Multivariate Analysis
# In[41]:
plt.figure(figsize=(10,7))
sns.heatmap(car_df.corr(),vmax=1,vmin=-1,annot=True);
#here you can see that mileage has a negative correlation coefficient with every other specification.
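# The r values quoted in the observations below can also be read straight off the correlation
# matrix - a minimal sketch using the same car_df:
print(car_df.corr()['mileage'].sort_values())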
# ## Observations :
#
# 1. The engine (independent variable) influences the mileage (dependent variable) the most, r = -0.61, compared with the other independent variables, torque and max power.
#
# 2. After the engine variable, max power is the next most influential variable, r = -0.4.
#
# 3. Along with this, there is a strong relationship between engine and torque, and similarly between torque and max power, r = 0.84.
#
# #### With the help of the heatmap we get the "R value".
#
# #### Let's see the regression direction in the pair plots.
#
# # ____
# In[42]:
#note - this pair plot can take a few seconds to render...
sct = sns.pairplot(car_df,kind='kde')
sct.map_lower(sns.kdeplot,levels=4,color='.2');
# ## Linear Regression Model
# ### Define x and y
# In[43]:
x_var = car_df.drop(columns=['mileage']).values
y_var = car_df.mileage.values
# ### Splitting the data set into the training set and test set
# In[44]:
x_train,x_test,y_train,y_test = train_test_split(x_var,y_var,test_size=0.20,random_state=0)
# ### Train the model on the Training set
# In[45]:
ml = LinearRegression()
ml.fit(x_train,y_train)
# ### Predict the test set results
# In[46]:
y_pred = ml.predict(x_test)
# ### Evaluate the Model
# In[47]:
coef_of_det = r2_score(y_test,y_pred)
print('The coefficient of determination is {:.2f}'.format(coef_of_det))
print()
print('The root mean squared error is {:.2f}'.format(np.sqrt(mean_squared_error(y_test,y_pred))))
print()
print('The mean absolute percentage error is {:.2f}'.format(mean_absolute_percentage_error(y_test,y_pred)))
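# A minimal sketch of the fitted coefficients - assuming the feature order is simply the remaining
# columns of car_df after dropping mileage, which is how x_var was built above - to see how each
# specification shifts the predicted mileage:
feature_names = car_df.drop(columns=['mileage']).columns
for name, coef in zip(feature_names, ml.coef_):
    print('{:>10s} : {: .4f}'.format(name, coef))
print('{:>10s} : {: .4f}'.format('intercept', ml.intercept_))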
# ### Plot the results
# In[48]:
new_df = pd.DataFrame({'actual_y':y_test,'predicted_y':y_pred,'residuals':y_test-y_pred})
# In[49]:
plt.figure(figsize=(10,10))
plt.title('Correlation between actual and predicted')
sns.scatterplot(x=new_df.actual_y,y=new_df.predicted_y)
print('the correlation coefficient r value is {:.2f}'.format(new_df.actual_y.corr(new_df.predicted_y)))
# ### Residual density plot
# In[50]:
plt.figure(figsize=(10,5))
new_df.residuals.plot(kind='kde');
# ### Observations -
#
# #### The residual density plot is roughly normally distributed. However, as seen above, the data contains outliers and skewed distributions, so the left and right tails of the residual plot stretch out to about (-20, 20).
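# A minimal sketch quantifying the observation above, assuming new_df from In[48] is still in scope:
# the skewness and range of the residuals back up the long tails seen in the density plot.
print('residual skewness : {:.2f}'.format(new_df.residuals.skew()))
print('residual range    : ({:.1f}, {:.1f})'.format(new_df.residuals.min(), new_df.residuals.max()))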