# Bengaluru_house_price_prediction.py
#
# Predicts Bengaluru house prices from the training CSV and compares linear
# regression, linear SVR, SVR and degree-5 polynomial regression.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

try:
    from sklearn import cross_validation
except ImportError:
    # Removed in scikit-learn 0.20; keep the old name bound to its successor.
    cross_validation = model_selection
# Load the training data; the CSV is read from the current working directory.
df = pd.read_csv('Predicting-House-Prices-In-Bengaluru-Train-Data.csv')
# print(df.head())

## ENCODE NON-NUMERIC DATA TO NUMERIC VALUES.
# df.drop(['society'], 1, inplace=True)
# `DataFrame.convert_objects` was removed in pandas 1.0, and the old call threw
# its result away (it never worked in place).  `infer_objects` is the
# supported soft conversion, and its result is kept this time.
df = df.infer_objects()
# Fill missing values with 0 rather than an out-of-range sentinel like -99999.
df.fillna(0, inplace=True)
# print(df.head())
def handle_non_numeric_data(df):
    """Replace every non-numeric column of *df* with integer codes.

    A column is encoded when its dtype is neither ``int64`` nor ``float64``.
    Each distinct value in such a column is given an integer, numbered from 0
    in order of first appearance (e.g. ``['f', 'm', 'f']`` -> ``[0, 1, 0]``).
    Numbering by first appearance, instead of by iterating a ``set``, makes
    the encoding identical on every run regardless of string hash
    randomisation.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same DataFrame object, with the encoded columns replaced.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            # Already numeric: leave the column untouched.
            continue

        values = df[column].tolist()
        codes = {}
        for value in values:
            # First sighting of a value claims the next free integer.
            codes.setdefault(value, len(codes))

        # Rewrite the column with the numeric code of each original value.
        df[column] = [codes[value] for value in values]

    return df
# Encode the remaining text columns as integers so every feature is numeric.
df = handle_non_numeric_data(df)
# print(df.head)
## NON-NUMERIC DATA CLEANED
# df.drop(['location', 'society', 'availability', 'area_type'], 1, inplace=True)

# Features are every column except the target.  `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = np.array(df.drop(columns=['price']))
y = np.array(df['price'])
# y = np.log(y)
X = preprocessing.scale(X)
# df['total_sqft'] = preprocessing.scale(df['total_sqft']) * 100

# Divide into train and test features.  `sklearn.cross_validation` was removed
# in scikit-learn 0.20; `model_selection` provides the same function.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
# NOTE: the former `X[:, 5] = X[:, 5] * 100` that followed the split was
# dropped.  The split arrays are shuffled copies, so rescaling X afterwards
# never reached X_train / X_test and had no effect on any score below.

# Simple Linear Regression
clf1 = LinearRegression()
clf1.fit(X_train, y_train)
accuracy1 = clf1.score(X_test, y_test)  # R^2 on the held-out split
print('linear regression score =', accuracy1)

# Support Vector Machines - Linear SVR
clf2 = svm.LinearSVR()
clf2.fit(X_train, y_train)
accuracy2 = clf2.score(X_test, y_test)
print('SVM.LinearSVR score =', accuracy2)

# Support Vector Machines - SVR
clf3 = svm.SVR()
clf3.fit(X_train, y_train)
accuracy3 = clf3.score(X_test, y_test)
print('SVM.SVR score =', accuracy3)

# Polynomial Linear Regression
# degree 5 is giving good results for full X preprocessing
Poly_reg = PolynomialFeatures(degree=5, interaction_only=False, include_bias=False)
X_poly = Poly_reg.fit_transform(X_train)
# `normalize=False` was the default and the parameter was removed in
# scikit-learn 1.2, so it is no longer passed.
clf = LinearRegression(fit_intercept=True, n_jobs=-1)
clf.fit(X_poly, y_train)
# Reuse the transformer fitted on the training split; never refit on test data.
X_poly_test = Poly_reg.transform(X_test)
# accuracy = clf.score(X_poly_test, y_test)
accuracy = r2_score(y_test, clf.predict(X_poly_test), multioutput='variance_weighted')
print('poly regression score=', accuracy)

# print('Poly_reg', y_test, clf.predict(X_poly_test))
# print('Linear_regression', y_test, clf1.predict(X_train))
# rms1 = mean_squared_error(clf.predict(X_poly_test), y_test)
# print('rms_poly', rms1)
# df1 = pd.read_csv('Predicting-House-Prices-In-Bengaluru-Test-Data.csv')
# print(df1.head)