#Matt Brown email: matthew.brown.iowa@gmail.com
#Location: Washington, DC
#Website: www.DrivenData.org
#Competition: Pump it Up: Data Mining the Water Table
#Solution is 9th out of 2017 participants
#Software Tools: XGBoost package in R
#Brief Model Description: Ensemble of 11 XGBoost models with equal weight to each solution
#Feature Selection
#The original data set contained 40 variables. I reduced it to 26 variables by removing variables
#that were similar to or duplicates of other variables.
#After removing duplicates, I used the xgb.importance function to identify variables that did not
#improve the model, then used trial and error: removing a variable and re-testing to see how the model performed.
#I also modified some variables to reduce the effects of missing data. Specific modifications and
#explanations are shown in the 'Feature Selection' section of the code.
#Load required libraries
library(xgboost)
library(Matrix)
library(MatrixModels)
library(data.table)
#Read in test dataset
test<-read.csv("C:/Users/580010/Desktop/Kaggle/DrivenData -Pump It Up/test.csv")
#Create new column in test
test$status_group <- 0
#Read in train and label datasets
train<-read.csv("C:/Users/580010/Desktop/Kaggle/DrivenData -Pump It Up/train.csv")
label<-read.csv("C:/Users/580010/Desktop/Kaggle/DrivenData -Pump It Up/TrainingLabels.csv")
#Subset label so that it only contains the label (target variable)
label <- subset(label, select = status_group )
#Combine the train and the label data sets
train<-cbind(train,label)
#Reset the status_group column to 0 so test and train have the same number of columns.
#Required when building the model; the actual labels are kept separately in the label data frame
train$status_group<-0
#Flag each row as belonging to the train or the test set
train$tst <- 0
test$tst <- 1
#Combine train and test into one dataset
data<- rbind(train,test)
#######Feature Engineering###########
#Changed date_recorded to the Date variable type
data$date_recorded<-as.Date(data$date_recorded)
#Set region_code and district_code as factors
data$region_code<-factor(data$region_code)
data$district_code<-factor(data$district_code)
#Modified construction year so that it starts at 0 and counts up
#E.g. 1960=0, 1961=1, etc.
min_year<-1960
data$construction_year<-data$construction_year-min_year
#Set missing construction_year values (originally 0, now negative after subtracting min_year) to the median of the known values
data$construction_year[data$construction_year<0]= median(data$construction_year[data$construction_year>0])
#Set missing gps_height values to the median of gps_height
data$gps_height[data$gps_height==0]=median(data$gps_height[data$gps_height>0])
#data$amount_tsh[data$amount_tsh==0]<-median(data$amount_tsh[data$amount_tsh>0])
#######Feature Selection#######
#Removed duplicate/similar variables and variables which did not improve the model by setting them to NULL.
#Not sure what num_private is, and it didn't improve the model.
data$num_private<-NULL
#Removed because only one unique value
data$recorded_by<-NULL
#Removed because there were too many unique values
data$wpt_name<-NULL
#Removed because both are similar to extraction_type_class
data$extraction_type_group<-NULL
data$extraction_type<-NULL
#Removed because similar to payment
data$payment_type<-NULL
#Removed because similar to quality_group.
data$water_quality<-NULL
#data$basin<-NULL
data$scheme_management<-NULL
#Removed because all are location variables. Long and lat should be sufficient to account for
#location.
data$district_code<-NULL
data$region<-NULL
data$region_code<-NULL
data$subvillage<-NULL
data$ward<- NULL
#Removed because similar to waterpoint_type
data$waterpoint_type_group<-NULL
#Removed because duplicate of quantity
data$quantity_group<-NULL
#Removed because there were too many unique values. Another option would be to group the installers to
#reduce the number of unique values (a sketch of that option follows below)
data$installer<-NULL
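#If installer were kept, one possible approach (a sketch, not used in this solution) would be to keep only
#the most frequent installers and lump the rest into an "other" level. The cutoff of 20 below is an
#arbitrary illustration. Left commented out because installer has already been dropped above.
#top_installers <- names(sort(table(data$installer), decreasing = TRUE))[1:20]
#data$installer <- as.character(data$installer)
#data$installer[!(data$installer %in% top_installers)] <- "other"
#data$installer <- factor(data$installer)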
#Separate data into train and test set
data_train <- data[data$tst==0,]
data_test <- data[data$tst==1,]
#Create a test set that doesn't contain the id column. I did this because the test and train
#datasets need to have the same number of columns when making predictions.
data_test.noID<-subset(data_test, select = -id)
#Remove the id and status_group columns from the train dataset. I don't want these columns
#to affect the model
data_train<-subset(data_train, select = c(-id,-status_group))
#Convert data frames to numeric matrices. xgboost requires the data as a numeric matrix.
#Note that as.numeric() on a factor column returns its integer level codes, which is how the categorical
#variables are encoded here
data_test.noID <- as.matrix(as.data.frame(lapply(data_test.noID, as.numeric)))
data_train <- as.matrix(as.data.frame(lapply(data_train, as.numeric)))
label<-as.numeric(label$status_group)
#Create an xgb.DMatrix, which is the preferred format for building an xgboost model
train.DMatrix <- xgb.DMatrix(data = data_train,label = label, missing = NA)
#For loop to run the model 11 times with different random seeds. Using an ensemble technique such as this
#improved the model performance
#Set i=2 because the first column of the solutions data frame is reserved for the id variable
i=2
#Create data frame to hold the 11 solutions developed by the model
solution.table<-data.frame(id=data_test[,"id"])
for (i in 2:12){
#Set seed so that the results are reproducible
set.seed(i)
#Cross validation to determine the number of iterations to run the model.
#I tested this model with a variety of parameters to find the most accurate model
xgb.tab = xgb.cv(data = train.DMatrix, objective = "multi:softmax", booster = "gbtree",
                 nrounds = 500, nfold = 4, early_stopping_rounds = 10, num_class = 4, maximize = FALSE,
                 eval_metric = "merror", eta = .2, max_depth = 12, colsample_bytree = .4)
#Create variable that identifies the optimal number of iterations for the model
min.error.idx = which.min(xgb.tab$evaluation_log$test_merror_mean)
#Create the model with the parameters used in xgb.cv (except max_depth, which is 14 here rather than 12)
model <- xgboost(data = train.DMatrix, objective = "multi:softmax", booster = "gbtree",
                 eval_metric = "merror", nrounds = min.error.idx,
                 num_class = 4, eta = .2, max_depth = 14, colsample_bytree = .4)
#Predict. Used the data_test.noID because it contained the same number of columns as the train.DMatrix
#used to build the model.
predict <- predict(model,data_test.noID)
#Modify prediction labels to match submission format
predict[predict==1]<-"functional"
predict[predict==2]<-"functional needs repair"
predict[predict==3]<-"non functional"
#View prediction
table(predict)
#Add the solution to column i of the solutions data frame. This creates a data frame with a column for
#each prediction set. Each prediction counts as a vote. Next I will count the number of votes
#for each row and use the class with the most votes as my final solution.
solution.table[,i]<-predict
}
#Count the number of votes for each class in each row. The id column is included in the tally, but each id
#appears only once per row, so it can never outvote a class (the winning class gets at least 4 of the 11 votes)
solution.table.count<-apply(solution.table,MARGIN=1,table)
#Create a vector to hold the final solution
predict.combined<-vector()
x=1
#Finds the class that has the most votes for each prediction row
for (x in 1:nrow(data_test)){
  predict.combined[x]<-names(which.max(solution.table.count[[x]]))
}
#View the number of predictions for each classification
table(predict.combined)
#Create solution data frame
solution<- data.frame(id=data_test[,"id"], status_group=predict.combined)
#View the first few rows of the solution to ensure that it follows the submission format rules
head(solution)
#Create csv submission file
write.csv(solution, file = "Water_solution - xgboost 45.csv", row.names = FALSE)
#Calculate the importance of each variable to the model.
#I used this function to identify variables which don't contribute to the model so they could be removed.
importance <- xgb.importance(feature_names = colnames(data_train), model = model)
importance
xgb.plot.importance(importance_matrix = importance)
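#A minimal sketch of the trial-and-error step described at the top of the script: flag low-gain features
#as candidates to drop, remove one, then re-run the xgb.cv/xgboost steps above and compare the test merror.
#The 0.01 gain cutoff is an arbitrary illustration, not a value used in this solution.
low_gain_features <- importance[Gain < 0.01, Feature]
low_gain_features
#e.g. drop one candidate and rebuild (commented out so it does not alter the script's behavior):
#data_train <- data_train[, setdiff(colnames(data_train), low_gain_features[1])]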
#score .8247