-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPcCFOREST_PM_draft.Rmd
302 lines (230 loc) · 11 KB
/
PcCFOREST_PM_draft.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
---
title: "R Notebook"
output: html_notebook
---
```{r warning=FALSE, message=FALSE}
library(sqldf)
library(randomForest)
library(party) #changed from partykit
library(caret)
library(corrplot)
library(parallel);
library(doParallel);
library(gtools)
library(devtools)
library(dplyr)
library(data.table)
library(ggplot2)
```
#Set up encounters for RF
```{r}
#pcdata.pm<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA FILE.csv')
#8/9/18
pcdata.pm <- read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data/2017- 100 New Whistles/ROCCA Towed Data/BigRoccaFile_PM_TowedAll_EDIT_20180717.csv')
pcdata.pm <-droplevels(filter(pcdata.pm, population != "nwhi"))
#pcdata.pm<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA OLD raw.csv')
# Need even number of encounters for each population represented in the training data.
PEL = 1:8 # designated groupID's assigned for each pelagic group
#NWHI = 14:17 # designated groupID's assigned for each NWHI group, #20 was absorbed into #19 (June 2017)
MHI = c(31:34) # designated groupID's assigned for each MHI group, #25 absorbed into #26 (June 2017)
# Oct 7: more combining of NWHI and MHI occurred. Performed in Pc_DatManipulation.rmd
#This is set up to be flexible to change later if needed
nrep = 4 # multiplier for total number of groups
npel = 1 # number of pelagic schools to group together
nnwhi = 1 # number of northwest HI schools to group together
nmhi = 1 # number of main HI schools to group together
w = 150 # Number of whistles randomly pulled from each group
ntest = 1 # number of groups to randomly pull for test dataset
n_samps = 1 #Zack: number of times you want to resample from your dataset
```
Meat of the Analysis
```{r message=FALSE, warning=FALSE}
ptm<-proc.time()
#### Start ####
cresults.pm = matrix(0, nrow = 2, ncol = 2)
crealP = c()
cfakeP = c()
crealM = c()
cfakeM = c()
coutput_WhistleClass.pm =NULL
coutput_EncounterClass.pm=NULL
coutput_RawResults.pm=NULL
coutput_TrainSet.pmAll=NULL
coutput_TestSet.pmAll = NULL
coutput_Votes.pmAll=NULL
ctuned.pmAll=NULL
cvarIMPtotal.pm = NULL
#### OVERALL LOOP TO DERIVE MEANS AND STANDARD DEVS ####
for(rep in 1:n_samps){
set.seed(rep) #YB placed the seed inside the loop bc it wasn't duplicating trees
# Total groups included
#This randomly samples 'nrep' groups from each population
all = NULL
for(i in c(N_pel,N_mhi)){
sub=droplevels(subset(pcdata.pm, pcdata.pm$group==i)) #selects all whistles from the randomly selected groups, drops empty levels too
samp=sub[sample(nrow(sub), w),] #randomly samples w whistles from the 50 or more whistles
all <-rbind(all, samp)
}
ctest.pm<-droplevels(subset(all, group%in%N_test ))
ctrain.pm<-droplevels(subset(all, !(group%in%N_test)))
#print(unique(train$EncounterID))
#print(unique(test$EncounterID))
# testp <- subset(test, test$population == 'pelagic')
# testn <- subset(test, test$population == 'nwhi')
# ADDING } to test making the training and test data
#}
#ptm<-proc.time()
#### Tuning parameters ####
## Set up a dataframe where each row is a specific combination of mtry (# of variables selected at each split), n_tree (# of trees in forest), possibly other stuff
# mtry_vals = 5:7
# ntree_vals = seq(501,1001,100) #from, to, by
# #node_vals=1:3
#
# tune_settings = expand.grid(mtry = mtry_vals, ntree = ntree_vals)#, nodesize = node_vals)
# #tune_settings2 = expand.grid(mtry = mtry_vals, ntree = ntree_vals)
#
# #### Run Optimization Sequence ####
# ## Initiate Cluster
# ptm <-proc.time()
#
# cl = makeCluster(detectCores())
# registerDoParallel(cl, cores = detectCores())
#
#
# #### Formula ####
# # for the RF model with 'population' as the response and time-frequency measurements
#
# ## UNCORRELATED VARIABLES
# #formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pm)[5:30],collapse="+")))
#
# ## CORRELATED VARIABLES
cformPc <- as.formula(paste("population ~ ", paste(names(pcdata.pm)[6:52],collapse="+"))) #Use variables to classify to population
#
#Requires Party 8/9/18
cfit.pm <- cforest(cformPc, data = ctrain.pm, control=cforest_unbiased(mtry=6, ntree=100))
VI <- varimp(cfit.pm, conditional = TRUE)
caret:::cforestStats(cfit.pm)
RFfit.pm <- randomForest(cformPc, data = ctrain.pm, mtry = 6, ntree = 100, importance = T, replace = F, confusion=T)
#Requires partykit
cfit.pm <- cforest(cformPc, data=ctrain.pm, ntree=1500, mtry = 6, control=ctree_control(mincriterion = 0, minsplit = 4L, minbucket = 2L))
OOBkit.pm <- table(ctrain.pm$population, predict(cfit.pm, OOB = TRUE, type = 'response'))
OOB.pm <- 1-(sum(diag(OOBkit.pm))/nrow(ctrain.pm))
cprediction.pm <-predict(cfit.pm, newdata=ctest.pm, type='response')
ctest.pm$rightPred <- as.character(cprediction.pm) == as.character(ctest.pm$population)
#sums all 'TRUE' and divides by total
caccuracy.pm <- sum((ctest.pm$rightPred)/nrow(ctest.pm))
####RESULTS!!!#####
cresults.pmtemp <- table(ctest.pm$population, cprediction.pm)
# ##The foreach function is the loop function for parallel processing.
# ##It uses %dopar% which partitions the loop among your cores, then combines each output using the .combine argument.
#
# x = foreach(i = 1:nrow(tune_settings), .combine = rbind) %dopar% {
# library(party)
# do_RF = cforest(cformPc, data=ctrain.pm, controls=cforest_control(mincriterion = 0,ntree=tune_setting$ntree[i], mtry = tune_settings$mtry[i], minsplit = 4L, minbucket = 2L))
# print(do_RF$err.rate[tune_settings$ntree[i],])
# }
#
#
# #### Stop Cluster ####
#
# stopCluster(cl)
#
# ## Combine settings and results ####
#
# tune_settings = cbind(tune_settings, x)
#
#
# ## Optimal Settings: the settings combo
# ## that has the lowest OOB rate
#
# tuned.pm<-tune_settings[which.min(tune_settings$OOB),]
# tuned.pmAll <- rbind(tuned.pmAll, tuned.pm)
#
# proc.time()-ptm
#### Run Conditional Random Forest ####
##Run Random Forest model using 'tuned' parameters for mtry and ntree
createCfGrid <- function(len, data) {
grd = createGrid("cforest", len, data)
grd = expand.grid(.controls = cforest_unbiased(mtry = 5, ntree = 1000))
return(grd)
}
cfit.pm <- train(cformPc,
data = pcdata.pm,
method = 'cforest',
controls = cforest_unbiased(ntree = 10))
cfit.pm <- cforest(cformPc, data=ctrain.pm, controls=cforest_unbiased(ntree=2000, mtry = 7))
OOBkit.pm <- table(ctrain.pm$population, predict(cfit.pm, OOB = TRUE, type = 'response'))
OOB.pm <- 1-(sum(diag(OOBkit.pm))/nrow(ctrain.pm))
#prediction for population of test data using RF model, 'fit', returns a list with aggregate and individual votes for each whistle
cprediction.pm <-predict(cfit.pm, newdata=ctest.pm, type='response')
# cprediction.vote <-predict(cfit.pm, newdata=test.pm, predict.all = T, type='vote')
#shows T/F for individual whistles and whether they were classified correctly
ctest.pm$rightPred <- as.character(cprediction.pm) == as.character(ctest.pm$population)
#sums all 'TRUE' and divides by total
caccuracy.pm <- sum((ctest.pm$rightPred)/nrow(ctest.pm))
####RESULTS!!!#####
cresults.pmtemp <- table(ctest.pm$population, cprediction.pm) # conf matrix of individual whistle classification
ctest.pmID <-unique(ctest.pm$EncounterID)
sink("cresults.pmtemp.txt", append=T)
ctest.pmID
caccuracy.pm
cresults.pmtemp
sink()
}
#calculate % accuracies of diagonal by population from confusion matrix of percentages
pel.pel<-round((cresults.pmtemp[1,1])/(cresults.pmtemp[1,1]+cresults.pmtemp[1,2])*100,2) #pelagic
mhi.mhi<-round((cresults.pmtemp[2,2])/(cresults.pmtemp[2,1]+cresults.pmtemp[2,2])*100,2) #mhi
#calculate misclassification for pelagic
pel.mhi<-round((cresults.pmtemp[1,2])/(cresults.pmtemp[1,1]+cresults.pmtemp[1,2])*100,2)
mhi.pel<-round((cresults.pmtemp[2,1])/(cresults.pmtemp[2,1]+cresults.pmtemp[2,2])*100,2)
# Important Variables
varIMPtemp.pm <- as.data.frame(importance(fit.pm))
varIMP.pm <- varIMPtemp.pm[order(varIMPtemp.pm$MeanDecreaseGini), , drop =F]
varIMPtotal.pm <- rbind(varIMPtotal.pm, "REP"=rep, varIMP.pm)
#Calculate means and standard dev for each iteration of confusion matrix
#se=function(x) sd(x)/sqrt(length(x))
#for a single iteration, stores separate cells of results for calculating mean and standard error
se <- function(x) sd(x)/sqrt(length(x))
realP <- c(realP,cresults.pmtemp[1,1])
fakeP <- c(fakeP,cresults.pmtemp[1,2])
realM <- c(realM,cresults.pmtemp[2,2])
fakeM <- c(fakeM,cresults.pmtemp[2,1])
#Consolidate back into confusion matrix
pel.row<-c(pel.pel, pel.mhi)
#nwhi.row<-c(nwhi.pel, nwhi.nwhi, nwhi.mhi)
mhi.row<-c(mhi.pel, mhi.mhi)
#class.title<-c('pelagic', 'mhi')
conf.mat.pm <- rbind(pel.row, mhi.row)
#conf.mat.pm = conf.mat.pm + conf.mat.pmtemp
#conf.mat.pm<-data.frame(rbind("Pelagic"=pel.row, "MHI"=mhi.row))
colnames(conf.mat.pm)=c("Pelagic", "MHI")
#conf.mat.pm
true.pel = if(pel.pel>60) 'Pelagic' else 'Ambiguous'
#true.nwhi = if(nwhi.nwhi>50) 'NWHI' else 'Ambiguous'
true.mhi = if(mhi.mhi>60) 'MHI' else 'Ambiguous'
true.enc = data.frame(rbind(pel.pel, mhi.mhi))
#d = as.data.frame(unique(test$EncounterID))
#d = as.data.frame(d[c(1:3), ])
e.pm=c()
e.pm=as.data.frame(rbind(true.pel, true.mhi))
e.pm=cbind(e.pm, true.enc)
colnames(e.pm) = c("EncounterClass", "% Correct") #, "EncounterID")
#Summed results confusion matrix after each iteration
results.pm = results.pm + cresults.pmtemp
#### OUTPUTS ####
g.pm=cbind(rep, "EncounterID"= test.pm$EncounterID, "EncounterCount"=test.pm$EncounterCount, "Prediction"=test.pm$rightPred, "PredictedPop"=prediction.pm, as.data.frame(cbind(prediction.vote$aggregate, prediction.vote$individual)))
output_WhistleClass.pm <- smartbind(output_WhistleClass.pm, g.pm)
output_EncounterClass.pm <- rbind(output_EncounterClass.pm, e.pm)
output_RawResults.pm<-rbind(output_RawResults.pm, rep, results.pm, accuracy.pm=signif(accuracy.pm, digits = 3)) #rough table keeping track of confusion matrices for all iterations
output_ConfMat.pm<-rbind(output_ConfMat.pm, rep, conf.mat.pm, accuracy.pm=signif(accuracy.pm, digits = 3))
#df to show encounters used in each training set
output_TrainSet.pm <- data.frame(rep, "EncounterID" = sort(unique(train.pm$EncounterID))) #, "encounter" = sort(unique(train$group)))
output_TrainSet.pmAll <-rbind(output_TrainSet.pmAll, output_TrainSet.pm)
#df to show encounters used in each test set
output_TestSet.pm <- data.frame(rep, "EncounterID" = sort(unique(test.pm$EncounterID)), e.pm) #, "encounter" = sort(unique(train$group)))
output_TestSet.pmAll <-rbind(output_TestSet.pmAll, output_TestSet.pm)
#df to show the prediction of each individual whistle per encounter (THIS IS DOCUMENTED IN 'g')
# output_IndDW<-data.frame(rep, "EncounterID"=test$EncounterID, "PredictedPop"=prediction, "EncounterCount"=test$EncounterCount)
# output_IndDWAll <- rbind(output_IndDWAll, output_IndDW) #consolidates all individual whistle results for each iteration
}
```