PcCFOREST_PM_draft.Rmd

---
title: "R Notebook"
output: html_notebook
---

```{r warning=FALSE, message=FALSE}
library(sqldf)
library(randomForest)
library(party) #changed from partykit
library(caret)
library(corrplot)
library(parallel); 
library(doParallel);
library(gtools)
library(devtools) 
library(dplyr)
library(data.table)
library(ggplot2)


```


#Set up encounters for RF
```{r}

#pcdata.pm<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA FILE.csv')

#8/9/18
pcdata.pm <- read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data/2017- 100 New Whistles/ROCCA Towed Data/BigRoccaFile_PM_TowedAll_EDIT_20180717.csv')

pcdata.pm <-droplevels(filter(pcdata.pm, population != "nwhi"))

#pcdata.pm<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA OLD raw.csv')

# Need even number of encounters for each population represented in the training data. 
PEL = 1:8                # designated groupID's assigned for each pelagic group 
#NWHI  = 14:17   # designated groupID's assigned for each NWHI group, #20 was absorbed into #19 (June 2017)     
MHI = c(31:34)    # designated groupID's assigned for each MHI group, #25 absorbed into #26 (June 2017) 
                              # Oct 7: more combining of NWHI and MHI occurred. Performed in Pc_DatManipulation.rmd
#This is set up to be flexible to change later if needed
nrep  = 4           # multiplier for total number of groups
npel  = 1           # number of pelagic schools to group together
nnwhi = 1           # number of northwest HI schools to group together
nmhi  = 1           # number of main HI schools to group together
w = 150              # Number of whistles randomly pulled from each group 
ntest = 1           # number of groups to randomly pull for test dataset
n_samps = 1        #Zack: number of times you want to resample from your dataset


```


Meat of the Analysis

```{r message=FALSE, warning=FALSE}

ptm<-proc.time() 


#### Start ####
cresults.pm = matrix(0, nrow = 2, ncol = 2)
crealP = c()
cfakeP = c()
crealM = c()
cfakeM = c()
coutput_WhistleClass.pm =NULL
coutput_EncounterClass.pm=NULL
coutput_RawResults.pm=NULL
coutput_TrainSet.pmAll=NULL
coutput_TestSet.pmAll = NULL
coutput_Votes.pmAll=NULL
ctuned.pmAll=NULL
cvarIMPtotal.pm = NULL


#### OVERALL LOOP TO DERIVE MEANS AND STANDARD DEVS ####  
for(rep in 1:n_samps){ 
 
  set.seed(rep) #YB placed the seed inside the loop bc it wasn't duplicating trees
  
  # Total groups included
  #This randomly samples 'nrep' groups from each population
  
  
  all = NULL
  for(i in c(N_pel,N_mhi)){
    sub=droplevels(subset(pcdata.pm, pcdata.pm$group==i)) #selects all whistles from the randomly selected groups, drops empty levels too
    samp=sub[sample(nrow(sub), w),] #randomly samples w whistles from the 50 or more whistles
    all <-rbind(all, samp)
    
    }
  
  ctest.pm<-droplevels(subset(all, group%in%N_test ))
  ctrain.pm<-droplevels(subset(all, !(group%in%N_test)))
  #print(unique(train$EncounterID))
  #print(unique(test$EncounterID))
  
  # testp <- subset(test, test$population == 'pelagic')
  # testn <- subset(test, test$population == 'nwhi')

# ADDING } to test making the training and test data
#}
  
  
#ptm<-proc.time() 


#### Tuning parameters ####
## Set up a dataframe where each row is a specific combination of mtry (# of variables selected at each split), n_tree (# of trees in  forest), possibly other stuff
# mtry_vals = 5:7 
# ntree_vals = seq(501,1001,100) #from, to, by
# #node_vals=1:3
# 
# tune_settings = expand.grid(mtry = mtry_vals, ntree = ntree_vals)#, nodesize = node_vals)
# #tune_settings2 = expand.grid(mtry = mtry_vals, ntree = ntree_vals)
# 
# #### Run Optimization Sequence ####
# ## Initiate Cluster
# ptm <-proc.time()
# 
# cl = makeCluster(detectCores())
# registerDoParallel(cl, cores = detectCores())
# 
# 
# #### Formula ####
# # for the RF model with 'population' as the response and time-frequency measurements 
# 
# ## UNCORRELATED VARIABLES
# #formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pm)[5:30],collapse="+")))
# 
# ## CORRELATED VARIABLES
cformPc <- as.formula(paste("population ~ ", paste(names(pcdata.pm)[6:52],collapse="+"))) #Use variables to classify to population
# 
  
#Requires Party 8/9/18
cfit.pm <- cforest(cformPc, data = ctrain.pm, control=cforest_unbiased(mtry=6, ntree=100))
VI <- varimp(cfit.pm, conditional = TRUE)  
caret:::cforestStats(cfit.pm)

RFfit.pm <- randomForest(cformPc, data = ctrain.pm, mtry = 6, ntree = 100, importance = T, replace = F, confusion=T)
  
#Requires partykit
cfit.pm <- cforest(cformPc, data=ctrain.pm, ntree=1500, mtry = 6, control=ctree_control(mincriterion = 0, minsplit = 4L, minbucket = 2L))

OOBkit.pm <- table(ctrain.pm$population, predict(cfit.pm, OOB = TRUE, type = 'response')) 
OOB.pm <- 1-(sum(diag(OOBkit.pm))/nrow(ctrain.pm)) 

cprediction.pm <-predict(cfit.pm, newdata=ctest.pm, type='response')
ctest.pm$rightPred <- as.character(cprediction.pm) == as.character(ctest.pm$population)

 #sums all 'TRUE' and divides by total
   caccuracy.pm <- sum((ctest.pm$rightPred)/nrow(ctest.pm)) 
  
  ####RESULTS!!!#####  
  cresults.pmtemp <- table(ctest.pm$population, cprediction.pm)

# ##The foreach function is the loop function for parallel processing. 
# ##It uses %dopar% which partitions the loop among your cores, then combines each output using the .combine argument. 
# 
# x = foreach(i = 1:nrow(tune_settings), .combine = rbind) %dopar% { 
#     library(party)
#     do_RF = cforest(cformPc, data=ctrain.pm,  controls=cforest_control(mincriterion = 0,ntree=tune_setting$ntree[i], mtry = tune_settings$mtry[i], minsplit = 4L, minbucket = 2L))
#     print(do_RF$err.rate[tune_settings$ntree[i],])
#   }
# 
# 
# #### Stop Cluster ####
# 
# stopCluster(cl)
# 
# ## Combine settings and results ####
# 
# tune_settings = cbind(tune_settings, x)
# 
# 
# ## Optimal Settings: the settings combo
# ## that has the lowest OOB rate
# 
# tuned.pm<-tune_settings[which.min(tune_settings$OOB),]
# tuned.pmAll <- rbind(tuned.pmAll, tuned.pm)
# 
# proc.time()-ptm

#### Run Conditional Random Forest ####
##Run Random Forest model using 'tuned' parameters for mtry and ntree
 createCfGrid <- function(len, data) {
    grd = createGrid("cforest", len, data)
    grd = expand.grid(.controls = cforest_unbiased(mtry = 5, ntree = 1000))
    return(grd)
}
  cfit.pm <- train(cformPc,
                   data = pcdata.pm, 
                   method = 'cforest',
                   controls = cforest_unbiased(ntree = 10))
  
   cfit.pm <- cforest(cformPc, data=ctrain.pm, controls=cforest_unbiased(ntree=2000, mtry = 7))
 OOBkit.pm <- table(ctrain.pm$population, predict(cfit.pm, OOB = TRUE, type = 'response')) 
 OOB.pm <- 1-(sum(diag(OOBkit.pm))/nrow(ctrain.pm)) 
#prediction for population of test data using RF model, 'fit', returns a list with aggregate and individual votes for each whistle  
  cprediction.pm <-predict(cfit.pm, newdata=ctest.pm, type='response') 

 # cprediction.vote <-predict(cfit.pm, newdata=test.pm, predict.all = T, type='vote')

   #shows T/F for individual whistles and whether they were classified correctly  
   ctest.pm$rightPred <- as.character(cprediction.pm) == as.character(ctest.pm$population) 
  
   #sums all 'TRUE' and divides by total
   caccuracy.pm <- sum((ctest.pm$rightPred)/nrow(ctest.pm)) 
  
  ####RESULTS!!!#####  
  cresults.pmtemp <- table(ctest.pm$population, cprediction.pm)  # conf matrix of individual whistle classification
  
ctest.pmID <-unique(ctest.pm$EncounterID)
sink("cresults.pmtemp.txt", append=T)
ctest.pmID
caccuracy.pm
cresults.pmtemp
sink()

}   
  #calculate % accuracies of diagonal by population from confusion matrix of percentages
pel.pel<-round((cresults.pmtemp[1,1])/(cresults.pmtemp[1,1]+cresults.pmtemp[1,2])*100,2) #pelagic
mhi.mhi<-round((cresults.pmtemp[2,2])/(cresults.pmtemp[2,1]+cresults.pmtemp[2,2])*100,2) #mhi 
  #calculate misclassification for pelagic
pel.mhi<-round((cresults.pmtemp[1,2])/(cresults.pmtemp[1,1]+cresults.pmtemp[1,2])*100,2)
mhi.pel<-round((cresults.pmtemp[2,1])/(cresults.pmtemp[2,1]+cresults.pmtemp[2,2])*100,2) 
  

  # Important Variables
  varIMPtemp.pm <- as.data.frame(importance(fit.pm))
  varIMP.pm <- varIMPtemp.pm[order(varIMPtemp.pm$MeanDecreaseGini), , drop =F]
  varIMPtotal.pm <- rbind(varIMPtotal.pm, "REP"=rep, varIMP.pm)

  
#Calculate means and standard dev for each iteration of confusion matrix
#se=function(x) sd(x)/sqrt(length(x))
#for a single iteration, stores separate cells of results for calculating mean and standard error
se <- function(x) sd(x)/sqrt(length(x))
realP <- c(realP,cresults.pmtemp[1,1])
fakeP <- c(fakeP,cresults.pmtemp[1,2])
realM <- c(realM,cresults.pmtemp[2,2])
fakeM <- c(fakeM,cresults.pmtemp[2,1])


#Consolidate back into confusion matrix
pel.row<-c(pel.pel, pel.mhi)
#nwhi.row<-c(nwhi.pel, nwhi.nwhi, nwhi.mhi)
mhi.row<-c(mhi.pel, mhi.mhi)
#class.title<-c('pelagic', 'mhi')
conf.mat.pm <- rbind(pel.row, mhi.row)
#conf.mat.pm = conf.mat.pm + conf.mat.pmtemp
#conf.mat.pm<-data.frame(rbind("Pelagic"=pel.row, "MHI"=mhi.row))


colnames(conf.mat.pm)=c("Pelagic", "MHI")
#conf.mat.pm

true.pel = if(pel.pel>60) 'Pelagic' else 'Ambiguous'
#true.nwhi = if(nwhi.nwhi>50) 'NWHI' else 'Ambiguous'
true.mhi = if(mhi.mhi>60) 'MHI' else 'Ambiguous'
true.enc = data.frame(rbind(pel.pel, mhi.mhi))
#d = as.data.frame(unique(test$EncounterID))
#d = as.data.frame(d[c(1:3), ])
e.pm=c()
e.pm=as.data.frame(rbind(true.pel, true.mhi))
e.pm=cbind(e.pm, true.enc)
colnames(e.pm) = c("EncounterClass", "% Correct") #, "EncounterID")

#Summed results confusion matrix after each iteration
results.pm = results.pm + cresults.pmtemp 


#### OUTPUTS ####

g.pm=cbind(rep, "EncounterID"= test.pm$EncounterID, "EncounterCount"=test.pm$EncounterCount, "Prediction"=test.pm$rightPred, "PredictedPop"=prediction.pm, as.data.frame(cbind(prediction.vote$aggregate, prediction.vote$individual))) 

output_WhistleClass.pm <- smartbind(output_WhistleClass.pm, g.pm)

output_EncounterClass.pm <- rbind(output_EncounterClass.pm, e.pm)

output_RawResults.pm<-rbind(output_RawResults.pm, rep, results.pm, accuracy.pm=signif(accuracy.pm, digits = 3)) #rough table keeping track of confusion matrices for all iterations
output_ConfMat.pm<-rbind(output_ConfMat.pm, rep, conf.mat.pm, accuracy.pm=signif(accuracy.pm, digits = 3))
#df to show encounters used in each training set  
output_TrainSet.pm <- data.frame(rep, "EncounterID" = sort(unique(train.pm$EncounterID))) #, "encounter" = sort(unique(train$group)))
output_TrainSet.pmAll <-rbind(output_TrainSet.pmAll, output_TrainSet.pm)

#df to show encounters used in each test set
output_TestSet.pm <- data.frame(rep, "EncounterID" = sort(unique(test.pm$EncounterID)), e.pm) #, "encounter" = sort(unique(train$group)))
output_TestSet.pmAll <-rbind(output_TestSet.pmAll, output_TestSet.pm)

#df to show the prediction of each individual whistle per encounter (THIS IS DOCUMENTED IN 'g')
# output_IndDW<-data.frame(rep, "EncounterID"=test$EncounterID, "PredictedPop"=prediction, "EncounterCount"=test$EncounterCount) 
# output_IndDWAll <- rbind(output_IndDWAll, output_IndDW)  #consolidates all individual whistle results for each iteration

}
```