-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmachineLearningModels.R
159 lines (131 loc) · 6.82 KB
/
machineLearningModels.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
## Machine learning models on NEON tick abundance data
## BKH; 3/4/2020
## Fits boosted regression trees (BRT, via dismo::gbm.step) and random
## forests to tick Count data under time-based and random train/test
## splits, then plots predicted vs. observed counts on held-out data.
library(gbm)
library(dismo) # for gbm.step
library(randomForest) # for randomForest
library(ranger) # for ranger; can give uncertainty in predictions
library(dplyr)
# NOTE(review): rm(list=ls()) in a script is discouraged -- it wipes the
# caller's global environment when the file is sourced. Prefer running the
# script in a fresh R session instead.
rm(list=ls())
## load data
# All pre-split CSVs live in one directory and share a common filename
# suffix; centralize the path so it is edited in exactly one place.
ticks.dir <- "data_derived/subset_data/ticks_taxonomy_collapsed"
# Read one of the pre-split tick CSVs by its filename prefix
# (e.g. "complete_data" -> complete_data_taxonomy_collapsed.csv).
read.ticks <- function(prefix) {
  read.csv(file.path(ticks.dir, paste0(prefix, "_taxonomy_collapsed.csv")))
}
# complete dataset
data.all <- read.ticks("complete_data")
# domain-based training & test datasets
# NOTE(review): the "*_testset" files feed the *.train objects and the
# "*_validationset" files feed the *.test objects -- looks intentional from
# the original code, but worth confirming against the file-generation script.
data.domain.train <- read.ticks("domain_testset")
data.domain.test <- read.ticks("domain_validationset")
# time-based training & test datasets
data.time.train <- read.ticks("time_testset")
data.time.test <- read.ticks("time_validationset")
# random training & test datasets
data.rand.train <- read.ticks("random_testset")
data.rand.test <- read.ticks("random_validationset")
## get list of sites that have *some* tick observations
# table with all site counts: one row per siteID with its total tick count
# (use left-assignment `<-`; right-assignment `->` at the end of a long
# pipeline hides where the result lands)
site.table <- data.all %>%
  group_by(siteID) %>%
  summarise(siteTotal = sum(Count)) %>%
  ungroup()
# sites where at least one tick was ever observed; used below to drop
# all-zero sites before model fitting (dplyr::filter, not base subset,
# inside a dplyr pipeline)
tickSites <- site.table %>%
  filter(siteTotal > 0) %>%
  pull(siteID)
#View(data.time.test)
#View(filter(data.time.test, siteID %in% tickSites))
## code for BRT and RF taken from supplementary appendix 2 of
## Oppel et al 2012, Biological Conservation
## paper doi: https://doi.org/10.1016/j.biocon.2011.11.013
## appendix url: https://ars.els-cdn.com/content/image/1-s2.0-S0006320711004319-mmc2.txt
##############################
##### Nymph stage models #####
##############################
# Column positions (not names) in the split datasets used by gbm.step below.
predictorIndex <- c(1:3, 5, 10:11) # predictor columns
responseIndex <- 16                # response column (tick Count)
##### DOMAIN-based split #####
# I don't like splitting train/test by domain; removed this code
# That's where "BRT1" & "RF1" went to
##### TIME-based split #####
## Boosted regression trees ##
# Fit to training data: nymphs only, restricted to sites that ever had
# ticks. gbm.step (dismo) selects the optimal number of trees by
# cross-validation. (Removed a stray interactive `?gbm.step` help call
# that would pop up the help pager whenever the script is sourced.)
BRT2 <- gbm.step(data = subset(data.time.train,
                               lifeStage == "Nymph" & siteID %in% tickSites),
                 gbm.x = predictorIndex, # indexes or names of predictors
                 gbm.y = responseIndex, # index or name of response
                 family = "poisson", # no negative binomial option
                 tree.complexity = 8, # this is what paper used; explore
                 learning.rate = 0.001, # this is what paper used
                 bag.fraction = 0.64 # this is what paper used
                 )
summary(BRT2) # plotID, month, year, & domainID important (in decreasing order)
# Predict to test data -- same nymph/tick-site subset as used for training
data.time.test.n <- subset(data.time.test,
                           lifeStage == "Nymph" & siteID %in% tickSites)
# Use the predict() generic rather than calling predict.gbm() directly;
# n.trees is the CV-selected tree count stored by gbm.step.
data.time.test.n$BRT_pred <- predict(BRT2,
                                     data.time.test.n,
                                     n.trees = BRT2$gbm.call$best.trees,
                                     type = "response")
## plot predicted versus observed on the log1p scale
## (log1p(x) == log(x + 1), but more accurate near zero counts)
plot(log1p(data.time.test.n$Count), log1p(data.time.test.n$BRT_pred))
## Random Forests ##
## fit model to training set
# (removed a stray interactive `?randomForest` help call)
# ***randomForest cannot handle categorical predictors with >53 categories,
# so removing plotID from model
RF2 <- randomForest(Count ~ domainID + siteID + nlcdClass + year + month,
                    data = data.time.train,
                    importance = TRUE, # compute importance of predictors
                    replace = TRUE, # sampling with replacement
                    mtry = 5, # #of var rand sampled as candidates at each split
                    ntree = 1500, # #of trees to grow
                    na.action = na.omit) # other settings too, this is what paper did
RF2$importance
# interp. help: https://stats.stackexchange.com/questions/162465/in-a-random-forest-is-larger-incmse-better-or-worse
# %IncMSE: higher number --> more important
# IncNodePurity: higher number --> better variable
# Decreasing Importance: siteID > month > year > nlcdClass > domainID
## predict to test set
data.time.test.n$RF_pred <- predict(RF2, data.time.test.n)
## plot observed versus estimated counts (log1p scale handles zeros)
plot(log1p(data.time.test.n$Count), log1p(data.time.test.n$RF_pred))
# over-estimating when obs count == 0, but pretty decent
##### RANDOM-based split #####
## Boosted regression trees ##
## Fit to training data -- same settings and subset logic as BRT2 above
BRT3 <- gbm.step(data = subset(data.rand.train,
                               lifeStage == "Nymph" & siteID %in% tickSites),
                 gbm.x = predictorIndex, # indexes or names of predictors
                 gbm.y = responseIndex, # index or name of response
                 family = "poisson", # no negative binomial option
                 tree.complexity = 8, # this is what paper used; explore
                 learning.rate = 0.001, # this is what paper used
                 bag.fraction = 0.64 # this is what paper used
                 )
summary(BRT3) # plotID, year, & month most important
## Predict to test data (removed a stray interactive `?predict.gbm` call;
## use the predict() generic rather than predict.gbm() directly)
data.rand.test.n <- subset(data.rand.test,
                           lifeStage == "Nymph" & siteID %in% tickSites)
data.rand.test.n$BRT_pred <- predict(BRT3,
                                     data.rand.test.n,
                                     n.trees = BRT3$gbm.call$best.trees,
                                     type = "response")
## plot predicted versus observed on the log1p scale
plot(log1p(data.rand.test.n$Count), log1p(data.rand.test.n$BRT_pred))
## Random forests ##
## fit model to training set
# (removed a stray interactive `?randomForest` help call)
# ***randomForest cannot handle categorical predictors with >53 categories,
# so removing plotID from model
RF3 <- randomForest(Count ~ domainID + siteID + nlcdClass + year + month,
                    data = data.rand.train,
                    importance = TRUE, # compute importance of predictors
                    replace = TRUE, # sampling with replacement
                    mtry = 5, # #of var rand sampled as candidates at each split
                    ntree = 1500, # #of trees to grow
                    na.action = na.omit) # other settings too, this is what paper did
RF3$importance
# Decreasing Importance: month > siteID > nlcdClass > year > domain
## predict to test set
# FIX: predict.randomForest errors with "Type of predictors in new data do
# not match that of the training data" when a categorical predictor's
# levels in newdata differ from the training data. Re-level the test-set
# factors against the training data before predicting. (Any test value
# absent from training becomes NA and is dropped by predict's NA handling
# -- such rows have no trained level to predict from anyway.)
for (v in c("domainID", "siteID", "nlcdClass")) {
  data.rand.test.n[[v]] <- factor(data.rand.test.n[[v]],
                                  levels = levels(factor(data.rand.train[[v]])))
}
data.rand.test.n$RF_pred <- predict(RF3, data.rand.test.n)
## plot observed counts versus predicted (log1p scale handles zeros)
plot(log1p(data.rand.test.n$Count), log1p(data.rand.test.n$RF_pred))
## BKH only: save workspace (models take a very long time to run on my laptop)
# *** note to self: add below file to .gitignore
#save.image(file="machineLearnSpace.RData")
#load(file="machineLearnSpace.RData")  # counterpart of save.image() is load(), not load.image()