-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelper_functions.R
97 lines (73 loc) · 4.91 KB
/
helper_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
## Helper functions for predicting residential status of OSM buildings
library(rgeos)
library(RANN)
get_data <- function(structures_poly, training=TRUE, impervious=TRUE, cropped_by=NULL){
# Subset only those with type
with_type <- which(!is.na(structures_poly$type))
if(training==TRUE){
structures_poly_selected <- structures_poly[with_type,]
}else{
structures_poly_selected <- structures_poly[-with_type,]
}
# Simplify structure polygons to points
structure_points <- SpatialPointsDataFrame(gCentroid(structures_poly_selected,byid=TRUE),
structures_poly_selected@data, match.ID=FALSE)
# Calculate distance to road (nearest road coordinate)
get_coords <- function(x){x@Lines}
road_coords <- sapply(roads@lines, get_coords)
road_coords <- sapply(road_coords, coordinates)
road_coords_all <- do.call("rbind", road_coords)
structure_points$dist_to_nearest_road <- nn2(road_coords_all, structure_points@coords, eps=0,searchtype="priority", k=2)$nn.dists[,2]
# Calculate areas
areas <- sapply(slot(structures_poly_selected, "polygons"), function(x) sapply(slot(x, "Polygons"), slot, "area"))
structure_points$poly_area <- sapply(areas, max) #Some buildings have multiple polygons so choose largest
structure_points$n_poly <- sapply(areas, length)
# Calculate polygon 'complexity'
poly_coords <- sapply(slot(structures_poly_selected, "polygons"), function(x) sapply(slot(x, "Polygons"), slot, "coords"))
structure_points$poly_complexity <- sapply(poly_coords, function(x){length(unlist(x))})
# Calculate distance to nearest structure
structure_points$dist_to_nearest <- nn2(structure_points@coords,structure_points@coords, eps=0,searchtype="priority", k=5)$nn.dists[,2]
# Calculate features of nearest structure
structure_points$area_of_nearest <- structure_points$poly_area[nn2(structure_points@coords,structure_points@coords, eps=0,searchtype="priority", k=2)$nn.idx[,2]]
structure_points$poly_complexity_of_nearest <- structure_points$poly_complexity[nn2(structure_points@coords,structure_points@coords, eps=0,searchtype="priority", k=2)$nn.idx[,2]]
structure_points$n_poly_of_nearest <- structure_points$n_poly[nn2(structure_points@coords,structure_points@coords, eps=0,searchtype="priority", k=2)$nn.idx[,2]]
# Import data on 'imperviousness' extracted from Midekisa et al. 2017 PloS ONE 12(9) e0184926
Impervious <- read.csv(paste0(country_code,"_raw_OSM_062617.csv"))
Impervious_buffer <- read.csv(paste0(country_code,"_resample_OSM_062617.csv"))
# Merge
structure_points <- merge(structure_points, Impervious[,c("OSM_ID", "Impervious")], by.x="osm_id",by.y="OSM_ID")
structure_points <- merge(structure_points, Impervious_buffer[,c("OSM_ID", "Impervious_buffer")], by.x="osm_id",by.y="OSM_ID")
structure_points$Impervious_of_nearest <- structure_points$Impervious[nn2(structure_points@coords,structure_points@coords, eps=0,searchtype="priority", k=2)$nn.idx[,2]]
# Add lat long as columns
structure_points$lng <- structure_points@coords[,1]
structure_points$lat <- structure_points@coords[,2]
return(structure_points)
}
## Identify optimal cutoff for producing equal sensitivity and specificity
optimal_cutoff <- function(predictions, observations){
perc_corr_class_sprayable <- NULL
perc_corr_class_not_sprayable <- NULL
true_num_sprayable <- sum(observations)
true_num_not_sprayable <- sum(observations==0)
for(i in 1:1000){
predicted_class_loop <- ifelse(predictions>=i/1000,1,0)
perc_corr_class_sprayable <- c(perc_corr_class_sprayable,
round(sum(observations==1 & predicted_class_loop==1,na.rm=T)/true_num_sprayable,3))
perc_corr_class_not_sprayable <- c(perc_corr_class_not_sprayable,
round(sum(observations==0 & predicted_class_loop==0,na.rm=T)/true_num_not_sprayable,3))
}
plot_data <- data.frame(threshold = rep(seq(0,1,length.out=1000),2),
perc_corr_class = c(perc_corr_class_sprayable, perc_corr_class_not_sprayable),
group = c(rep("Sprayable",1000), rep("Not sprayable",1000)))
ggplot(data=plot_data, aes(x=threshold, y=perc_corr_class, group=group, color=group)) +
geom_line(size=1.5) + xlab("Threshold") + ylab("Proportion correctly classified")
# Identify where the lines overlap
#opt_threshold <- (which(perc_corr_class_sprayable == perc_corr_class_not_sprayable)/1000)[1]
opt_threshold <- which.min((perc_corr_class_sprayable - perc_corr_class_not_sprayable)^2)[1]/1000
# Table predicted class against observed using different thresholds
predicted_class <- ifelse(predictions>=opt_threshold,1,0) # Threshold of 0.7 gives 95% sensitivity in both
table(predicted_class, observations)
return(list(plot_data = plot_data,
opt_cutoff = opt_threshold,
confusion_matrix = table(predicted_class, observations)))
}