-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathinat_images.R
170 lines (137 loc) · 5.77 KB
/
inat_images.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#! /usr/bin/Rscript
## Guillermo Huerta Ramos

# Ensure the required packages are installed, then attach them.
# requireNamespace() only tests availability (no attach side effect);
# library() then attaches the package and fails loudly if the
# installation did not succeed — unlike require(), which merely
# returns FALSE and would let the script continue broken.
inat_packages <- c("rinat", "argparse")
for (pkg in inat_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE,
                     repos = "https://cloud.r-project.org")
  }
  library(pkg, character.only = TRUE)
}
# Command-line argument configuration.
# Each spec entry: short flag, long flag, default value, help text.
parser <- ArgumentParser()
arg_specs <- list(
  list("-o", "--observations", 100,
       "The maximum number of results to return [default \"%(default)s\"]"),
  list("-q", "--quality", "Research",
       "Quality grade - Research or All_Q [default \"%(default)s\"]"),
  list("-l", "--license", "NonCC",
       "License type - NonCC, Wikicommons or All_L [default \"%(default)s\"]"),
  list("-s", "--size", "Medium",
       "Select image size - Small, Medium, Large, Original [default \"%(default)s\"]"),
  list("-y", "--year", NULL,
       "Return observations for a given year (can only be one year) [default \"%(default)s\"]"),
  list("-m", "--month", NULL,
       "Return observations for a given month, must be numeric, 1-12 [default \"%(default)s\"]"),
  list("-d", "--day", NULL,
       "Return observations for a given day of the month, 1-31 [default \"%(default)s\"]"),
  list("-b", "--bounds", NULL,
       "A txt file with box of longitude (-180 to 180) and latitude (-90 to 90) see bounds.txt sample [default \"%(default)s\"]")
)
for (spec in arg_specs) {
  parser$add_argument(spec[[1]], spec[[2]],
                      default = spec[[3]], help = spec[[4]])
}
args <- parser$parse_args()
# Create the image output folder; showWarnings = FALSE makes re-runs
# quiet when the folder already exists (dir.create is a no-op then).
image_folder <- "./images"
dir.create(image_folder, showWarnings = FALSE)

# Read the species list; expects columns "Genus" and "Species".
obs <- read.csv("./species.csv", header = TRUE)

# Combine genus and species into a single query string per row.
# No need to wrap in a data.frame — downstream code treats this as
# a plain character vector.
obs <- paste(obs$Genus, obs$Species)

## Optional clean-up steps, useful if your database has any of the following:
# Drop blank rows: two empty cells paste to " ", so require at least one
# non-whitespace character (also catches whitespace-only rows, which the
# old `obs == " "` test missed).
obs <- obs[!is.na(obs) & grepl("\\S", obs)]
# Trim to the binomial: subspecies are not accepted as a query
# (they will still be downloaded as descendant taxa).
obs <- sub("^(\\S*\\s+\\S+).*", "\\1", obs)
# Remove duplicated names.
obs <- unique(obs)
# When the "bounds" argument is supplied, parse the bounding box file.
# The file is expected to contain something like "c(lat1, lon1, lat2, lon2)".
if (!is.null(args$bounds)) {
  # Read the raw text (one or more lines).
  bounds_text <- readLines(paste0("./", args$bounds))
  # Strip the "c(" wrapper and closing ")" so only numbers and commas remain.
  bounds_text <- gsub("c\\(|\\)", "", bounds_text)
  # Split on commas and convert to a numeric vector; unlist() flattens
  # the per-line results in case the file spans several lines.
  args$bounds <- as.numeric(unlist(strsplit(bounds_text, ",")))
}
#### Get image urls and observation metadata for every species query.
inat_data <- sapply(X = obs, FUN = function(x) {
  message(sprintf("Fetching data for %s", x))
  # tryCatch keeps the script running when a query has no hits.
  # The result is returned directly from tryCatch (NULL on error)
  # instead of the old exists("inat_out") check: exists() searches
  # enclosing environments, so a stale `inat_out` in the global
  # environment (e.g. from a prior interactive run) could have been
  # silently returned for a failed query.
  tryCatch(
    {
      # Change the "observations" argument to set how many records to fetch.
      inat_out <- get_inat_obs(taxon_name = x,
        maxresults = as.numeric(args$observations),
        quality = NULL,
        year = args$year,
        month = args$month,
        day = args$day,
        bounds = args$bounds
      )
      # Delay queries 2.5 seconds to avoid server overload errors.
      Sys.sleep(2.5)
      inat_out
    },
    error = function(e) {
      print(paste0("WARNING:couldn't find a match for ", x))
      NULL
    }
  )
}, simplify = FALSE)
# Drop species whose query failed, then combine into one data frame.
omit_inat <- vapply(inat_data, is.null, logical(1))
inat_data <- do.call(rbind, inat_data[!omit_inat])
species <- unique(inat_data$scientific_name)
# For each species: filter by quality and license, then download the images.
# Returns the filtered data frame so the metadata can be written out later.
final_inat_data <- sapply(X = species, FUN = function(x, inat_data, image_folder) {
  newdata <- inat_data[inat_data$scientific_name == x, ]
  # Quality filter: "Research" keeps research-grade records only;
  # "All_Q" (or anything else) keeps everything. The !is.na() guard
  # prevents NA comparison results from injecting NA-filled rows.
  if (args$quality == "Research") {
    newdata <- newdata[!is.na(newdata$quality_grade) &
                         newdata$quality_grade == "research", ]
  }
  # License filter. An empty license string appears to mean "no CC
  # license recorded" — NOTE(review): confirm against rinat output.
  lic <- newdata$license
  if (args$license == "Wikicommons") {
    newdata <- newdata[!is.na(lic) & lic != "" & lic != "CC-BY-NC", ]
  } else if (args$license == "NonCC") {
    newdata <- newdata[!is.na(lic) & lic != "", ]
  }
  # One sub-folder per species; gsub (not sub) so EVERY space becomes "_"
  # — scientific_name can contain subspecies with more than one space.
  infolder <- file.path(image_folder, gsub(" ", "_", x))
  dir.create(infolder, showWarnings = FALSE)
  for (b in seq_len(nrow(newdata))) {
    # tryCatch keeps the loop going when a single download fails.
    tryCatch(
      {
        user <- newdata[b, ]$user_login
        cc <- newdata[b, ]$license
        # Untagged licenses ("" or NA) are labeled "CC" in the file name.
        if (is.na(cc) || cc == "") {
          cc <- "CC"
        }
        url <- newdata[b, ]$image_url
        id <- newdata[b, ]$id
        # iNaturalist serves "medium" URLs; rewrite for other sizes.
        if (args$size == "Small") {
          url <- sub("medium", "small", url)
        } else if (args$size == "Large") {
          url <- sub("medium", "large", url)
        } else if (args$size == "Original") {
          url <- sub("medium", "original", url)
        }
        file_name <- paste0(x, "_", user, "_", cc, "_", id, ".jpeg")
        file_name <- file.path(infolder, file_name)
        download.file(url, file_name, method = "curl")
      },
      error = function(e) {
        print(paste0("WARNING: couldn't find the url"))
      }
    )
  }
  return(newdata)
}, inat_data = inat_data, image_folder = image_folder, simplify = FALSE)
# Combine the per-species data frames and write the observation metadata.
# NOTE(review): the output file is tab-separated despite the .csv extension.
final_inat_data <- do.call(rbind, final_inat_data)
write.table(final_inat_data,
            file = "./inat_data.csv",
            sep = "\t",
            row.names = FALSE)