#!/bin/bash
## DEPENDENCIES
# to run this block from an Anaconda prompt on a local machine (Windows):
# "C:\Users\kriukovv\AppData\Local\Programs\Git\bin\sh.exe" 5_1_curl_datacube_request_placeholders.sh (use your local path to sh.exe)
# if it doesn't work, try "C:\Users\kriukovv\AppData\Local\Programs\Git\bin\bash.exe" 5_1_curl_datacube_request_placeholders.sh
# in PowerShell: & "C:\Users\kriukovv\AppData\Local\Programs\Git\bin\sh.exe" -- 5_1_curl_datacube_request_placeholders.sh
# required tools: jq, yq and curl must be installed
# on Windows, jq was installed manually as an executable from the official page (https://jqlang.github.io/jq/download/) and added to the PATH environment variable
# yq was installed through scoop: https://github.com/mikefarah/yq?tab=readme-ov-file
## ACCESS
# required: a GBIF user account: https://www.gbif.org/user/profile
## INPUT (all inputs are specified by the user in the config_gbif.json file)
# - taxon key or keys (unique GBIF identifiers) for class(es) or species, depending on user choice (for example, https://www.gbif.org/species/212)
# - minimum year of a GBIF occurrence record, depending on user choice
# - country to extract GBIF occurrence data for, depending on user choice
# - GBIF user credentials
# All inputs are mandatory; an illustrative sketch of the config file follows below
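# illustrative sketch of config_gbif.json with the fields this script reads; all values
# below are placeholders (not real credentials, keys or paths):
# {
#   "classKey": [212, 358],
#   "speciesKey": "None",
#   "country": "NL",
#   "min_year": 2000,
#   "notificationEmail": "user@example.org",
#   "username": "gbif_username",
#   "password": "gbif_password",
#   "output_dir_gbif": "output/gbif",
#   "gbif_query_classes": "query_classes.json",
#   "gbif_query_species": "query_species.json",
#   "gbif_query_classes_metadata": "query_classes_metadata.json",
#   "gbif_query_species_metadata": "query_species_metadata.json"
# }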
## OUTPUT (filename and path are specified by the user in the config_gbif.json file)
# - list of all occurrence records for the specified filters on taxa, years, countries and data issues, CSV (mandatory)
# - metadata for the CSV output, including the DOI (which is scheduled to be erased), JSON (mandatory)
# - (under development) licence metadata of all data sources comprising the first output (CSV, mandatory); in the end, it should be ingested into the JSON output (replacing the value of the 'license' key)
# this block also dynamically updates the config.yaml file (two values, 'gbif_datacube_csv' and 'gbif_taxon_key', for further processing)
## ISSUES AND LIMITATIONS
# - No more than 3 concurrent downloads are allowed for a standard user.
# - Some large requests may freeze without any visible outcome (probably due to connection issues) while still being listed as active and successful on https://www.gbif.org/user/download
# - The GBIF backbone taxonomy does not define Reptilia as a separate class (the class with id=358, dedicated to the Reptilia database, fetches 0 records). For the purposes of the case study, two Reptilia classes, Testudines (taxon key 11418114) and Squamata (taxon key 11592253), have been used.
# - Downloads are scheduled to be erased after some time (https://github.com/gbif/gbif-api/issues/142)
# - Experienced relatively long queueing for all requests (9648 seconds between the creation and modification timestamps on 01/10/2024-02/10/2024)
# - Occasional connection timeouts, e.g.:
#   * Trying 130.225.43.2:443...
#   * connect to 130.225.43.2 port 443 failed: Timed out
#   * Failed to connect to api.gbif.org port 443 after 21104 ms: Couldn't connect to server
#   * Closing connection
#   curl: (28) Failed to connect to api.gbif.org port 443 after 21104 ms: Couldn't connect to server
## HELP
# GBIF occurrence datacube: https://techdocs.gbif.org/en/data-use/data-cubes
# GBIF licence metadata: https://techdocs.gbif.org/en/data-use/b-cubed/generate-cube-databricks#generating-cube-metadata
# To check outputs on GBIF, cancel requests etc: https://www.gbif.org/user/download
## 1. PROCESSING: first datacube with occurrence records
## Extract year of record, classKey and credentials from the configuration file and prepare the JSON request
# assign the variables from config_gbif.json
classKey=$(jq -r '.classKey' config_gbif.json) # -r extracts raw output without any quotes
speciesKey=$(jq -r '.speciesKey' config_gbif.json)
country=$(jq -r '.country' config_gbif.json)
year=$(jq -r '.min_year' config_gbif.json)
notificationEmail=$(jq -r '.notificationEmail' config_gbif.json)
username=$(jq -r '.username' config_gbif.json)
password=$(jq -r '.password' config_gbif.json)
output_dir_gbif=$(jq -r '.output_dir_gbif' config_gbif.json)
gbif_query_classes=$(jq -r '.gbif_query_classes' config_gbif.json) # to choose query to extract classes
gbif_query_species=$(jq -r '.gbif_query_species' config_gbif.json) # to choose query to extract species
gbif_query_classes_metadata=$(jq -r '.gbif_query_classes_metadata' config_gbif.json) # to choose query to extract licence metadata for classes
gbif_query_species_metadata=$(jq -r '.gbif_query_species_metadata' config_gbif.json) # to choose query to extract licence metadata for species
# if multiple classes, convert the list from the JSON file to a comma-separated string; otherwise cast the single value to a string (for SQL syntax)
classKey_to_edit=$(jq -r '
if (.classKey | type == "array") then
.classKey | join(",")
else
.classKey | tostring
end
' config_gbif.json)
# remove any leading or trailing whitespace
classKey_to_edit=$(echo "$classKey_to_edit" | xargs)
# format the classKey for SQL (wrap in parentheses)
classKey="($classKey_to_edit)"
# debug echo "$classKey"
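# example (illustrative): .classKey = [212, 358] yields classKey="(212,358)",
# while .classKey = 212 yields classKey="(212)"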
# if multiple species, convert the list from the JSON file to a comma-separated string; otherwise cast the single value to a string (for SQL syntax)
speciesKey_to_edit=$(jq -r '
if (.speciesKey | type == "array") then
.speciesKey | join(",")
else
.speciesKey | tostring
end
' config_gbif.json)
# remove any leading or trailing whitespace
speciesKey_to_edit=$(echo "$speciesKey_to_edit" | xargs)
# format the speciesKey correctly for SQL (in parentheses)
speciesKey="($speciesKey_to_edit)"
# debug echo "$speciesKey"
# note: SQL 'IN' supports both single (classKey IN (358)) and multiple (classKey IN (358,212)) keys, so there is no need for separate single-taxon queries (classKey=358)
# for later checks: normalize speciesKey by converting it to lowercase and removing the parentheses
norm_speciesKey=$(echo "$speciesKey" | tr '[:upper:]' '[:lower:]' | tr -d '()')
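# example (illustrative): speciesKey="(None)" gives norm_speciesKey="none",
# while speciesKey="(5229490)" gives norm_speciesKey="5229490"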
# echo the variables to check the values
echo "Class key: $classKey"
echo "Species key: $speciesKey"
echo "Country code: $country"
echo "Minimum year of record: $year"
echo "Notification email: $notificationEmail"
echo "Username: $username"
echo "Password: [Hidden for security]"
echo "Query to GBIF datacube (classes): $gbif_query_classes"
echo "Query to GBIF datacube (species): $gbif_query_species"
## to specify the taxon key from the config file: if speciesKey is not defined, use classKey
# check if speciesKey is empty or one of the invalid values
if [[ -z "$norm_speciesKey" || "$norm_speciesKey" =~ ^(none|null|nan|nodata|no_data|0)$ ]]; then # list possible values for defining no data
echo "speciesKey is not defined. Using classKey instead."
taxonKey="$classKey"
gbif_query="$gbif_query_classes"
gbif_query_metadata="$gbif_query_classes_metadata" # ancillary to get licence metadata
else
echo "speciesKey is defined. Using speciesKey."
taxonKey="$speciesKey"
gbif_query="$gbif_query_species"
gbif_query_metadata="$gbif_query_species_metadata" # ancillary to get licence metadata
fi
echo "Taxon key to extract GBIF datacube: $taxonKey"
echo "Query to access GBIF datacube: $gbif_query"
# TODO - to test multiple countries
# TODO - to include a case when classKey is empty
# prepare the JSON request: define jq arguments and replace the placeholders in the query template with variables from the configuration file
jq --arg classKey "$classKey" \
--arg speciesKey "$speciesKey" \
--arg country "$country" \
--arg year "$year" \
--arg notificationEmail "$notificationEmail" \
'
.notificationAddresses[0] |= $notificationEmail |
.sql |= sub("\\{\\{year\\}\\}"; $year) |
.sql |= sub("\\{\\{classKey\\}\\}"; $classKey) |
.sql |= sub("\\{\\{speciesKey\\}\\}"; $speciesKey) |
.sql |= sub("\\{\\{country\\}\\}"; $country)
' "$gbif_query" > prepared_request.json
# debug: echo the prepared JSON request to check if it looks correct
echo "Prepared request:"
cat prepared_request.json # concatenate and print
printf '%0.s-' {1..40}; printf '\n' # '%0.s' consumes each of the 40 arguments as a zero-width string, so '-' is printed 40 times
# use curl to send the request
response=$(curl --include \
--user "$username:$password" \
--header "Content-Type: application/json" \
--data @prepared_request.json \
https://api.gbif.org/v1/occurrence/download/request)
# \ is used to continue the command on the next line
# debug: print the entire HTTP response
echo "Full HTTP Response:"
echo -e "\n$response"
printf '%0.s-' {1..40}; printf '\n'
# TODO - if the HTTP code is 40*, raise an error and stop; if 20*, keep running
# could be done by saving the headers to a temporary file and extracting the code from there, or as in the sketch below
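# a minimal sketch for this TODO (not enabled here): the response above was captured
# with --include, so its first line is the status line, e.g. "HTTP/1.1 201 Created"
# http_code=$(echo "$response" | head -n 1 | awk '{print $2}')
# case "$http_code" in
#   2*) echo "Request accepted (HTTP $http_code)";;
#   4*) echo "Request failed (HTTP $http_code)"; exit 1;;
# esac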
# to extract the download code
download_code=$(echo "$response" | tail -n 1)
# the expression above takes the download code from the last line of the response body
# alternative: isolate the part of the response that starts after 'download/'
# download_code=$(echo "$response" | grep -oP 'download/\K[^\"]+')
echo -e "\nDownload Code: $download_code"
# use the download code to check the status and download the data
if [[ "$download_code" != "null" ]]; then
echo -e "Fetching the download URL for code $download_code and taxon key $taxonKey... \n It may take dozens of minutes, depending on the size of data fetched." # -e enables identification of backlash escapes
# wait until the download is ready, and then download it
status="RUNNING"
while [[ "$status" == "RUNNING" || "$status" == "PENDING" || "$status" == "PREPARING" || "$status" == "" ]]; do
sleep 120 # wait before checking the status again (increased from 60 to reduce frequency of requests)
status_response=$(curl -v -L -Ss "https://api.gbif.org/v1/occurrence/download/${download_code}") # -v enables verbose logging
# check if there is a 503 error
if echo "$status_response" | grep -q "503 Service Unavailable"; then
echo "Service unavailable. Retrying after a short delay..."
sleep 5 # retry after 5 seconds as recommended by the API
continue
fi
# try to parse the status only if the response is valid JSON
if echo "$status_response" | jq . >/dev/null 2>&1; then # parse response as JSON. if a valid JSON, nothing is printed
status=$(echo "$status_response" | jq -r '.status') # extract status if valid JSON
else
echo "Invalid response format. Skipping parsing."
status="" # reset status to prevent the loop from breaking due to a bad response
fi
echo "Current status: $status"
done
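# a possible guard against the "frozen request" issue noted under ISSUES AND LIMITATIONS
# (sketch, not enabled): cap the number of polling iterations, e.g.
# attempts=0; max_attempts=90 # ~3 hours at the 120 s interval above
# and inside the loop:
# attempts=$((attempts + 1))
# (( attempts >= max_attempts )) && { echo "Gave up polling ${download_code}"; status="TIMEOUT"; break; }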
# if the status is 'SUCCEEDED', download the file
if [[ "$status" == "SUCCEEDED" ]]; then
echo "Download ready. Fetching the file..."
# define the output filename
filename="key_${taxonKey}_${download_code}.zip"
# TODO - truncate filename if it is too long - many species. Particular species/classes can be extracted from metadata (json)
# short delay added just in case, as the zip file had previously been downloaded empty
sleep 20
# download the file
curl --max-time 600 -L -Ss "https://api.gbif.org/v1/occurrence/download/request/${download_code}" -o "${filename}" # -L follows redirects; -Ss silences progress but still shows errors
echo "Download completed: ${filename}"
# ensure the output directory exists before moving the output
mkdir -p "${output_dir_gbif}" # -p avoids errors if the directory already exists
# move the .zip file to the output directory
mv "$filename" "${output_dir_gbif}/${filename}"
# unzip the file
echo "Unzipping ${filename}..."
if unzip "${output_dir_gbif}/${filename}" -d "${output_dir_gbif}/temp_unzip"; then
echo "Unzipping completed."
# find the .csv file
csv_file=$(find "${output_dir_gbif}/temp_unzip" -type f -name "*.csv")
# extract the base name of the CSV file (without directory and extension)
base_csv_name=$(basename "$csv_file")
# rename csv file, adding the taxon key
mv "$csv_file" "${output_dir_gbif}/key_${taxonKey}_${base_csv_name}"
# remove the temporary directory
rm -r "${output_dir_gbif}/temp_unzip"
# delete the zip file after successful extraction
rm "${output_dir_gbif}/${filename}"
echo "Deleted the zip file."
else
echo "Unzipping failed. Zip file will not be deleted."
fi
# to save metadata (anonymized?)
curl -Ss "https://api.gbif.org/v1/occurrence/download/${download_code}" -o "${output_dir_gbif}/${filename%.zip}.json" # -s silences progress output, -S still shows errors
echo "Metadata saved for: ${filename%.zip}."
# extract 'created' and 'modified' timestamps using jq
created=$(jq -r '.created' "${output_dir_gbif}/${filename%.zip}.json")
modified=$(jq -r '.modified' "${output_dir_gbif}/${filename%.zip}.json")
# calculate the fetching time (difference between modified and created) using date commands
fetching_time=$(($(date -d "$modified" +%s) - $(date -d "$created" +%s)))
# output the fetching time in a readable format
echo "Fetching time for ${filename%.zip}: $((fetching_time / 3600)) hours, $(((fetching_time % 3600) / 60)) minutes, $((fetching_time % 60)) seconds."
printf '%0.s-' {1..40}; printf '\n'
else
echo "Download failed or still processing."
printf '%0.s-' {1..40}; printf '\n'
fi
else
echo "Failed to get download code."
printf '%0.s-' {1..40}; printf '\n'
fi
# delete the intermediate json file with the prepared request
rm "prepared_request.json"
# TODO to check what other statuses might occur (apart from those mentioned above)
# TODO to consider usage of Schannel on Windows; observed messages:
# * schannel: disabled automatic use of client certificate
# * schannel: failed to decrypt data, need more data
# use yq to update the filename of gbif datacube in the YAML file
yq eval ".gbif_datacube_csv = \"${filename%.zip}.csv\"" -i config.yaml
# use yq to write the taxon key to the YAML file
yq eval ".gbif_taxon_key = \"${taxonKey}\"" -i config.yaml
## 2. PROCESSING: second datacube, fetching data sources and their licence metadata - https://techdocs.gbif.org/en/data-use/b-cubed/generate-cube-databricks#generating-cube-metadata
# It is fetched through a separate download code.
# TODO - to revisit the following block. Currently it fetches a CSV with all datasets and their licence policies. Eventually it should choose the strictest one (https://techdocs.gbif.org/en/data-use/b-cubed/generate-cube-databricks#generating-cube-metadata) and replace the value of the 'license' key in the JSON metadata.
# prepare the query by filling the placeholders with variables from the configuration file
jq --arg classKey "$classKey" \
--arg speciesKey "$speciesKey" \
--arg country "$country" \
--arg year "$year" \
--arg notificationEmail "$notificationEmail" \
'
.notificationAddresses[0] |= $notificationEmail |
.sql |= sub("\\{\\{year\\}\\}"; $year) |
.sql |= sub("\\{\\{classKey\\}\\}"; $classKey) |
.sql |= sub("\\{\\{speciesKey\\}\\}"; $speciesKey) |
.sql |= sub("\\{\\{country\\}\\}"; $country)
' "$gbif_query_metadata" > prepared_request_metadata.json
# send the prepared request with curl
response_metadata=$(curl --include \
--user "$username:$password" \
--header "Content-Type: application/json" \
--data @prepared_request_metadata.json \
https://api.gbif.org/v1/occurrence/download/request)
# debug: print the entire HTTP response for licence metadata
echo "Metadata licence response:"
echo -e "\n$response_metadata"
printf '%0.s-' {1..40}; printf '\n'
# to extract the download code
download_code_licence=$(echo "$response_metadata" | tail -n 1)
echo "Download code for licence metadata: $download_code_licence"
if [[ "$download_code_licence" != "null" ]]; then
echo -e "Fetching the download URL for code $download_code_licence and taxon key $taxonKey... \n It may take some time, depending on the size of data fetched." # -e enables identification of backlash escapes
# wait until the download is ready, and then download it
status="RUNNING"
while [[ "$status" == "RUNNING" || "$status" == "PENDING" || "$status" == "PREPARING" || "$status" == "" ]]; do
sleep 120 # wait before checking the status again (increased from 60 to reduce frequency of requests)
status_response=$(curl -v -L -Ss "https://api.gbif.org/v1/occurrence/download/${download_code_licence}") # -v enables verbose logging
status=$(echo "$status_response" | jq -r '.status') # extract the status as raw text (note: unlike the first loop, there is no 503/invalid-JSON guard here yet)
echo "Current status: $status"
done
# if the status is 'SUCCEEDED', download the file
if [[ "$status" == "SUCCEEDED" ]]; then
echo "Download ready. Fetching the file..."
# define the output filename
filename="key_${taxonKey}_licence.zip" # different from previous datacube
# TODO - truncate filename if it is too long - many species. Particular species/classes can be extracted from metadata (json)
# short delay added just in case, as the zip file had previously been downloaded empty
sleep 20
# download the file
curl --max-time 600 -L -Ss "https://api.gbif.org/v1/occurrence/download/request/${download_code_licence}" -o "${filename}" # -L follows redirects; -Ss silences progress but still shows errors
echo "Licence metadata download completed: ${filename}"
# ensure the output directory exists before moving the output
mkdir -p "${output_dir_gbif}" # -p avoids errors if the directory already exists
# move the .zip file to the output directory
mv "$filename" "${output_dir_gbif}/${filename}"
# unzip the file
echo "Unzipping ${filename}..."
if unzip "${output_dir_gbif}/${filename}" -d "${output_dir_gbif}/temp_unzip"; then
echo "Unzipping completed."
# find the .csv file
csv_file=$(find "${output_dir_gbif}/temp_unzip" -type f -name "*.csv")
# rename csv file, adding the taxon key
mv "$csv_file" "${output_dir_gbif}/key_${taxonKey}_metadata_licence.csv"
# remove the temporary directory
rm -r "${output_dir_gbif}/temp_unzip"
# delete the zip file after successful extraction
rm "${output_dir_gbif}/${filename}"
echo "Deleted the zip file."
else
echo "Unzipping failed. Zip file will not be deleted."
fi
else
echo "Licence metadata download failed or still processing."
printf '%0.s-' {1..40}; printf '\n'
fi
else
echo "Failed to get download code for licence metadata."
printf '%0.s-' {1..40}; printf '\n'
fi
# delete the intermediate json file with the prepared request
rm "prepared_request_metadata.json"
# TODO - to write the strictest licence policy to the corresponding value in the .json metadata (a rough sketch follows below)
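# a rough sketch for this TODO, assuming the licence CSV is tab-separated and contains a
# 'license' column, with strictness increasing CC0 -> CC BY -> CC BY-NC (all names below
# are placeholders, not verified against the actual output):
# strictest=$(tail -n +2 "${output_dir_gbif}/key_${taxonKey}_metadata_licence.csv" |
#   awk -F'\t' '{ if ($0 ~ /BY[-_]NC/) s=3; else if ($0 ~ /BY/ && s<2) s=2; else if (s<1) s=1 }
#     END { if (s==3) print "CC_BY_NC_4_0"; else if (s==2) print "CC_BY_4_0"; else print "CC0_1_0" }')
# jq --arg lic "$strictest" '.license = $lic' "<metadata json>" > "<updated metadata json>"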