-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_exploration.R
179 lines (134 loc) · 5.79 KB
/
data_exploration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#Code for Atlas of Living Australia
#Created by Francisca Maron
#Date 07/12/2021
#Dependencies
library(tidyverse)
library(lubridate)
library(janitor)
library(httr)
library(jsonlite)
library(inspectdf)
# Download data ##########################################################

# Search reptile occurrences through the ALA API (https://api.ala.org.au/).
# The facet list asks the service to summarise the result set by these fields.
search_reptiles <- GET(
  "https://biocache-ws.ala.org.au/ws/occurrences/search",
  query = list(
    q = "class:Reptilia",
    fq = "cl22:Australian Capital Territory",
    facets = paste(
      "decimalLatitude",
      "decimalLongitude",
      "scientificName",
      "dataResourceName",
      "basisOfRecord",
      "cl22",
      "cl10902",
      "class",
      sep = ","
    )
  )
)
# GET() does not raise on HTTP 4xx/5xx, so fail here rather than while
# parsing an error page as JSON
stop_for_status(search_reptiles)
search_reptiles # json

# Decode the response body as UTF-8 text and parse it into R structures
search_content <- fromJSON(
  content(search_reptiles, as = "text", encoding = "UTF-8")
)
names(search_content)

# Download occurrences (zip file according to documentation).
# write_disk() streams the response body straight to ala_reptiles.zip,
# replacing any previous download.
download_reptiles <- GET(
  "https://biocache-ws.ala.org.au/ws/occurrences/index/download",
  query = list(
    q = "class:Reptilia",
    fq = "cl22:Australian Capital Territory",
    fields = paste(
      "decimalLatitude",
      "decimalLongitude",
      "scientificName",
      "dataResourceName",
      "basisOfRecord",
      "class",
      "eventDate",
      "cl22",
      "cl10902",
      sep = ","
    ),
    email = "[YOUR_EMAIL]",
    reasonTypeId = "6"
  ),
  write_disk("ala_reptiles.zip", overwrite = TRUE)
)
# Without this check a failed request leaves an error body on disk and the
# problem only shows up as a confusing unzip() failure
stop_for_status(download_reptiles)
download_reptiles # zip

# Unzip the data into the working directory
unzip("ala_reptiles.zip")
# Data cleaning ##########################################################

# Load the occurrence table extracted from the downloaded archive
reptiles <- read.csv(file = "data.csv", header = TRUE)
head(reptiles)
names(reptiles) # messy column names!

# Convert the headers to lower camel case (to match the task), then rename
# the columns that still don't match so they are easier to work with
reptiles <- reptiles %>%
  clean_names(case = "small_camel") %>%
  rename(
    decimalLatitude = decimalLatitudeWgs84,
    decimalLongitude = decimalLongitudeWgs84,
    scientificName = scientificNameIntepreted,
    state = australianStatesAndTerritories,
    forest2013 = forestsOfAustralia2013V2_0
  )

# The filter query didn't work because the forest layer had greater extent
unique(reptiles$state)
unique(reptiles$class) # Reptilia

# Keep only ACT records and the columns relevant to the analysis
reptiles <- reptiles %>%
  filter(state == "Australian Capital Territory") %>%
  select(all_of(c(
    "decimalLatitude", "decimalLongitude", "eventDate",
    "scientificName", "class", "dataResourceName",
    "basisOfRecord", "state", "forest2013"
  )))

# Confirm the filter and the column selection worked
unique(reptiles$state) # ACT
head(reptiles) # selected columns

# Inspect structure
str(reptiles)

# Inspect the eventDate column
reptiles$eventDate

# Parse eventDate from character into a datetime
reptiles <- mutate(reptiles, eventDate = ymd_hms(eventDate))
str(reptiles) # date is POSIXct now
# Exploratory analysis ##########################################################

# Missing values per column (just eventDate has missing data)
inspect_na(reptiles)

# Summary of the categorical variables
inspect_cat(reptiles)

# Univariate analyses

# Species
unique(reptiles$scientificName)
n_distinct(reptiles$scientificName) # 102 unique species (not good for bar plot)
table(reptiles$scientificName) # some species lack the complete scientific name

# For the visualisation, keep only the species with a complete scientific
# name, i.e. two or more words. The regex requires a leading capital, then
# one or more non-capital characters, then white space; the white space is
# what drops records identified to genus only.
reptiles <- filter(
  reptiles,
  grepl(pattern = "^[A-Z][^A-Z]+\\s+", x = scientificName)
)

# Forest types
unique(reptiles$forest2013) # there is a blank string as a value
n_distinct(reptiles$forest2013) # 11 different
table(reptiles$forest2013) # 17 occurrences on the blank string, what is this?

# Maybe preserved specimens?
filter(reptiles, forest2013 == "") # Human observations and one preserved specimen

# Relabel the blank forest type as "Unknown"
reptiles <- mutate(
  reptiles,
  forest2013 = case_when(
    forest2013 == "" ~ "Unknown",
    TRUE ~ forest2013
  )
)

ggplot(data = reptiles, mapping = aes(x = forest2013)) +
  geom_bar() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = .5))
# Most species occurrences are on Eucalypt Medium Woodland

# Data resources
unique(reptiles$dataResourceName)
n_distinct(reptiles$dataResourceName) # 23 resources
table(reptiles$dataResourceName) # ACT Wildlife Atlas has the most occurrences

# Basis of record
unique(reptiles$basisOfRecord) # names are in snake format

# Replace underscores with spaces and title-case the labels
reptiles <- mutate(
  reptiles,
  basisOfRecord = str_to_title(
    gsub(pattern = "_", replacement = " ", x = basisOfRecord)
  )
)

ggplot(data = reptiles, mapping = aes(x = basisOfRecord)) +
  geom_bar() +
  theme_bw()
# Human observation has the most records, followed by unknown
# Check date range
range(reptiles$eventDate, na.rm = TRUE) # Records from 1954!

# For the visualisation I'll use the month.
# Create the month column as an ordered factor, Jan through Dec.
# The label is built from the numeric month and base R's English month.abb
# instead of month(label = TRUE): that function returns labels in the
# session's LC_TIME locale, which would not match the English levels (and
# would turn every value into NA) on a non-English system.
reptiles <- reptiles %>%
  mutate(
    month = factor(
      month.abb[month(eventDate)],
      levels = month.abb,
      ordered = TRUE
    )
  )

# Save the data; bivariate analysis is done as part of the visualisation task
save(reptiles, file = "reptiles.RData")