-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgroupby_joining_summarizing.R
189 lines (144 loc) Β· 5.45 KB
/
groupby_joining_summarizing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
library(tidyverse)
library(lubridate)
library(nycflights13)
############################### group_by basics ################################
# subset the data to look only at LGA and JFK
lga = flights %>%
mutate(date = paste0(year, '-', month, '-', day) %>% as_date()) %>%
dplyr::filter(origin == "LGA") %>%
sample_n(5) %>%
select(date, origin, dep_delay) %>%
mutate(was_delayed = as.numeric(dep_delay > 0))
jfk = flights %>%
mutate(date = paste0(year, '-', month, '-', day) %>% as_date()) %>%
dplyr::filter(origin == "JFK") %>%
sample_n(5) %>%
select(date, origin, dep_delay) %>%
mutate(was_delayed = as.numeric(dep_delay > 0))
# now we can run statistics on these airports separately
table(lga$was_delayed)
table(jfk$was_delayed)
mean(lga$dep_delay)
mean(jfk$dep_delay)
# there is a better way with group_by
df = flights %>%
mutate(
origin = parse_factor(flights$origin, levels = c('JFK', 'LGA', 'EWR')),
date = paste0(year, '-', month, '-', day) %>% as_date()
) %>%
select(date, origin, dep_delay) %>%
sample_n(900)
table(df$origin)
df = flights %>%
mutate(
origin = parse_factor(flights$origin, levels = c('JFK', 'LGA', 'EWR')),
date = paste0(year, '-', month, '-', day) %>% as_date()
) %>%
select(date, origin, dep_delay) %>%
group_by(origin)%>%
sample_n(300) %>%
ungroup()
table(df$origin)
# now we can perform operations by group
df %>%
group_by(origin) %>%
tally()
# an even simpler way
df %>%
count(origin)
# add the sort option
flights %>% count(origin, sort = T)
# we can group multiple variables
df %>%
group_by(origin, date) %>%
tally() # %>% ungroup() # can use ungroup to change it back to a normal tibble
# similarly
df %>% count(origin, date)
# we can add unique identifiers to our groups
df = df %>% group_by(origin) %>% sample_n(5) %>% ungroup()
df %>% group_by(origin) %>% mutate(group_index = 1:n()) #n() returns the number
# of rows inside of a tidyverse function
df %>% mutate(group_ids = group_indices(df, origin)) %>%
group_by(origin) %>% mutate(group_index = 1:n())
################################## summarize ###################################
df = flights %>% select(origin, dep_delay, arr_delay, distance, tailnum) %>%
mutate (origin = factor(origin))
# calculate summary statistics by group
df %>%
group_by(origin) %>%
summarise(mean_distance = mean(distance, na.rm = T))
df %>%
group_by(origin) %>%
summarise(
mean_distance = mean(distance, na.rm = T),
total_arr_delay = sum(arr_delay, na.rm = T))
# we can also use the _all, _if, and _at additions like with other dplyr
# functions with summarize
df %>% select(-tailnum) %>%
group_by(origin) %>%
summarize_all(mean, na.rm = T) %>%
rename_at(vars(dep_delay, arr_delay, distance), ~ paste0(., '_mean'))
# instead of using select to get rid of the character variable tailnum, we can
# use summarize_if
df %>%
group_by(origin) %>%
summarize_if(is.numeric, ~ mean(., na.rm = T))
# only specify certain variables
df %>%
group_by(origin) %>%
summarize_at(
vars(dep_delay, arr_delay),
funs(mean(., na.rm = T), var(., na.rm = T))
)
############################ joining data with dplyr ###########################
# set up 2 tibbles
tibble_1 = tibble(
age = c(10, 20, 30),
gender = factor(c('M', 'M', 'F'), levels = c('F', 'M')),
weight = c(200, 185, 125)
)
tibble_2 = tibble(
age = c(11, 22, 33),
income = c(10, 20, 30),
gender = factor(c('M', 'M', 'F'), levels = c('F', 'M')),
weight = c(200, 185, 125)
)
# we can stack these on top of one another, notice that the variables do not
# have to be in the same order or have the same exact variables, bind_rows()
# will fine in NA values in this case
bind_rows(tibble_1, tibble_2)
# we can also pass a single list filled with tibbles/data frames and bind_rows
# will stack every tibble in the list
tibble_list = list(tibble_1, tibble_2)
bind_rows(tibble_list)
# instead of stacking data frames on top of one another, we can also stack them
# next to each other with bind_cols()
bind_cols(tibble_1, tibble_2)
bind_cols(tibble_list)
# the data sets much have the same length or you will get an error
jobs = tibble(job = c('accountant', 'doctor', 'teacher', 'dentist'))
bind_cols(tibble_1, jobs)
################################# joins #######################################
# left join with dplyr = returns all rows from the left dataset and all
# column from both data sets. The columns will be matched by a common variable
states = tibble(state_name = state.name, state_abb = state.abb)
df = tibble(
state_name = sample(state.name, 100, replace = T),
zip_code = sample(10000:99999, 100, replace = T)
)
df
unique(df$state_name)
left_join(df, states)
left_join(df, states, by = 'state_name')
states <- states %>% rename(name_of_state = state_name)
left_join(df, states, by = c('state_name' = 'name_of_states'))
# right join works the same way but keeps all rows from the dataset on the right
# inner_join keeps all the data that exists in both dataframes
band_members
band_instruments
inner_join(band_members, band_instruments, by = 'name')
# mick exists in band_members but not in instruments
# keith exists in band instruments but not in members
# so, these are excluded
# full_join() basically keeps everything, with missing data as NA
band_members %>% full_join(band_instruments)