diff --git a/data/microplastics_in_blood.csv b/data/microplastics_in_blood.csv index 2ce039f7..d85d3642 100644 --- a/data/microplastics_in_blood.csv +++ b/data/microplastics_in_blood.csv @@ -6,8 +6,8 @@ A,Bisphenol A,3,-0.5 B,BPA,5,0.5 B,dioxin,8,0.5 A,bpa,6,2.5 -B,O,5,3.5 +B,Other,5,3.5 B,dioxin,2,0.5 A,bpa,1,1.5 B,BPA,7,1.5 -B,O,3,2.5 +B,Other,3,2.5 diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index b50ed6f6..3c697462 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -460,10 +460,9 @@ Need quotes for conditions and new values! ```{r} plastics %>% mutate(microplastic_recoded = case_when( - microplastic == "O" ~ "Other", microplastic == "Bisphenol A" ~ "BPA", microplastic == "bpa" ~ "BPA", - microplastic =="dioxin" ~"Dioxin")) %>% + microplastic == "dioxin" ~"Dioxin")) %>% count(microplastic, microplastic_recoded) ``` @@ -477,10 +476,9 @@ We didn't specify what happens to values that were already `Other` or `Dioxin` o ```{r, eval = FALSE} plastics %>% mutate(microplastic_recoded = case_when( - microplastic == "O" ~ "Other", microplastic == "Bisphenol A" ~ "BPA", microplastic == "bpa" ~ "BPA", - microplastic =="dioxin" ~"Dioxin")) %>% + microplastic == "dioxin" ~"Dioxin")) %>% count(microplastic, microplastic_recoded) ``` @@ -496,7 +494,7 @@ Note that automatically values not reassigned explicitly by {data_input} %>% mutate({variable_to_fix} = case_when({Variable_fixing} /some condition/ ~ {value_for_con}, - TRUE ~ {value_for_not_meeting_condition}) + .default = {value_for_not_meeting_condition}) # need this to avoid NAs ``` ::: @@ -504,31 +502,29 @@ Note that automatically values not reassigned explicitly by {value_for_not_meeting_condition} could be something new or it can be the original values of the column -## case_when with TRUE ~ original variable name +## case_when with .default = original variable name ```{r} plastics %>% mutate(microplastic_recoded = 
case_when( - microplastic == "O" ~ "Other", microplastic == "Bisphenol A" ~ "BPA", microplastic == "bpa" ~ "BPA", microplastic =="dioxin" ~"Dioxin", - TRUE ~ microplastic)) %>% + .default = microplastic)) %>% count(microplastic_recoded) ``` -## Typically it is good practice to include the TRUE statement +## Typically it is good practice to include the .default statement You never know if you might be missing something - and if a value already was an NA it will stay that way. ```{r, eval = FALSE} plastics %>% mutate(microplastic_recoded = case_when( - microplastic == "O" ~ "Other", microplastic == "Bisphenol A" ~ "BPA", microplastic == "bpa" ~ "BPA", - microplastic =="dioxin" ~"Dioxin", - TRUE ~ microplastic)) %>% + microplastic == "dioxin" ~"Dioxin", + .default = microplastic)) %>% count(microplastic, microplastic_recoded) ``` @@ -558,8 +554,7 @@ plastics %>% microplastic == "Bisphenol A" ~ "BPA", microplastic == "bpa" ~ "BPA", microplastic == "dioxin" ~"Dioxin", - microplastic == "O" ~ "Other", - TRUE ~ microplastic)) %>% + .default = microplastic)) %>% count(microplastic) ``` @@ -571,10 +566,9 @@ plastics %>% ```{r} plastics %>% mutate(microplastic_recoded = case_when( - microplastic %in% c("dioxin", "Dioxin") ~ "dioxin", + microplastic %in% c("dioxin", "Dioxin") ~ "Dioxin", microplastic %in% c("BPA", "bpa", "Bisphenol A") ~ "BPA", - microplastic %in% c("O", "Other") ~ "Other", - TRUE ~ microplastic)) %>% + .default = microplastic)) %>% count(microplastic, microplastic_recoded) @@ -606,6 +600,40 @@ head(plastics) plastics %>% count(Foods, Effect) ``` +## Note that if you change data classes this might impact .default + +```{r, eval = FALSE} +plastics <- plastics %>% + mutate(Effect = case_when( + blood_level_change_nM > 0 ~ "Increase", + blood_level_change_nM == 0 ~ "Same", + blood_level_change_nM < 0 ~ "Decrease", + .default = blood_level_change_nM)) +# this will give an error! 
+ +plastics <- plastics %>% + mutate(Effect = case_when( + blood_level_change_nM > 0 ~ "Increase", + blood_level_change_nM == 0 ~ "Same", + blood_level_change_nM < 0 ~ "Decrease", + .default = as.character(blood_level_change_nM))) +# this works! + + +``` + +## multiple conditions with `case_when` recoding + +```{r} +plastics %>% + mutate(Amt_change = case_when( + blood_level_change_nM > 0 & blood_level_change_nM < 2 ~ "Small", + blood_level_change_nM >= 2 ~ "Large", + blood_level_change_nM < 0 & blood_level_change_nM > -2 ~ "Small", + blood_level_change_nM <= -2 ~ "Large", + blood_level_change_nM == 0 ~ "none")) %>% + head() +``` ## GUT CHECK: we need to use what function with `case_when()` to modify or create a new variable? @@ -616,11 +644,11 @@ B. `select()` C. `mutate()` -## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `TRUE ~` statement? +## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `.default` statement? -A. With the name of the variable we are modifying or using as source +A. = the name of the variable we are modifying or using as source -B. With the word "same" +B. 
= "same" # Working with strings @@ -721,8 +749,7 @@ plastics %>% mutate(microplastic_recoded = case_when( microplastic %in% c("Dioxin", "dioxin") ~ "Dioxin", microplastic %in% c("BPA", "bpa", "Bisphenol A") ~ "BPA", - microplastic %in% c("O", "Other") ~ "Other", - TRUE ~ microplastic)) + .default = microplastic)) ``` ## `case_when()` improved with `stringr` @@ -733,9 +760,8 @@ plastics %>% plastics %>% mutate(microplastic_recoded = case_when( str_detect(string = microplastic, pattern = "^b|^B") ~ "BPA", - str_detect(string = microplastic, pattern = "^o|^O") ~ "Other", str_detect(string = microplastic, pattern = "^d|^D") ~ "Dioxin", - TRUE ~ microplastic)) %>% + .default = microplastic)) %>% count(microplastic, microplastic_recoded) ``` @@ -791,7 +817,7 @@ plastics_comb ## Summary - `case_when()` requires `mutate()` when working with dataframes/tibbles - `case_when()` can recode **entire values** based on **conditions** (need quotes for conditions and new values) - - remember `case_when()` needs `TRUE ~ varaible` to keep values that aren't specified by conditions, otherwise will be `NA` + - remember `case_when()` needs `.default = variable` to keep values that aren't specified by conditions, otherwise will be `NA` **Note:** you might see the `recode()` function, it only does some of what `case_when()` can do, so we skipped it, but it is in the extra slides at the end. 
@@ -827,6 +853,13 @@ Image by % + "dioxin" = "Dioxin") %>% count(microplastic, microplastic_recoded) ``` @@ -869,8 +901,7 @@ plastics %>% mutate(microplastic_recoded = recode(microplastic, "Bisphenol A" = "BPA", "bpa" = "BPA", - "dioxin" = "Dioxin", - "O" = "Other")) %>% + "dioxin" = "Dioxin")) %>% count(microplastic, microplastic_recoded) ``` @@ -884,12 +915,40 @@ plastics %>% mutate(microplastic = recode(microplastic, "Bisphenol A" = "BPA", "bpa" = "BPA", - "dioxin" = "Dioxin", - "O" = "Other")) %>% + "dioxin" = "Dioxin")) %>% count(microplastic) ``` +## More complicated case_when + +```{r} +ces <- read_csv(file = "https://daseh.org/data/CalEnviroScreen_data.csv") +set.seed(123) +ces %>% mutate(new_col_case_when = + case_when(Longitude < -121 & Latitude > 37.8 ~ "District A", + .default = "District B")) %>% + select(Longitude, Latitude, new_col_case_when) %>% + slice_sample(n = 6) +``` + +## Don't need `case_when()` if just calculating new variables + +```{r} +ces %>% mutate(num_col_mutate = Longitude * Latitude) %>% pull(num_col_mutate) + +``` + +## `case_when()` if you want NA values + +```{r} +ces %>% mutate(num_new = case_when( + Longitude < -121 & Latitude > 37.8 ~ Longitude * Latitude, + .default = NA)) %>% + pull(num_new) +``` + + ## String Splitting