Skip to content

Commit

Permalink
Merge pull request #269 from fhdsl/cleaning_improvement
Browse files Browse the repository at this point in the history
Cleaning improvement
  • Loading branch information
carriewright11 authored Jan 23, 2025
2 parents ba36a5b + 0853a73 commit 0e19b9b
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 33 deletions.
4 changes: 2 additions & 2 deletions data/microplastics_in_blood.csv
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ A,Bisphenol A,3,-0.5
B,BPA,5,0.5
B,dioxin,8,0.5
A,bpa,6,2.5
B,O,5,3.5
B,Other,5,3.5
B,dioxin,2,0.5
A,bpa,1,1.5
B,BPA,7,1.5
B,O,3,2.5
B,Other,3,2.5
121 changes: 90 additions & 31 deletions modules/Data_Cleaning/Data_Cleaning.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,9 @@ Need quotes for conditions and new values!
```{r}
plastics %>%
mutate(microplastic_recoded = case_when(
microplastic == "O" ~ "Other",
microplastic == "Bisphenol A" ~ "BPA",
microplastic == "bpa" ~ "BPA",
microplastic =="dioxin" ~"Dioxin")) %>%
microplastic == "dioxin" ~"Dioxin")) %>%
count(microplastic, microplastic_recoded)
```
Expand All @@ -477,10 +476,9 @@ We didn't specify what happens to values that were already `Other` or `Dioxin` o
```{r, eval = FALSE}
plastics %>%
mutate(microplastic_recoded = case_when(
microplastic == "O" ~ "Other",
microplastic == "Bisphenol A" ~ "BPA",
microplastic == "bpa" ~ "BPA",
microplastic =="dioxin" ~"Dioxin")) %>%
microplastic == "dioxin" ~"Dioxin")) %>%
count(microplastic, microplastic_recoded)
```

Expand All @@ -496,39 +494,37 @@ Note that automatically values not reassigned explicitly by
{data_input} %>%
mutate({variable_to_fix} = case_when({Variable_fixing}
/some condition/ ~ {value_for_con},
TRUE ~ {value_for_not_meeting_condition})
.default = {value_for_not_meeting_condition}) # need this to avoid NAs
```
:::

{value_for_not_meeting_condition} could be something new
or it can be the original values of the column

## case_when with TRUE ~ original variable name
## case_when with .default = original variable name

```{r}
plastics %>%
mutate(microplastic_recoded = case_when(
microplastic == "O" ~ "Other",
microplastic == "Bisphenol A" ~ "BPA",
microplastic == "bpa" ~ "BPA",
microplastic =="dioxin" ~"Dioxin",
TRUE ~ microplastic)) %>%
.default = microplastic)) %>%
count(microplastic_recoded)
```

## Typically it is good practice to include the TRUE statement
## Typically it is good practice to include the .default statement

You never know if you might be missing something - and if a value already was an NA it will stay that way.

```{r, eval = FALSE}
plastics %>%
mutate(microplastic_recoded = case_when(
microplastic == "O" ~ "Other",
microplastic == "Bisphenol A" ~ "BPA",
microplastic == "bpa" ~ "BPA",
microplastic =="dioxin" ~"Dioxin",
TRUE ~ microplastic)) %>%
microplastic == "dioxin" ~"Dioxin",
.default = microplastic)) %>%
count(microplastic, microplastic_recoded)
```

Expand Down Expand Up @@ -558,8 +554,7 @@ plastics %>%
microplastic == "Bisphenol A" ~ "BPA",
microplastic == "bpa" ~ "BPA",
microplastic == "dioxin" ~"Dioxin",
microplastic == "O" ~ "Other",
TRUE ~ microplastic)) %>%
.default = microplastic)) %>%
count(microplastic)
```
Expand All @@ -571,10 +566,9 @@ plastics %>%
```{r}
plastics %>%
mutate(microplastic_recoded = case_when(
microplastic %in% c("dioxin", "Dioxin") ~ "dioxin",
microplastic %in% c("dioxin", "Dioxin") ~ "Dioxin",
microplastic %in% c("BPA", "bpa", "Bisphenol A") ~ "BPA",
microplastic %in% c("O", "Other") ~ "Other",
TRUE ~ microplastic)) %>%
.default = microplastic)) %>%
count(microplastic, microplastic_recoded)
Expand Down Expand Up @@ -606,6 +600,40 @@ head(plastics)
plastics %>%
count(Foods, Effect)
```
## Note that if you change data classes this might impact .default

```{r, eval = FALSE}
# Attempt 1: the three conditions produce character labels, while the
# fallback hands back the numeric column unchanged, so the classes differ.
plastics <- plastics %>%
  mutate(
    Effect = case_when(
      blood_level_change_nM > 0 ~ "Increase",
      blood_level_change_nM == 0 ~ "Same",
      blood_level_change_nM < 0 ~ "Decrease",
      .default = blood_level_change_nM
    )
  )
# this will give an error!

# Attempt 2: convert the fallback to character so it matches the labels.
plastics <- plastics %>%
  mutate(
    Effect = case_when(
      blood_level_change_nM > 0 ~ "Increase",
      blood_level_change_nM == 0 ~ "Same",
      blood_level_change_nM < 0 ~ "Decrease",
      .default = as.character(blood_level_change_nM)
    )
  )
# this works!
```

## multiple conditions with `case_when` recoding

```{r}
# Label the size of each blood-level change; `&` joins two tests so that a
# row must satisfy both of them to receive the label.
plastics %>%
  mutate(
    Amt_change = case_when(
      blood_level_change_nM == 0 ~ "none",
      blood_level_change_nM > -2 & blood_level_change_nM < 0 ~ "Small",
      blood_level_change_nM > 0 & blood_level_change_nM < 2 ~ "Small",
      blood_level_change_nM <= -2 ~ "Large",
      blood_level_change_nM >= 2 ~ "Large"
    )
  ) %>%
  head()
```

## GUT CHECK: we need to use what function with `case_when()` to modify or create a new variable?

Expand All @@ -616,11 +644,11 @@ B. `select()`
C. `mutate()`


## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `TRUE ~` statement?
## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `.default` statement?

A. With the name of the variable we are modifying or using as source
A. = the name of the variable we are modifying or using as source

B. With the word "same"
B. = "same"


# Working with strings
Expand Down Expand Up @@ -721,8 +749,7 @@ plastics %>%
mutate(microplastic_recoded = case_when(
microplastic %in% c("Dioxin", "dioxin") ~ "Dioxin",
microplastic %in% c("BPA", "bpa", "Bisphenol A") ~ "BPA",
microplastic %in% c("O", "Other") ~ "Other",
TRUE ~ microplastic))
.default = microplastic))
```

## `case_when()` improved with `stringr`
Expand All @@ -733,9 +760,8 @@ plastics %>%
plastics %>%
mutate(microplastic_recoded = case_when(
str_detect(string = microplastic, pattern = "^b|^B") ~ "BPA",
str_detect(string = microplastic, pattern = "^o|^O") ~ "Other",
str_detect(string = microplastic, pattern = "^d|^D") ~ "Dioxin",
TRUE ~ microplastic)) %>%
.default = microplastic)) %>%
count(microplastic, microplastic_recoded)
```

Expand Down Expand Up @@ -791,7 +817,7 @@ plastics_comb
## Summary
- `case_when()` requires `mutate()` when working with dataframes/tibbles
- `case_when()` can recode **entire values** based on **conditions** (need quotes for conditions and new values)
- remember `case_when()` needs `TRUE ~ varaible` to keep values that aren't specified by conditions, otherwise will be `NA`
- remember `case_when()` needs `.default = variable` to keep values that aren't specified by conditions; otherwise they will be `NA`

**Note:** you might see the `recode()` function, it only does some of what `case_when()` can do, so we skipped it, but it is in the extra slides at the end.

Expand Down Expand Up @@ -827,6 +853,13 @@ Image by <a href="https://pixabay.com/users/geralt-9301/?utm_source=link-attribu

# Extra Slides

## `n_complete_row()` evaluating how many columns are complete for each row

```{r}
# Show the first rows, then the number of complete values in each of them.
plastics_comb %>% head()
plastics_comb %>%
  head() %>%
  n_complete_row()
```

## `recode()` function

This is similar to `case_when()` but it can't do as much.
Expand Down Expand Up @@ -856,8 +889,7 @@ plastics %>%
mutate(microplastic_recoded = recode(microplastic,
"Bisphenol A" = "BPA",
"bpa" = "BPA",
"dioxin" = "Dioxin",
"O" = "Other")) %>%
"dioxin" = "Dioxin")) %>%
count(microplastic, microplastic_recoded)
```

Expand All @@ -869,8 +901,7 @@ plastics %>%
mutate(microplastic_recoded = recode(microplastic,
"Bisphenol A" = "BPA",
"bpa" = "BPA",
"dioxin" = "Dioxin",
"O" = "Other")) %>%
"dioxin" = "Dioxin")) %>%
count(microplastic, microplastic_recoded)
```

Expand All @@ -884,12 +915,40 @@ plastics %>%
mutate(microplastic = recode(microplastic,
"Bisphenol A" = "BPA",
"bpa" = "BPA",
"dioxin" = "Dioxin",
"O" = "Other")) %>%
"dioxin" = "Dioxin")) %>%
count(microplastic)
```


## More complicated case_when

```{r}
# Load the CalEnviroScreen data; `ces` is reused by the chunks that follow.
ces <- read_csv(file = "https://daseh.org/data/CalEnviroScreen_data.csv")

# Fix the random seed so slice_sample() shows the same six rows every time.
set.seed(123)

# Two conditions joined with `&` decide the label: rows meeting both are
# "District A"; every other row falls through to `.default`.
ces %>%
  mutate(
    new_col_case_when = case_when(
      Longitude < -121 & Latitude > 37.8 ~ "District A",
      .default = "District B"
    )
  ) %>%
  select(Longitude, Latitude, new_col_case_when) %>%
  slice_sample(n = 6)
```

## Don't need `case_when()` if just calculating new variables

```{r}
# A plain calculation applies to every row, so mutate() alone is enough.
ces %>%
  mutate(num_col_mutate = Longitude * Latitude) %>%
  pull(num_col_mutate)
```

## `case_when()` if you want NA values

```{r}
# Compute the product only for rows that meet both conditions; all other rows
# get NA. `.default` must be spelled correctly and sit inside case_when() --
# placed in mutate() it would instead create an extra column.
ces %>%
  mutate(
    num_new = case_when(
      Longitude < -121 & Latitude > 37.8 ~ Longitude * Latitude,
      .default = NA
    )
  ) %>%
  pull(num_new)
```



## String Splitting

Expand Down

0 comments on commit 0e19b9b

Please sign in to comment.