04_cleaning.Rmd

---
title: "04_cleaning"
author: "Astrid Elmann Hansen"
date: '2023-01-05'
output: html_document
---

```{r}
pacman::p_load(tidyverse)

# Load the three raw candidate tables: questionnaire answers, demographics,
# and the list of elected candidates.
ans <- read_csv("data/kandidat_answers.csv")
demo <- read_csv("data/kandidat_demo.csv")
elected <- read_csv("data/kandidat_elected.csv")

# Each file has an unnamed first column, which read_csv names `...1`.
# In `ans` that column is used as the candidate ID; in the other two tables
# it is dropped.
ans <- ans %>% rename("ID" = "...1")
demo <- demo %>% select(-...1)
elected <- elected %>% select(-...1)

# Flag candidates whose ID appears in the elected table. `%in%` already
# returns a logical vector (never NA), so no ifelse() wrapper is needed.
demo$elected <- demo$ID %in% elected$ID

# Bring in the remaining columns of the elected table.
# NOTE(review): full_join keeps IDs that exist only in `elected`; such rows
# would have NA in the demo columns -- confirm this is intended.
demo <- demo %>% full_join(elected, by = "ID")

# The elected table is no longer needed once merged.
rm(elected)
```


## Participation and number of questions answered
```{r}
# Answers table: candidate ID plus every numeric (answer) column
q <- ans %>%
  select(ID, where(is.numeric))

# Question columns are everything except the ID. Deriving the count from the
# data avoids hard-coding the number of questions (25 in this questionnaire).
question_cols <- setdiff(names(q), "ID")
n_questions <- length(question_cols)

# A value of 0 marks a skipped question; recode it as NA
q[q == 0] <- NA

# Number of skipped questions per candidate
n_skipped <- rowSums(is.na(q[question_cols]))

# A candidate participated in the test unless every question was skipped
q$participated <- n_skipped != n_questions

# Number of questions answered
q$no_answered <- n_questions - n_skipped
```

## Measuring answering type
```{r}
# Answer columns (columns 2-26 of q; column 1 is the ID)
answers <- q[, 2:26]

# Number of answers at either extreme of the scale (1 or 5), i.e.
# "completely agree" or "completely disagree". Skipped questions (NA) are
# ignored via na.rm.
q$sure <- rowSums(answers == 1 | answers == 5, na.rm = TRUE)

# Percentage of answered questions that were at an extreme, as a whole number.
# Scale to percent *before* rounding: round(x, 2) * 100 leaves floating-point
# residue (e.g. 0.29 * 100 is 28.999999999999996), which ends up in the saved
# file. Candidates with no answers get 0 / 0 = NaN here.
q$conviction <- round(100 * q$sure / q$no_answered)

# Save the cleaned answers (ID + the 25 answer columns)
q %>% select(1:26) %>% write_csv("data/answers.csv")
```

```{r}
# Keep only the per-candidate summaries: drop the raw answer columns (2-26)
# and the intermediate `sure` count, then attach what is left to demo by ID.
q <- select(q, -(2:26), -sure)
demo <- full_join(demo, q, by = "ID")
```

## Number of notes to questions
```{r}
# Notes table: all character columns of the answers file
# (the candidate ID plus the free-text note columns)
n <- ans %>%
  select(where(is.character))

# Note columns are everything except the ID
note_cols <- setdiff(names(n), "ID")

# For some reason one candidate has notes containing only /n or /n/r/n
# (presumably stray newline characters), so they are treated as missing.
# The row is looked up by ID instead of a hard-coded row number, so the
# right candidate is blanked even if the row order of the input changes.
blank_rows <- which(n$ID == "712-inger-stoejberg")
n[blank_rows, note_cols] <- NA

# Number of notes written = number of non-missing note columns per candidate
n$no_notes <- rowSums(!is.na(n[note_cols]))
```

## Average length of notes
```{r}
# Total note length per candidate: the character count of each of the 25
# notes (info_1 ... info_25), summed across questions within each candidate.
# Missing notes contribute 0 because rowSums() drops NAs with na.rm = TRUE.
note_cols <- paste0("info_", 1:25)
note_lengths <- n %>%
  transmute(across(all_of(note_cols), str_length))
n$total_note_len <- unname(rowSums(note_lengths, na.rm = TRUE))

# Average note length in whole characters; candidates who wrote no notes get
# 0 instead of the NaN that 0 / 0 would otherwise produce.
n$avg_note_len <- ifelse(
  n$no_notes > 0,
  round(n$total_note_len / n$no_notes, 0),
  0
)

# Save the cleaned notes (ID + the 25 note columns)
n %>% select(1:26) %>% write_csv("data/notes.csv")
```

```{r}
# Reduce the notes table to the ID and the two note summaries,
# then merge those into demo by candidate ID.
n <- select(n, ID, no_notes, avg_note_len)
demo <- full_join(demo, n, by = "ID")
```

## Save df
```{r}
# Write the combined candidate table to disk
write_csv(demo, "data/fv_22_kandidat_test.csv")
```