-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_cleaning.Rmd
132 lines (103 loc) · 4.25 KB
/
04_cleaning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
title: "04_cleaning"
author: "Astrid Elmann Hansen"
date: '2023-01-05'
output: html_document
---
```{r}
pacman::p_load(tidyverse)

# Load the three raw candidate tables.
ans <- read_csv("data/kandidat_answers.csv")
demo <- read_csv("data/kandidat_demo.csv")
elected <- read_csv("data/kandidat_elected.csv")

# read_csv() names an unnamed first column "...1". In `ans` that column is
# renamed to ID and used as the join key below; in the other two tables it is
# dropped.
ans <- ans %>% rename("ID" = "...1")
demo <- demo %>% select(-...1)
elected <- elected %>% select(-...1)

# Flag candidates whose ID appears in the elected table. `%in%` already
# returns a logical vector (never NA), so no ifelse() wrapper is needed.
demo$elected <- demo$ID %in% elected$ID

# Bring any remaining columns of `elected` into `demo`.
demo <- demo %>% full_join(elected, by = "ID")

# `elected` is not used after this point.
rm(elected)
```
## Participation and number of questions answered
```{r}
# Candidate ID plus every numeric answer column. The code below assumes these
# are exactly 25 question columns sitting in positions 2:26.
q <- ans %>%
  select(ID, where(is.numeric))

# A 0 marks a skipped question; recode it as missing.
q[q == 0] <- NA

# Count skipped questions over the question columns only, so the ID column and
# any later helper columns can never affect the result.
n_skipped <- rowSums(is.na(q[, 2:26]))

# A candidate participated in the test if at least one question was answered.
q$participated <- n_skipped < 25

# Number of questions answered.
q$no_answered <- 25 - n_skipped
```
## Measuring answering type
```{r}
# Number of answered questions that are "completely agree" or
# "completely disagree" (coded 1 and 5). Skipped questions are NA and are
# ignored by na.rm = TRUE.
q$sure <- rowSums(q[, 2:26] == 1 | q[, 2:26] == 5, na.rm = TRUE)

# Percentage of answered questions given an extreme answer. Scale to percent
# first and round last: round(x, 2) * 100 can leave floating-point artefacts
# in what is meant to be a whole number.
q$conviction <- round(100 * q$sure / q$no_answered)

# Candidates who answered nothing have 0/0 here; make that an explicit NA.
q$conviction[q$no_answered == 0] <- NA_real_

# Save the ID and the 25 cleaned answer columns.
q %>% select(1:26) %>% write_csv("data/answers.csv")
```
```{r}
# Keep only the per-candidate summary columns (drop the 25 raw answers and the
# intermediate `sure` count), then attach them to the demographic table.
q <- select(q, -(2:26), -sure)
demo <- full_join(demo, q, by = "ID")
```
## Number of notes to questions
```{r}
# Candidate ID plus every character column. The code below assumes these are
# exactly 25 note columns sitting in positions 2:26.
n <- ans %>%
  select(where(is.character))

# For some reason one candidate's notes contain only line breaks
# ("\n" or "\n\r\n"); treat them as missing. The row is looked up by ID rather
# than by a hard-coded position, so a change in row order cannot blank the
# wrong candidate.
blank_note_rows <- which(n$ID == "712-inger-stoejberg")
if (length(blank_note_rows) > 0) {
  n[blank_note_rows, 2:26] <- NA
}

# Number of notes written: 25 minus the number of missing note columns.
n$no_notes <- 25 - rowSums(is.na(n[, 2:26]))
```
## Average length of notes
```{r}
# Character length of every note, one column per question. Missing notes stay
# NA here and are skipped by na.rm = TRUE in the row sum, which is equivalent
# to counting them as length 0.
note_cols <- paste0("info_", 1:25)
note_lengths <- n %>%
  transmute(across(all_of(note_cols), str_length))

# Total number of characters each candidate wrote across all 25 notes.
n$total_note_len <- rowSums(note_lengths, na.rm = TRUE)

# Average note length, rounded to whole characters. Candidates with no notes
# give 0/0 (NaN), which is reported as 0.
n$avg_note_len <- round(n$total_note_len / n$no_notes, 0)
n$avg_note_len <- ifelse(is.na(n$avg_note_len), 0, n$avg_note_len)

# Save the ID and the 25 cleaned note columns.
n %>% select(1:26) %>% write_csv("data/notes.csv")
```
```{r}
# Reduce the notes table to its per-candidate summary columns and attach them
# to the demographic table.
n <- select(n, ID, no_notes, avg_note_len)
demo <- full_join(demo, n, by = "ID")
```
## Save df
```{r}
# Write the combined candidate table to disk.
write_csv(demo, "data/fv_22_kandidat_test.csv")
```