-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodebook_Creation_Code.Rmd
476 lines (386 loc) · 23.1 KB
/
Codebook_Creation_Code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
---
title: <%= format(TITLE) %>
author: "NYU GLOBAL TIES FOR CHILDREN"
date: <%= format(Sys.time(), "%d %B, %Y") %>
output:
word_document:
toc: yes
toc_depth: 1
reference_docx: <%= format(FORMAT.PATH) %>
---
```{r setup, message=FALSE,echo=FALSE,results='hide'}
## Environment variables
Sys.getenv("MASTERCODEBOOK","ASSESSMENTS","FINALNAMES")
Sys.getenv("ITEMDES")
Sys.getenv("CODING","SUBGROUP","SUBDESCRIPTION")
Sys.getenv("LEAVEOUT","ADESCRIPTION","DROPCASE")
Sys.getenv("DATABASE")
Sys.getenv("PRESENCE.COL")
Sys.getenv("GROUP")
doc.type <- knitr::opts_knit$get('rmarkdown.pandoc.to')
```
```{r loadLibraries, echo=FALSE,message=FALSE,warning=FALSE}
## Make sure that all packages that will be needed for the code to run are installed in this computer and loaded:
check.and.install <- function(p){
needed.packages <- p[!(p %in% installed.packages()[,"Package"])]
## Install missing packages
if(length(needed.packages)){ install.packages(needed.packages)}
## Load all packages
for (i in 1:length(p)){
eval(parse(text=paste0("library(",p[i],")")))
}
}
p <- c("knitr","ggplot2","ggthemes","gridExtra","pander","crayon", "data.table", "digest")
check.and.install(p)
panderOptions('knitr.auto.asis', FALSE)
"%not_in%" <- function(x,table)match(x,table,nomatch = 0)==0
```
```{r, results='asis', fig.align="center", echo=FALSE,warning=FALSE, error = FALSE, fig.width=12,message=FALSE}
# Define a helper function that can redirect output for debugging
cat2 <- function(..., output_pipe = stderr()) {
if (!is.null(output_pipe)) cat(..., file = output_pipe, append = TRUE)
invisible(cat(...))
}
if(is.null(CODING)){
stop("Codebook column \"Coding\" not found", call. = FALSE)
}else{
if((!is.null(CODING) & (CODING %not_in% names(MASTERCODEBOOK)))){
stop("Codebook column \"Coding\" not found", call. = FALSE)
}
}
## Delete spaces in the edges
MASTERCODEBOOK$IndexSections <- gsub("^[[:space:]]+|[[:space:]]+$|^[[:space:]]|[[:space:]]$","",MASTERCODEBOOK$IndexSections)
## If, for whatever reason, whitespace in the coding column hasn't be converted to NA, do so.
MASTERCODEBOOK[!is.na(MASTERCODEBOOK[[CODING]]) & grepl("^\\s*$", MASTERCODEBOOK[[CODING]]), CODING] <- NA
## Make sure we're only looking for rows that actually correspond to existing variables in the final database
MASTERCODEBOOK <- MASTERCODEBOOK[!is.na(MASTERCODEBOOK[,FINALNAMES]),]
MASTERCODEBOOK <- MASTERCODEBOOK[!grepl("^$",MASTERCODEBOOK[,FINALNAMES]),]
## The DROPCASE indicator can also help make sure we're not printing out any variables that didn't end up in the database
if(!is.null(DROPCASE)){
MASTERCODEBOOK <- MASTERCODEBOOK[!grepl("^Drop",MASTERCODEBOOK[,DROPCASE],ignore.case = T),]
}
## The PRESENCE.COL column helps the function determine which items from the mapping to include in codebooks meant for just ONE time-point
## AND if a name of the element is provided, it is used for plotting (the name should match the wave tag of the items)
if(!is.null(PRESENCE.COL)){
MASTERCODEBOOK <- MASTERCODEBOOK[!is.na(MASTERCODEBOOK[,PRESENCE.COL]),]
MASTERCODEBOOK <- MASTERCODEBOOK[!grepl("^$",MASTERCODEBOOK[,PRESENCE.COL]),]
if(!is.null(names(PRESENCE.COL))){
wave_tag <- names(PRESENCE.COL)
}else{if(!is.null(DATABASE)){
print("If you want plots and descriptives, you need to specify the wave tag of the items that you want in your codebook, as it appears in the DATABASE you provided. Otherwise, don't specify a DATABASE.")
}
}
}
if(!is.null(LEAVEOUT)){ ## ignore the items that are specific to a different year/codebook
MASTERCODEBOOK <- MASTERCODEBOOK[!grepl(paste0(LEAVEOUT, collapse = "|"), MASTERCODEBOOK[, grepl("leaveout", names(MASTERCODEBOOK), ignore.case = TRUE)]), ]
}
## Subset the master codebook only to the assessments of interest
ASSESSMENTS <- unique(MASTERCODEBOOK$IndexSections[grepl(paste0(ASSESSMENTS,collapse = "|"),MASTERCODEBOOK$IndexSections)])
MASTERCODEBOOK <- MASTERCODEBOOK[MASTERCODEBOOK$IndexSections %in% ASSESSMENTS,]
## Create an 0 variable that will be filled in on each loop with an indicator for whether to plot a new table or not
MASTERCODEBOOK$DONTPLOT <- rep(0,nrow(MASTERCODEBOOK))
########################################################################
## Loop over the different groups of variables that you require a codebook for
########################################################################
for(g in ASSESSMENTS){
## Print out a page break and then the group name
cat2("\n")
cat2(paste0("###### Page Break"))
cat2("\n")
cat2("_____")
cat2("\n")
cat2(paste0("# ",g))
cat2("\n")
cat2("_____")
cat2("\n")
## Subset the codebook again and keep only items for the group in the loop
SUBSET <- MASTERCODEBOOK[MASTERCODEBOOK$IndexSections==g,]
SUBSET <- SUBSET[!duplicated(SUBSET[[FINALNAMES]]), ]
n <- nrow(SUBSET)
## If a Section description exists, then print it out
if(!is.null(ADESCRIPTION)){
if(!is.na(na.exclude(SUBSET[,ADESCRIPTION])[1])){
cat2(paste0(na.exclude(SUBSET[,ADESCRIPTION])[1]))
cat2("\n")
}
}
if (n < 1L) {
cat(paste0("##### 🤯Requested assessment '", g, "' not found in codebook. Please check that the names you entered in the function match the name in your spreadsheet 'IndexSections' column. Skipping...\n"), sep = "")
next
}
#####################################################################
## Loop over each variable that belongs to that assessment (group/section)
#####################################################################
for(v in 1:n){
## Define an empty bullet object that will be used later
bullet <- ""
####################################################################
## If the item belongs to a subgroup of items that share the *same coding*
is.sub <- !is.na(SUBSET[v,SUBGROUP]) & !grepl("^$|^[[:space:]]+$",SUBSET[v,SUBGROUP])
##then print the name of the subgroup
####################################################################
if(is.sub){
## Print out the subgroup name, as long as it's not identical to the group/section name
if(SUBSET[v,SUBGROUP]!= g && SUBSET$DONTPLOT[v] == 0) {
cat2("\n")
cat2(paste0("## ",SUBSET[v,SUBGROUP]))
cat2("\n")
## If the subgroup note exists, print it out
if(!is.null(SUBSET$SUB_NOTE)){
cat2("\n")
cat2(paste0("### ",SUBSET$SUB_NOTE[v]))
cat2("\n")
}
}
## We want to print out the items of the subgroup as a bullet list,
## For which we create an object that will result in a bullet later on
bullet <- "+ "
}else{
## If it doesn't belong to a subgroup
bullet <- "###"
# ## Print out the variable name
# cat2("\n")
# cat2(paste0("###",SUBSET[v,FINALNAMES],": "))
# cat2("\n")
# ## Print out the question or description of the item
# cat2(paste0("*",SUBSET[v,ITEMDES],"*"))
# cat2("\n")
}
####################################################################
## If the variable is categorical
####################################################################
is.cat <- SUBSET$Type[v] %in%
c("binary","ordered-categorical","unordered-categorical") & ## If the item is categorical
SUBSET$DONTPLOT[v]==0 ## and the table hasn't been already plotted for the items of that group
####################################################################
## Extract the item names that will get printed out later
####################################################################
## Reset the names object
#names <- NULL
if(is.sub){
## Extract the names of the variables of the present subgroup
names <- SUBSET[SUBSET[,SUBGROUP]==SUBSET[v,SUBGROUP],FINALNAMES]
}else{
## Extract the name of the variable of the present iteration
names <- SUBSET[v,FINALNAMES]
}
########################################################################################################
## DEAL WITH EQUIVALENCES BETWEEN NAMES IN THE SPREADSHEET AND NAMES MODIFIED BY THE CLEANING FUNCTION
########################################################################################################
### If a database was provided
if(!is.null(DATABASE) & SUBSET$DONTPLOT[v]==0 ){
names.in.data <- names
## Which one isn't there
not.there <- names.in.data[! (names.in.data %in% names(DATABASE))]
### If we are looking for a single wave in a mapping file, we'll have to add the wave suffixes, as the cleaning function does...
if(length(not.there)>0){
## They may be there but with corresponding wave tags
wave <- grepl(paste0(paste0("^",not.there,wave_tag,"$"),collapse = "|"), names(DATABASE))
if(any(wave)){
roots <- gsub(paste0(wave_tag,"$"),"",names(DATABASE)[wave])
for(r in 1:length(roots)){
names.in.data[names.in.data==roots[r]] <- names(DATABASE)[wave][grepl(roots[r],names(DATABASE)[wave])]
} ## End of loop
}
}
not.there <- names.in.data[! (names.in.data %in% names(DATABASE))]
## OR They may be there but with corresponding GROUP tags
if(!is.null(GROUP)){
group.tag <- SUBSET[1,GROUP]
if(length(not.there)>0){
group.and.wave <- grepl(paste0(paste0(not.there,wave_tag),collapse = "|"), names(DATABASE)) & grepl(group.tag,names(DATABASE))
if(any(group.and.wave) ){
roots <- gsub(paste0(wave_tag,"|_",group.tag),"",names(DATABASE)[group.and.wave])
for(r in 1:length(roots)){
names.in.data[names.in.data==roots[r]] <- names(DATABASE)[group.and.wave][grepl(roots[r],names(DATABASE)[group.and.wave])]
} ## End of loop
}
}
} ## End GROUP condition
## Once more, update the mising items' object
not.there <- names.in.data[! (names.in.data %in% names(DATABASE))]
## Anything missing still?
if(length(not.there)>0 & SUBSET$DONTPLOT[v]==0){
### Variables are not found in database with or without the wave tag
### Print error if a variable is missing
cat("##### 🧐 Please check for inconsistencies between the database and the CSV codebook:")
cat2("\n")
cat(paste0("##### The variable ",not.there," is not in the database provided."))
cat2("\n")
}
} ## No database
########################################################################################################
########################################################################################################
## MOVE ON.....
## DEAL WITH PLOTTING:
########################################################################################################
########################################################################################################
## If a database is provided AND the item is continuous or ordered categorical,
## then a descriptive table and a plot can be created too
########################################################################################################
if (!is.null(DATABASE)) {
is.plottable <- length(DATABASE)>0 & grepl("proportion|continuous|count|^ordered",SUBSET$Type[v],ignore.case = T) &
SUBSET$DONTPLOT[v]==0 ## If it has already been plotted then it's not considered plottable anymore
} else {
is.plottable <- FALSE
}
####################################################################
if(is.plottable){
### If the item belonged to a subgroup then we also want to place all descriptives and plots together
## But if the full subgroup has already been plotted we don't want to plot or tab
## Create a table large enough for all items of the subgroup
DES <- matrix(NA,length(names),5)
colnames(DES) <-c("min","mean","median","sd","max")
P <- list()
###############################################################################################################
###############################################################################################################
if(length(names)!=length(names.in.data)){stop("Wait, DEBUG: object 'names' and 'names in data' aren't the same length",call. = T)}
for (n in 1:length(names)) { ## Loop over elements of a subgroup
if (names.in.data[n]=="AGE") {
next
}
## Print out the variable names as a list (with bullets)
####################################################################
cat2("\n")
cat2(paste0(bullet," **",SUBSET[SUBSET[,FINALNAMES]==names[n],FINALNAMES],"** : "))
## Print out the question or description of the item
####################################################################
cat2(paste0("*",SUBSET[SUBSET[,FINALNAMES]==names[n], ITEMDES ],"*"))
cat2("\n")
### DESCRIPTIVES
DES[n,1] <- round(min(DATABASE[,names.in.data[n]],na.rm=T), digits = 2)
DES[n,2] <- round(mean(DATABASE[,names.in.data[n]],na.rm=T), digits = 2)
DES[n,3] <- round(median(DATABASE[,names.in.data[n]],na.rm=T), digits = 2)
DES[n,4] <- round(sd(DATABASE[,names.in.data[n]],na.rm=T), digits = 2)
DES[n,5] <- round(max(DATABASE[,names.in.data[n]],na.rm=T), digits = 2)
### HISTOGRAM
### Create a space for all the plots for a determined subgroup
## First determine an appropriate ratio for the figures based on their ranges
max.freq <- max(table(DATABASE[,names.in.data[n]]))
plot.ratio <- abs(DES[n,5]- DES[n,1])/max.freq
P[[n]] <- ggplot(DATABASE, aes_string(x=names.in.data[n])) +
geom_histogram(color="purple", fill="white") +
theme_tufte(base_size = 9, base_family = "serif", ticks = TRUE) +
coord_fixed(ratio=plot.ratio)
} ## closing n loop
rownames(DES) <- names
## Print descriptives
cat2("\n")
cat2("### Variable summary")
cat2("\n")
pander::pandoc.table(DES,keep.line.breaks=T,style = "grid")
## Print the histograms
cat2("\n")
library(gtable)
library(grid)
## Set the plot organization and sizing for multiple plots
plot.cols <- 3
plot.rows <- ceiling(length(P)/plot.cols)
## If the number of entries in the grob table is larger than the number of plots in P
## then fill the remaining spaces with empty plots
empty.slots <- plot.cols*plot.rows-length(P)
if( empty.slots>0){
for(i in seq(length(P)+1,plot.cols*plot.rows,1)){
P[[i]] <- ggplot(DATABASE, aes_string(x=names.in.data[n])) +
geom_blank()+ theme(axis.text.x=element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.ticks.x=element_blank())+
xlab("") + coord_fixed(ratio=plot.ratio)
}
}
pl <- lapply(P, ggplotGrob)
ROWS <-list()
for( i in 1:plot.rows){
## Combine all the plots of the first row into one image
ROWS[[i]] <- do.call(cbind,c(pl[(plot.cols*(i-1)+1):min(length(P),(plot.cols*i))],size="first"))
## Specify the height of the resulting image based on the height of the first plot
ROWS[[i]]$heights <- eval(parse(text=paste0("grid::unit.pmax(",paste0(rep("pl[[1]][['heights']]",plot.cols),
collapse = ","),")")))
}
## Combine all the "rows" of plots created in the loop into one big plot, IF there is more than one row of plots
if(plot.rows>1){
PL <- do.call(rbind,c(ROWS,size="first"))
## Specify the width of the resulting image based on the width of the first plot
PL$widths <- eval(parse(text=paste0("grid::unit.pmax(",paste0(rep("ROWS[[1]][['widths']]",plot.rows),
collapse = ","),")")))
}else{PL <- ROWS[[1]]}
grid.newpage()
grid.layout(ncol=1,heights = plot.rows*2)
#do.call("grid.draw",P)
## Print out the plot
#do.call("grid.arrange",c(P,heights=rep(2,length(P))))
grid.draw(PL)
cat2("\n")
} else { ## If the variable is NOT plottable and
####################################################################
## Belongs OR doesn't belong to a subgroup
if(SUBSET$DONTPLOT[v] == 0){
for(xn in 1:length(names)){ ## Loop over elements of a subgroup (xn can be 1 here)
## Print out the variable names as a list (with bullets)
####################################################################
cat2("\n")
cat2(paste0(bullet," **",SUBSET[SUBSET[[FINALNAMES]] == names[xn],FINALNAMES],"** : "))
## Print out the question or description of the item
####################################################################
cat2(paste0("*",SUBSET[SUBSET[[FINALNAMES]] == names[xn], ITEMDES],"*"))
cat2("\n")
} ## Closing n loop
if(is.cat && ## If the item is categorical
!is.na(SUBSET[v,CODING]) && !grepl("^$|^[[:space:]]$",SUBSET[v,CODING]) ## and values & labels were provided in the CSV file
){
## Then, print out a table with the coding for each category
pander::set.caption("",permanent = T)
cat2("\n")
cat2("### Variable coding")
cat2("\n")
# If this is a subgroup BUT not all the codings are the same, export the individual codings
if (is.sub) {
current_subgroup <- SUBSET[v, SUBGROUP]
sgdt <- data.table::as.data.table(SUBSET[SUBSET[[SUBGROUP]] == current_subgroup, ])
sgdt[, coding_hash := apply(.SD, 1L, function(x) digest::digest(eval(parse(text = as.character(x))))), .SDcols = CODING]
if (sgdt[, length(unique(coding_hash))] == 1L) {
pander::pandoc.table(eval(parse(text=as.character(SUBSET[v,CODING]))),
col.names=c("VALUE","LABEL"),keep.line.breaks=T,style = "grid")
} else {
unique_codes <- sgdt[!is.na(sgdt[[CODING]]) | grepl("^\\s*$", sgdt[[CODING]]), unique(coding_hash)]
for (uc in unique_codes) {
cat("For the following variables:\n\n ", sgdt[coding_hash == uc, paste(paste0("+ ", .SD[[FINALNAMES]][!is.na(.SD[[FINALNAMES]])]), collapse = "\n")])
cat("\n")
outcode <- sgdt[uc, on = "coding_hash", mult = "first"][[CODING]]
if (!(is.character(outcode) || length(outcode) == 1L)) {
browser()
}
cat("\n\n")
pander::pandoc.table(eval(parse(text = outcode)),
col.names=c("VALUE","LABEL"),keep.line.breaks=T,style = "grid")
cat("\n\n")
}
}
} else {
cat("\n\n")
pander::pandoc.table(eval(parse(text=as.character(SUBSET[v,CODING]))),
col.names=c("VALUE","LABEL"),keep.line.breaks=T,style = "grid")
}
cat2("\n\n")
}else{ ## If it was a categorical item but no coding was specified
if(is.cat){
## Then print a note:
cat2("The coding for this variable was not specified in the CSV input file")
cat2("\n")
}
}
cat2("\n")
} ##Closing conditional statement for items of a group that has not been printed yet.
} ## Finished printing names and tables for everything that is NOT prottable
## Then, if the item belonged to a subgroup that shared the same scale/coding, let's make sure that
## the table doesn't get printed for the next items of that group, by switching the DONTPLOT indicator
SUBSET$DONTPLOT[is.sub & ## If a subgroup was specified for those variables
grepl(SUBSET[v,SUBGROUP], SUBSET[,SUBGROUP], fixed = TRUE)] <- 1 ## and if the Subgroup is the same as the variable
## of the current iteration.
cat2("\n")
} ## Closing loop for variables v within a group g
} ## Closing loop for groups of variables g
```