Skip to content

Commit 833bf83

Browse files
authored
feat: add replace_missing_with_na parameter (#9)
* Add `replace_missing_with_na` parameter to `homogenize_panel()` to allow panelcleaner to create variables in the waves that are missing their raw variables. This is helpful during data collection when not all variables have had submissions but you want to keep the original panel mapping specification.
1 parent 7edc831 commit 833bf83

File tree

5 files changed

+139
-8
lines changed

5 files changed

+139
-8
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: panelcleaner
22
Title: An interactive interface to homogenize messy panel data into a long format
3-
Version: 0.0.4
3+
Version: 0.0.5
44
Authors@R:
55
c(person(
66
given = "Patrick",

NEWS.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# panelcleaner 0.0.5
2+
3+
* Add `replace_missing_with_na` parameter to `homogenize_panel()` to allow panelcleaner to create
4+
variables in the waves that are missing their raw variables. This is helpful during data collection
5+
when not all variables have had submissions but you want to keep the original panel mapping
6+
specification.
7+
18
# panelcleaner 0.0.4
29

310
* Fix bug where panelcleaner would not subset the variables to those present in the wave database when the user allows missing variables without stopping homogenization.

R/homogenization.R

+35-5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,27 @@
4242
#' is really only useful if you intend your data to be used in a
4343
#' [blueprintr](https://nyuglobalties.github.io/blueprintr) project.
4444
#'
45+
#' # Extra Parameters
46+
#'
47+
#' In some cases the default behavior of panelcleaner is too restrictive, especially during
48+
#' the beginning of data collection. Often, APIs or general data exports don't include
49+
#' variables that don't have any submissions yet, but you still want to keep those variables
50+
#' in your input data. These parameters lift some restrictions on panelcleaner's behavior:
51+
#'
52+
#' * `drop_na_homogenized`: If `TRUE`, any NA entries in the homogenized_name column will be
53+
#' ignored, as if the row in the panel mapping doesn't exist.
54+
#' * `ignored_missing_codings`: If `TRUE`, waves with NA codings but with non-NA homogenized
55+
#' codings will not have their values homogenized.
56+
#' * `ignored_missing_homogenized_codings`: If `TRUE`, any variables that have defined wave
57+
#' codings but no homogenized coding will not have their codings homogenized.
58+
#' * `error_missing_raw_variables`: If `FALSE`, raw variables that should be present in the
59+
#' data, given the panel mapping, but aren't will not throw an error. Instead, they'll be
60+
#' added to the list of [panelcleaner::issues()].
61+
#' * `replace_missing_with_na`: If `TRUE`, raw variables that should be present in the data,
62+
#' given the panel mapping, but are not will be created and filled with NA values. A message
63+
#' listing all the variables where this action was applied will be displayed. This value supersedes
64+
#' `error_missing_raw_variables`.
65+
#'
4566
#' @export
4667
homogenize_panel <- function(panel, mapping = NULL, ...) {
4768
tk_assert(is_unhomogenized_panel(panel))
@@ -299,6 +320,7 @@ homogenize_wave_descriptions <- function(panel, w, long_map, ctx = list()) {
299320

300321
homogenize_wave_names <- function(panel, w, long_map, ctx = list()) {
301322
error_missing_raw_variables <- ctx$error_missing_raw_variables %||% TRUE
323+
replace_missing_with_na <- ctx$replace_missing_with_na %||% FALSE
302324

303325
schema <- panel_mapping_schema(long_map)
304326

@@ -315,12 +337,20 @@ homogenize_wave_names <- function(panel, w, long_map, ctx = list()) {
315337

316338
if (any(!variables %in% names(wave_db))) {
317339
missing_vars <- long_map[!long_map[[schema$wave_name]] %in% names(wave_db), ][[schema$wave_name]]
340+
missing_msg <- c(
341+
"Some variables present in mapping for {ui_value(w)} are not in the data: [",
342+
glue_collapse(ui_value(missing_vars), ", "), "]"
343+
)
318344

319-
if (isTRUE(error_missing_raw_variables)) {
320-
tk_err(c(
321-
"Some variables present in mapping for {ui_value(w)} are not in the data: [",
322-
glue_collapse(ui_value(missing_vars), ", "), "]"
323-
))
345+
if (isTRUE(replace_missing_with_na)) {
346+
msg <- c(missing_msg, "\nThey have been created with missingness in anticipation of their eventual existence")
347+
message(paste0(msg, collapse = ""))
348+
349+
for (mv in missing_vars) {
350+
wave_db[[mv]] <- NA
351+
}
352+
} else if (isTRUE(error_missing_raw_variables)) {
353+
tk_err(missing_msg)
324354
} else {
325355
issue <- list(missing_vars)
326356
names(issue) <- glue("missing_raw_variables_{w}")

man/homogenize_panel.Rd

+22
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-homogenization.R

+74-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
context("Homogenization")
2-
31
test_that("Variable name homogenization works", {
42
ids_1 <- sample(1:500, 100)
53
ids_2 <- ids_1
@@ -374,3 +372,77 @@ test_that("Missing variables don't stop homogenization, if desired", {
374372
c("missing_raw_variables_t1", "missing_raw_variables_t2")
375373
)
376374
})
375+
376+
test_that("Missing variables can be created, if desired", {
377+
ids_1 <- sample(1:500, 100)
378+
ids_2 <- ids_1
379+
ids_2[sample(1:100, 10)] <- sample(500:1000, 10)
380+
381+
wave_1 <- data.frame(id = ids_1, time = 1, q1 = sample(1:5, 100, replace = TRUE), q2 = sample(0:1, 100, replace = TRUE), stringsAsFactors = FALSE)
382+
wave_2 <- data.frame(id = ids_2, time = 2, question1 = sample(1:5, 100, replace = TRUE), Q2 = sample(0:1, 100, replace = TRUE), stringsAsFactors = FALSE)
383+
384+
coding_1 <- bquote(
385+
coding(
386+
code("Never", 1),
387+
code("Rarely", 2),
388+
code("Sometimes", 3),
389+
code("Frequently", 4),
390+
code("Always", 5)
391+
)
392+
)
393+
394+
coding_2 <- bquote(
395+
coding(
396+
code("Never", 5),
397+
code("Rarely", 4),
398+
code("Sometimes", 3),
399+
code("Frequently", 2),
400+
code("Always", 1)
401+
)
402+
)
403+
404+
coding_h <- bquote(
405+
coding(
406+
code("Never", 1),
407+
code("Rarely", 2),
408+
code("Sometimes", 3),
409+
code("Frequently", 4),
410+
code("Always", 5)
411+
)
412+
)
413+
414+
single_deparse <- function(expr) {
415+
paste0(deparse(expr), collapse = "")
416+
}
417+
418+
mapping <- tibble::tribble(
419+
~name_t1, ~coding_t1, ~name_t2, ~coding_t2, ~panel_name, ~homogenized_name, ~homogenized_coding,
420+
"id", NA_character_, "id", NA_character_, "test_panel", "id", NA_character_,
421+
"time", NA_character_, "time", NA_character_, "test_panel", "time", NA_character_,
422+
"q1", NA_character_, "question1", NA_character_, "test_panel", "question_1", NA_character_,
423+
"q2", NA_character_, "Q2", NA_character_, "test_panel", "question_2", NA_character_,
424+
"q3", single_deparse(coding_1), "q3", single_deparse(coding_2), "test_panel", "question_3", single_deparse(coding_h)
425+
)
426+
427+
panel_map <- panel_mapping(
428+
mapping,
429+
c("t1", "t2"),
430+
.schema = list(
431+
wave_name = "name",
432+
wave_coding = "coding",
433+
panel = "panel_name",
434+
homogenized_name = "homogenized_name",
435+
homogenized_coding = "homogenized_coding"
436+
)
437+
)
438+
439+
panel <-
440+
enpanel("test_panel", t1 = wave_1, t2 = wave_2) %>%
441+
add_mapping(panel_map)
442+
443+
# replace_missing_with_na overrides error_missing_raw_variables
444+
homogenized_panel <- expect_no_error(homogenize_panel(panel, replace_missing_with_na = TRUE))
445+
446+
expect_true(all(is.na(wave(homogenized_panel, "t1")$question_3)))
447+
expect_true(all(is.na(wave(homogenized_panel, "t2")$question_3)))
448+
})

0 commit comments

Comments
 (0)