-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStandard Verification Function One Set.R
295 lines (242 loc) · 13.5 KB
/
Standard Verification Function One Set.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
########## STANDARD VERIFICATION FUNCTION WHEN ONLY ONE ASSESSMENT IS AVAILABLE ########
###################################################
###### **Note: ID variables should be standardized
###### before applying this function, (you can use
###### the basic cleaning function, if your ID variables contain "ID)**
###################################################
###################################################
########## Exports basic problematic cases into excel files for manual verification
########## It works with a simple comparison against one reference file (e.g. Masterlist, Randomization file, etc)
########## Exports duplicate IDs
########## Exports cases where IDs don't match the standard ID form
########## DATA: Database that need to be verified
########## REFERENCE: Administrtive database, previously wave of data collection, or any other file that can be used as a reference for verification.
########## ID: ID variable name used
################### e.g. ID="ST_ID"
########## IDPATT: ID format structure. Specified by the user as a list, where the "Fixed" portion of the IDs is indicated between quotes, if there's any
################### subset of characters that may change by region it should be specified in the "Changing" portion, the range of digits that are are allowed in
################### in the IDs (if 3 to 4 digits allowed then 3:4, if only 4 digits allowed then 4, and the structure of the ID is indicated in quotes,
################### with X representing the fixed portion of the ID, C the changing characters and N the position of the numbers in relation to the letter portions.
################### E.g. if IDs are of the form AREM1734, BREM5643 then IDPATT=list(Fixed="REM", Changing="(A|B)", Numbers=4, structure= "CXN")
################### if IDs are of the form TATT134, TBTT345, TATT4956 then IDPATT=list(Fixed="T", Changing="(A|B)", Numbers=3:4, structure= "XCXXN")
################### if IDs are of the form ST134, ST67895 then IDPATT=list(Fixed="ST", Changing="", Numbers=3:5, structure= "XN")
################### if IDs are of the form A134 and B678 then IDPATT=list(Fixed="", Changing="(A|B)", Numbers=3, structure= "CN")
########## UNIQUEID: Variable that if combined with the ID variable can function as a unique identifier of each ASSESSMENT in the data (not person)
################### **Note: it doesn't matter if the same person took it twice we want one unique identifier per survey entry.
################### (the only cases in which it is OK for the the identifier+ID combination not to be unique is if the system re-sent the
################### same entry twice, in which case there would be no way to uniquely distinguish two surveys, and
#### those two would share the same "unique" ID) These are the cases classified as "IDENTICAL" and this function will address them,
#### but they don't often happen. The function won't actually use it but the argument is there to remind the user that the UNIQUE ID should be created before
################### the verification output is given to the workers who will take charge of the manual verifications.
########## OUTPATH: Path to the directory where the issues should be saved
################### e.g. OUTPATH = paste0("/Users/",user,"/Box Sync/Box 3EA Team Folder/Data Management/2016-17 3EA Niger Data Cleaning/Merges Pre-Imputation/ACROSS-WAVE MERGES/ISSUES/")
########## FILE: Name root for the output files including the country letters, the year, the assessment, and the wave information
################### These should depart from the content of each one of the dataframes specified in "DATA" (in the same order, please).
################### e.g. FILE = "LBY2_EGRA_BASELINE" or FILE = "NGY1_TOCA_MIDLINE" or FILE=c("LBY2_EGRA_BASELINE","LBY2_EGMA_BASELINE","LBY2_ODK_BASELINE")
oneset.issues <- function(
DATA,
ID,
IDPATT,
UNIQUEID,
OUTPATH,
FILE,
FUN_DIR,
REFERENCE = NULL,
FIX_FORMAT = FALSE,
ROW_NAMES = FALSE,
EXTRA_COLUMNS = NULL
) {
## Check and install any required packages
source(paste0(FUN_DIR, "packages_check.R"))
REQUIREMENTS <- c("crayon","encryptr")
check.and.install(p = REQUIREMENTS)
# Check if OUTPATH exists. Make directory if it doesn't exist.
if (!dir.exists(OUTPATH)) {
message("Created output directory: ", file.path(OUTPATH))
dir.create(OUTPATH, recursive = TRUE)
}
## Duplicates
DATA <- DATA[order(DATA[,ID]),]
DUPS <- DATA[duplicated(DATA[,ID])|duplicated(DATA[,ID],fromLast = T),]
## Cases with weird IDs
X <- IDPATT[[1]]
C <- IDPATT[[2]]
N <- IDPATT[[3]]
S <- unlist(strsplit(IDPATT[[4]],split="|"))
ORDER <- rep(NA,length(S))
ORDER[which(S==bquote(C))] <- C
ORDER[which(S==bquote(X))] <- X
stringfinal <- ""
string <- rep(NA,length(N))
for(n in 1:length(N)){
string[n] <- paste0(replicate(N[n],"[0-9]"),collapse = "")
stringfinal <- paste0(stringfinal,string[n],sep = "$|")
}
ORDER[which(S==bquote(N))] <- paste0("(",substr(stringfinal,1,nchar(stringfinal)-1),")",collapse = "|") ## Remove only the last character
ORDER[1] <- paste0("^", ORDER[1])
## "ORDER" CONTAINS THE STRUCTURE THAT THE IDS SHOULD HAVE
## Now we look into the data for IDs that don't match the given structure, and save those cases into "WEIRD"
WEIRD <- DATA[!grepl(paste0(ORDER,collapse = ""),DATA[,ID]),]
## Make sure that none of the weird cases are already in the duplicates
"%not_in%" <- function(x,table){match(x,table,nomatch = 0)==0 }
WEIRD <- WEIRD[WEIRD[,ID] %not_in% DUPS[,ID],]
## Add an indicator for the type of issue
WEIRD <- cbind(rep("WEIRD ID",nrow(WEIRD)),WEIRD)
names(WEIRD)[1] <- "ISSUE"
DUPS <- cbind(rep("DUPLICATED ID",nrow(DUPS)),DUPS)
names(DUPS)[1] <- "ISSUE"
## Append weird cases and duplicates
ISSUES <- rbind(WEIRD,DUPS)
namevars <- names(DATA)[grepl("NAME|INITIAL",names(DATA),ignore.case = T)]
if (nrow(ISSUES) > 0) {
if (isTRUE(FIX_FORMAT)) {
ISSUES <- .add_fix_format(ISSUES, id_col = ID, uid_col = UNIQUEID, verification_col_order = EXTRA_COLUMNS)
}
### Temporary chunk for workshop - data security ----------------------------------------
### -------------------------------------------------------------------------------------
# if(length(namevars)>0){
# ISSUES <- encrypt(ISSUES,namevars,public_key_path=file.path(folder,"Code/id_workshop.pub"))
# }
### -------------------------------------------------------------------------------------
write.csv(ISSUES, paste0(OUTPATH, FILE, "_DUPLICATES&ISSUES.csv"), row.names = ROW_NAMES)
}else{
cat(crayon::magenta(paste0("HURRAY!! ", FILE, " HAD NO DUPLICATES OR ALIEN IDS!!")))
cat("\n")
}
## Unmatched
if(!is.null(REFERENCE)) {
onlyindat <- DATA[DATA[[ID]] %not_in% REFERENCE[[ID]], ]
onlyinref <- REFERENCE[REFERENCE[[ID]] %not_in% DATA[[ID]], ]
if (isTRUE(FIX_FORMAT)) {
onlyindat <- .add_fix_format(onlyindat, id_col = ID, uid_col = UNIQUEID, verification_col_order = EXTRA_COLUMNS)
onlyinref <- .add_fix_format(onlyinref, id_col = ID, uid_col = UNIQUEID, verification_col_order = EXTRA_COLUMNS)
}
### Temporary chunk for workshop - data security ----------------------------------------
### -------------------------------------------------------------------------------------
# if(length(namevars)>0){
# onlyindat <- encrypt(onlyindat,namevars,public_key_path=file.path(folder,"Code/id_workshop.pub"))
# refnames <- names(onlyinref)[grepl("NAME|INITIAL",names(onlyinref),ignore.case = T)]
# onlyinref <- encrypt(onlyinref,refnames,public_key_path=file.path(folder,"Code/id_workshop.pub"))
# }
### -------------------------------------------------------------------------------------
### -------------------------------------------------------------------------------------
write.csv(onlyindat, paste0(OUTPATH, FILE, "_ONLYINDATA.csv"), row.names = ROW_NAMES)
write.csv(onlyinref, paste0(OUTPATH, FILE, "_ONLYINREFERENCE.csv"), row.names = ROW_NAMES)
}
}
# New version of oneset.issues() that's more flexible
oneset_issues <- function(
database,
id_col,
unique_id_col,
database_name,
output_directory = NULL,
reference = NULL,
column_structure = NULL,
custom_checks = NULL,
extra_cols = NULL
) {
stopifnot(inherits(database, "data.frame"))
if (!requireNamespace("data.table", quietly = TRUE)) {
stop("oneset_issues() makes extensive use of data.table. Please install it.", call. = FALSE)
}
DT <- data.table::as.data.table(database)
# ID duplicates
.handle_issue(DT, "duplicated_id", function(dat) {
dat[, duplicated(dat[[id_col]]) | duplicated(dat[[id_col]], fromLast = TRUE)]
})
# Column pattern structure checks -- defaults to ID check
if (!is.null(column_structure)) {
if (is.list(column_structure[[1L]])) {
# Check selected columns' structure
stopifnot(!is.null(names(column_structure)))
for (col in names(column_structure)) {
.handle_issue(DT, paste0("bad_", col), function(dat) {
dat[, !grepl(.structure_regex(column_structure[[..col]]), dat[[..col]])]
})
}
} else {
.handle_issue(DT, paste0("bad_", id_col), function(dat) {
dat[, !grepl(.structure_regex(..column_structure), dat[[..id_col]])]
})
}
}
# Only in data checks
if (!is.null(reference)) {
stopifnot(inherits(reference, "data.frame"))
RF <- data.table::as.data.table(reference)
.handle_issue(DT, "dataonly", function(dat) {
dat[, apply(.SD, 1L, function(x) !x %in% RF[[..id_col]]), .SDcols = id_col]
})
}
# Custom checks
if (!is.null(custom_checks)) {
stopifnot(is.list(custom_checks), !is.null(names(custom_checks)))
for (check in names(custom_checks)) {
.handle_issue(DT, check, custom_checks[[check]])
}
}
# Convert issue cols to one column
issue_cols <- names(DT)[grepl("^_.*_issue$", names(DT))]
for (issue_col in issue_cols) {
query <- bquote(.(as.name(issue_col)) := as.character(.(as.name(issue_col))))
DT[, eval(query)]
query <- bquote(.(as.name(issue_col)) := NA_character_)
DT[DT[[issue_col]] != "TRUE", eval(query)]
query <- bquote(.(as.name(issue_col)) := {
flagname <- gsub("^_", "", issue_col)
gsub("_issue$", "", flagname)
})
DT[DT[[issue_col]] == "TRUE", eval(query)]
}
DT[, ISSUE := apply(.SD, 1L, function(x) paste0(x[!is.na(x)], collapse = ", ")), .SDcols = issue_cols]
query <- bquote(order(.(as.name(id_col))))
DT <- DT[eval(query)]
output <- .add_fix_format(as.data.frame(DT), id_col = id_col, uid_col = unique_id_col, verification_col_order = extra_cols)
output <- output[!(is.na(output$ISSUE) | grepl("^\\s*$", output$ISSUE)), ]
if (!is.null(output_directory) && nrow(output) > 0L) {
data.table::fwrite(output, file = file.path(output_directory, paste0(database_name, ".csv")))
} else if (nrow(output) == 0L) {
message("No issues detected in ", database_name, "!")
}
output
}
.handle_issue <- function(dat, flag_name, condition) {
stopifnot(!data.table::is.data.table(dt))
query <- bquote(.(as.name(paste0("_", flag_name, "_issue"))) := condition(dat))
dat[, eval(query)]
}
.structure_regex <- function(idpatt) {
stopifnot(length(setdiff(names(idpatt), c("Fixed", "Numbers", "Changing", "structure"))) == 0L)
working_pattern <- idpatt[["structure"]]
# Handling the `Numbers = x:y` syntax
if (length(idpatt[["Numbers"]]) > 1) {
idpatt <- paste0(idpatt[["Numbers"]][1], ",", idpatt[["Numbers"]][length(idpatt[["Numbers"]])])
}
working_pattern <- gsub("N", paste0("\\\\d{", idpatt[["Numbers"]], "}"), working_pattern)
working_pattern <- gsub("C", idpatt[["Changing"]], working_pattern)
working_pattern <- gsub("X", gsub("(\\W)", "\\\\\\1", idpatt[["Fixed"]]), working_pattern)
working_pattern
}
.add_fix_format <- function(dat, id_col = "", uid_col = "", verification_col_order = NULL) {
stopifnot(is.data.frame(dat))
format_cols <- c("Verifier", "Problem", "WHAT", "change.from", "change.to", "Note")
collided_vars <- format_cols[format_cols %in% names(dat)]
if (length(collided_vars) > 0L) {
warning("Could not safely add single set fix format because of variable name conflict: ", paste0(collided_vars, collapse = ", "), ". Please compensate for these", call. = FALSE, immediate. = TRUE)
return(dat)
}
for (column in format_cols) {
dat[, column] <- rep("", nrow(dat))
}
format_col_front <- c(id_col, uid_col, format_cols, "ISSUE")
format_col_front <- format_col_front[format_col_front != "" & format_col_front %in% names(dat)]
total_order <- if (!is.null(verification_col_order)) {
ver_cols <- setdiff(verification_col_order, format_col_front)
ver_cols <- ver_cols[ver_cols %in% names(dat)]
c(format_col_front, ver_cols, setdiff(names(dat), c(format_col_front, ver_cols)))
} else {
c(format_col_front, setdiff(names(dat), format_col_front))
}
dat[, total_order]
}