########## STANDARD VERIFICATION FUNCTION ########
###################################################
###### **Note: ID variables should be standardized
###### (within and across datasets),
###### and cleaned before applying this function**
###################################################
###################################################
########## Exports basic problematic cases into excel files for manual verification
########## It works with a simple comparison against one reference file (e.g. Masterlist, Randomization file, etc)
########## Exports duplicate IDs
########## Exports unmatched cases
########## Creates flags based on subject names (Students, Teachers, Schools, depending the level of the survey)
########## Creates flags based on a group of variables specified by the user
########## DATA: Databases that need to be verified, specified as a list
################### e.g. DATA=list(EGRA=EGRAdata,EGMA=EGMAdata,ODK=ODKdata), DATA=list(BASE=BASELINE, MID=MIDLINE, END=ENDLINE), or if only one: DATA=list(TOCA=TOCAdata)
################### For each element of the list you must write a short name to distinguish that dataset in the function and the name of the
################### imported data.
########## ID: ID variable name used to link the database with the reference files, specified as string
################### e.g. ID="ST_ID"
################### **Note: make sure that this variable is called and formatted in the same way across all databases and reference file**
########## UNIQUEID: Variable that, combined with the ID variable, can function as a unique identifier of each ASSESSMENT in the data (not person)
################### **Note: it doesn't matter if the same person took it twice; we want one unique identifier per survey entry.
################### (The only cases in which it is OK for the identifier+ID combination not to be unique are when the system re-sent the
################### same entry twice, in which case there would be no way to uniquely distinguish the two surveys, and
################### those two would share the same "unique" ID.) These are the cases classified as "IDENTICAL"; this function will address them,
################### but they don't happen often.
################### E.g. UNIQUEID=c("START_TIME","START_TIME","START_TIME"), or UNIQUEID=c("SUBMISSION_TIME","START_TIME","DATE")
########## REFERENCE: Administrative database, previous wave of data collection, or any other file that can be used as a reference for verification.
########## NAMES: Group of name variable(s) (multiple only if at the student level), specified in the form of a character vector
################### e.g. NAMES = c("NAME", "FATHER_NAME","MOTHER_NAME") or "TEACHER_NAME"
################### **Note: make sure that no other variable in the data set contains these names within its own name.
################### E.g. if you want to include "NAME" as a criterion but not "NAMETAG", make sure you either change the name
################### of one of them or subset the datasets to the relevant verification variables before applying the function**
########## VARS: Group of variables that will be used for verification, specified by the user in the form of a character vector
################### e.g. VARS = c("AGE","GRADE","SCHOOL_ID","PACKET")
################### **Note: make sure that these variables are called and formatted in the same way
################### after importing both the assessment data and the reference data.**
################### **Note2: make sure that no other variable in the data set contains these names within its own name.
################### E.g. if you want to include "CLASS" as a criterion but not "CLASSIFICATION", make sure you either change the name
################### of one of them or subset the datasets to the relevant verification variables before applying the function**
########## NMARGIN: Proportion of character discrepancy to allow when comparing names before flagging.
################### e.g. NMARGIN = 0.1 (for 10%)
########## Vlowermargins and Vuppermargins: Numeric vectors (one entry per variable in VARS) specifying how big a difference
################### to allow between the datasets and the reference file for the numeric variables without getting flagged.
################### Use NA for variables where any discrepancy should be flagged.
################### e.g. to allow at most a difference of +-1 year in age, a difference of 2 on the upper bound for grade, and of 0 on the lower bound:
################### Vlowermargins= c(1,0,NA,NA)
################### Vuppermargins= c(1,2,NA,NA)
########## OUTPATH: Path to the directory where the issues should be saved
################### e.g. OUTPATH = paste0("/Users/",user,"/Box Sync/Box 3EA Team Folder/Data Management/2016-17 3EA Niger Data Cleaning/Merges Pre-Imputation/ACROSS-WAVE MERGES/ISSUES/")
########## FILE: Name root for the output files including the country letters, the year, the assessment, and the wave information
################### These should correspond to the content of each of the dataframes specified in "DATA" (in the same order, please).
################### e.g. FILE = "LBY2_EGRA_BASELINE" or FILE = "NGY1_TOCA_MIDLINE" or FILE=c("LBY2_EGRA_BASELINE","LBY2_EGMA_BASELINE","LBY2_ODK_BASELINE")
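########## EXAMPLE CALL: a minimal sketch built from the placeholder objects in the examples above.
########## EGRAdata, ODKdata, and MASTERLIST are hypothetical imported data frames, and OUTPATH/FNPATH are assumed
########## to already hold the directory paths described above -- substitute your own before running:
# export.issues(DATA = list(EGRA = EGRAdata, ODK = ODKdata),
#               ID = "ST_ID",
#               UNIQUEID = c("START_TIME", "SUBMISSION_TIME"),
#               REFERENCE = MASTERLIST,
#               NAMES = c("NAME", "FATHER_NAME", "MOTHER_NAME"),
#               VARS = c("AGE", "GRADE", "SCHOOL_ID", "PACKET"),
#               NMARGIN = 0.1,
#               Vlowermargins = c(1, 0, NA, NA),
#               Vuppermargins = c(1, 2, NA, NA),
#               OUTPATH = OUTPATH,
#               FILE = c("LBY2_EGRA_BASELINE", "LBY2_ODK_BASELINE"),
#               FNPATH = FNPATH)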
export.issues <- function(DATA, ID, UNIQUEID, REFERENCE, NAMES, VARS, NMARGIN, Vlowermargins, Vuppermargins, OUTPATH, FILE, FNPATH = NULL, debug_file = NULL, allow_duplicates = TRUE){
## If no path was specified, stop early: several TIES helper functions are sourced from FNPATH below
if(is.null(FNPATH)){
stop("You forgot to specify the PATH to the folder where all the TIES functions are stored.
This function has dependencies on other TIES functions, so make sure to fill in the 'FNPATH=' argument. For example: FNPATH=paste0('/Users/',user,'/Box Sync/Box 3EA Team Folder/Data Management/General Functions/')")
}
if (length(NAMES)==0){
NAMES <- NA
}
if (length(VARS)==0){
VARS <- NA
}
if(length(FILE)!=length(DATA)){
print("Looks like the number of DATABASES that you are using and the number of output FILE names you entered are different.
Please check whether you forgot to specify, in the FILE argument, how the issues for each database should be saved.")
}
"%not_in%" <- function(x,table){match(x,table,nomatch = 0)==0 }
### Temporary chunk for workshop --------------------------------------------------------
### -------------------------------------------------------------------------------------
# STIDS <- DATA$ML_Y2[,c("STID_Y1","STID_Y2")]
#
# DATA$EGRA_Y2B$STID_Y1 <- DATA$EGRA_Y2B$STID_Y1.y
# DATA$EGRA_Y2B$STID_Y1.x <- NULL
# DATA$EGRA_Y2B$STID_Y1.y <- NULL
#
# DATA$ODK_Y2B$STID_Y1 <- DATA$ODK_Y2B$STID_Y1.y
# DATA$ODK_Y2B$STID_Y1.x <- NULL
# DATA$ODK_Y2B$STID_Y1.y <- NULL
### -------------------------------------------------------------------------------------
### -------------------------------------------------------------------------------------
##################################################################################
## Add empty columns if the variable doesn't already exist in that database
#source(paste0("/home/",user,"/GLOBAL_TIES/FUNCTIONS/AddEmptyColumns.R"))
source(paste0(FNPATH,"AddEmptyColumns.R"))
DATA <- add.empty(DATA=DATA,R=REFERENCE, VARLIST=c(NAMES,VARS))
DUPS <- list()
for(d in 1:length(DATA)){
## Keep an assessment-tagged unique identifier that won't be changed throughout fixes, so that we can always track back to the
### original state.
eval(parse(text=paste0("DATA[[d]]$UNIQUEID_",names(DATA)[d], "<- paste0(DATA[[d]]$",ID,",DATA[[d]]$",UNIQUEID[d],")")))
## Duplicates
DATA[[d]] <- DATA[[d]][order(DATA[[d]][,ID]),]
dups <- duplicated(DATA[[d]][,ID])|duplicated(DATA[[d]][,ID],fromLast = T)
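## e.g. for IDs c("A","A","B"), duplicated() alone flags only the second "A" (FALSE TRUE FALSE);
## OR-ing with the fromLast = TRUE pass flags every copy (TRUE TRUE FALSE)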
# if(sum(dups)>0){
# DUPS[[d]] <- DATA[[d]][dups,]
# write.csv(DUPS[[d]],paste0(OUTPATH,FILE[d],"_DUPLICATES.csv"))
# }
## Create a variable that indicates whether there is a duplicate of that ID in a given data set to help make the final verification manageable.
DATA[[d]]$DUPLICATED <- rep("NO",nrow(DATA[[d]]))
DATA[[d]]$DUPLICATED[dups] <- paste0("ID DUPLICATED IN ",names(DATA)[d])
#names(DATA[[d]])[names(DATA[[d]])=="DUPLICATED"] <- paste0("DUPLICATED_",names(DATA)[d])
# ## Unmatched
# write.csv(DATA[[d]][DATA[[d]][,ID] %not_in% REFERENCE[,ID],],paste0(OUTPATH,FILE[d],"_ONLYINDATA.csv"))
# write.csv(REFERENCE[REFERENCE[,ID] %not_in% DATA[[d]][,ID] ,],paste0(OUTPATH,FILE[d],"_ONLYINREFERENCE.csv"))
## Keep only the variables used for verification
DATA[[d]] <- DATA[[d]][,grepl(paste0("^",paste0(names(DATA[[d]])[names(DATA[[d]]) %in% c(VARS,NAMES,ID,UNIQUEID[d])],collapse = "$|^"),
"|ASSESSMENT_TIME|DUPLICATED$|",ID,"$"),names(DATA[[d]]),ignore.case=T)]
## Add suffixes to the variables that the dataframes have in common and that will be used for verification
names(DATA[[d]])[ names(DATA[[d]])!=ID] <- paste0(names(DATA[[d]])[ names(DATA[[d]])!=ID],"_",names(DATA)[d])
}
## Also check for duplicates in the reference file
dups <- duplicated(REFERENCE[,ID])|duplicated(REFERENCE[,ID],fromLast = T)
## and create a flag for them
REFERENCE$DUPLICATED_REFERENCE <- rep("NO",nrow(REFERENCE))
REFERENCE$DUPLICATED_REFERENCE[dups] <- "ID DUPLICATED IN REFERENCE"
## Keep only the variables used for verification in Reference file
REFERENCE <- REFERENCE[,grepl(paste0("^",paste0(names(REFERENCE)[names(REFERENCE) %in% c(VARS,NAMES,ID,UNIQUEID[length(DATA)+1])],collapse = "$|^"),
"|ASSESSMENT_TIME|DUPLICATED_REFERENCE|",ID,"$"),names(REFERENCE),ignore.case=T)]
####################################################################################
## Mismatched
####################################################################################
####################################################################################
for(d in 1:length(DATA)){
print("#######################")
print(paste0(names(DATA)[d]))
print(names(DATA[[d]]))
print(length(names(DATA[[d]])))
}
##################################################################################
## Merge databases
# MERGE <- Reduce(function(...) merge(...,by=ID,all=T),DATA)
merge_warn <- FALSE
MERGE <- Reduce(function(x, y) {
## Use data.table for the join when it is available and the join is safe
if (requireNamespace("data.table", quietly = TRUE)) {
x <- data.table::as.data.table(x)
y <- data.table::as.data.table(y)
x_dups <- x[duplicated(x[[ID]]) | duplicated(x[[ID]], fromLast = TRUE), .N]
y_dups <- y[duplicated(y[[ID]]) | duplicated(y[[ID]], fromLast = TRUE), .N]
## A cartesian blow-up is only possible when BOTH sides carry duplicated IDs
if (x_dups == 0L || y_dups == 0L) {
return(merge(x, y, by = ID, all = TRUE))
}
print("Duplicates caught on both sides. Cartesian product imminent.")
print("lhs duplicates:")
print(x[duplicated(x[[ID]]) | duplicated(x[[ID]], fromLast = TRUE)][[ID]])
print("rhs duplicates:")
print(y[duplicated(y[[ID]]) | duplicated(y[[ID]], fromLast = TRUE)][[ID]])
if (!isTRUE(allow_duplicates)) {
stop("Halting to prevent cartesian product.", call. = FALSE)
}
## Fall back to base::merge, which tolerates duplicated keys (data.table's merge would error here)
x <- as.data.frame(x)
y <- as.data.frame(y)
} else if (!merge_warn) {
warning("Using base R `merge`. For better performance, install 'data.table'", call. = FALSE, immediate. = TRUE)
merge_warn <<- TRUE
}
merge(x, y, by = ID, all = TRUE)
}, DATA)
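## e.g. if ID "A" appears twice in both inputs, the full join returns 2 x 2 = 4 rows for "A" --
## this is the cartesian blow-up that the duplicate guard above warns about (and blocks when allow_duplicates = FALSE)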
MERGE <- merge(REFERENCE,MERGE,by=ID,all=T)
## Concatenate duplicated indicators into one variable and delete the others
## Create a function that collapses strings without including NA values
paste3 <- function(...,sep=", ") {
L <- list(...)
L <- lapply(L,function(x) {x[is.na(x)] <- ""; x})
ret <-gsub(paste0("(^",sep,"|",sep,"$)"),"",
gsub(paste0(sep,sep),sep,
do.call(paste,c(L,list(sep=sep)))))
is.na(ret) <- ret==""
ret
}
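## e.g. paste3(c("NO", NA), c(NA, "ID DUPLICATED IN REFERENCE"), sep = " & ")
## returns c("NO", "ID DUPLICATED IN REFERENCE") -- NAs are dropped instead of being printed as "NA"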
MERGE$DUPLICATED <- apply(MERGE[,grepl("DUPLICATED",names(MERGE))],1,function(x){do.call(paste3,c(as.list(x),list(sep=" & ")))})
MERGE$DUPLICATED <- gsub("NO & | & NO","",MERGE$DUPLICATED)
MERGE$DUPLICATED <- gsub("^& | &$| &[[:space:]]+$","",MERGE$DUPLICATED)
MERGE$DUPLICATED <- gsub("&+","&",MERGE$DUPLICATED)
MERGE$DUPLICATED <- gsub("^ & $|^$|^[[:space:]]+","NO",MERGE$DUPLICATED)
## Delete the individual indicators
MERGE <- MERGE[,!(grepl("DUPLICATED_",names(MERGE))&!grepl("REFERENCE",names(MERGE)))]
## Bring indicator to the front
MERGE <- MERGE[,c(paste0(ID),"DUPLICATED",names(MERGE)[!grepl(paste0(ID,"|DUPLICATED"),names(MERGE))])]
####################################################################################
### By variables
####################################################################################
if(!all(is.na(VARS))){
for(i in 1:length(VARS)){
## Print status message
print(paste0("Checking for ",VARS[i]," issues"))
## Make sure that if the user specified this as a numeric variable, it wasn't turned by accident into another class like logical or character
eval(parse(text=paste0("MERGE[,grepl('^",VARS[i],"',names(MERGE))] <- as.data.frame(sapply(MERGE[,grepl('^",VARS[i],"',names(MERGE))], function(c){as.numeric(as.character(c))}))")))
## If there are no "lower" & "upper" margins, just look for any discrepancy
if(is.na(Vlowermargins[i]) & is.na(Vuppermargins[i])){
eval(parse(text=paste0("MERGE$",VARS[i],"_ISSUE<- !apply(MERGE[,grepl('^",VARS[i],"',names(MERGE))],1,function(x){
all(x==x[which(!is.na(x))[1]],na.rm=T)})")))
} ## Checking whether all entries equal the first non-missing entry of the vector
## If the data is not missing in the reference file, then that is the reference value
## But this ensures that if the value is missing from the reference file a check is still run
## using the second-priority file as reference or the next assessment that has an observed entry.
## If an upper margin was indicated, then flag if the upper difference is larger
if(!is.na(Vuppermargins[i])){
eval(parse(text=paste0("MERGE$",VARS[i],"_ISSUE<- apply(MERGE[,grepl('^",VARS[i],"',names(MERGE))],1,function(x){
any(x>x[which(!is.na(x))[1]]+Vuppermargins[i],na.rm = T)})")))
} ## Checking whether any entry exceeds the first non-missing entry of the vector by more than the upper margin
## If the data is not missing in the reference file, then that is the reference value
## But this ensures that if the value is missing from the reference file a check is still run
## using the second-priority file as reference or the next assessment that has an observed entry.
## If a lower margin was indicated, then flag if the lower difference is larger
if(!is.na(Vlowermargins[i])){
## If the upper margin was indicated and the ISSUE variable was already created, then we just want to fill it in if no discrepancies were found on the upper side
eval(parse(text=paste0("if(length(MERGE$",VARS[i],"_ISSUE)>0){MERGE$",VARS[i],"_ISSUE[MERGE$",VARS[i],"_ISSUE!=TRUE]<- apply(MERGE[MERGE$",VARS[i],"_ISSUE!=TRUE,grepl('^",VARS[i],"',names(MERGE)) & !grepl('ISSUE',names(MERGE))],1,function(x){any(x<(x[which(!is.na(x))[1]]-Vlowermargins[i]),na.rm = T)})
}else{
MERGE$",VARS[i],"_ISSUE<- apply(MERGE[,grepl('^",VARS[i],"',names(MERGE))],1,function(x){any(x<(x[which(!is.na(x))[1]]-Vlowermargins[i]),na.rm = T)})
}"))) ## Otherwise we just want to create the ISSUE variable from scratch based on the lower margin alone
}
}
}else{print("You didn't specify any characteristics in VARS, so nothing else will be used for verification")}
####################################################################################
### Now let's find the mismatches by name
####################################################################################
if(!all(is.na(NAMES))){
## First make sure that there are no invalid regular expressions in the NAMES variables
allnames <- paste0("^",NAMES,collapse = "|")
MERGE[,grepl(allnames,names(MERGE))] <- sapply(MERGE[,grepl(allnames,names(MERGE))],
function(x){gsub("[[:punct:]]","",x)})
## This process is computationally burdensome (yes, it crashed my computer once), so we need to use some tools
## to parallelize the operations.
## For this we need to use the doParallel package
## This chunk of code will make sure that each package is installed on your computer
check.and.install <- function(p){
needed.packages <- p[!(p %in% installed.packages()[,"Package"])]
## Install missing packages
if(length(needed.packages)){ install.packages(needed.packages)}
## Load all packages
for (i in 1:length(p)){
library(p[i], character.only = TRUE)
}
}
p <- c("doParallel", "RecordLinkage")
check.and.install(p)
## Next, for the package to work we need to set up a cluster that tells your computer how many cores to allocate for this task
## psanker: parallel::detectCores() automatically detects this, but to avoid completely eating your machine, we use detectCores() - 1
numcores <- parallel::detectCores() - 1
cl <- if (!is.null(debug_file)) {
makePSOCKcluster(numcores, outfile = debug_file)
} else {
makePSOCKcluster(numcores)
}
doParallel::registerDoParallel(cl, cores = numcores)
print(names(MERGE))
## Now we can start looping over the text variables in NAMES to look for mismatches
for (i in 1:length(NAMES)){
## Print a status message
print(paste0("Checking for ",NAMES[i]," issues"))
#writeLines(c(""), paste0(OUTPATH,"log.txt"))
source(paste0(FNPATH,"NameMismatchFunction2.R"))
name.issues <- foreach(n = names(DATA),.combine="cbind")%:%
#cat(paste0(NAMES[i], " in ", n))
foreach(o = names(DATA)[names(DATA) %not_in% n], .export = c("Name.Missmatch","MERGE","NMARGIN", "notsim_chr"), .packages="RecordLinkage", .combine="cbind") %dopar% {
#source(paste0(FNPATH,"NameMismatchFunction2.R"))
#sink(paste0(OUTPATH,"log.txt"), append=TRUE)
cat("\n", paste0(n," compared to ",o," in ",NAMES[i]), "\n", sep = "")
#w <- which(names(DATA)[names(DATA) %not_in% n]==o)
# eval(parse(text=paste0("issue <- Name.Missmatch(NAME_A='",NAMES[i],"_",n,"',NAME_B='",NAMES[i],"_",o,"',DATA=MERGE,PROPORTION=NMARGIN, NAME_ROOT='", NAMES[i], "')")))
issue <- notsim_chr(
col_a = paste0(NAMES[i], "_", n),
col_b = paste0(NAMES[i], "_", o),
database = MERGE,
proportion = NMARGIN,
varname = NAMES[i]
)
# Check against reference column **if** the reference variable exists in MERGE
if (NAMES[i] %in% names(MERGE)) {
issue | notsim_chr(
col_a = paste0(NAMES[i], "_", n),
col_b = NAMES[i],
database = MERGE,
proportion = NMARGIN,
varname = NAMES[i]
)
} else {
issue
}
}
## Flag any entries that got at least one mismatch between assessments
MERGE[, paste0(NAMES[i], "_ISSUE")] <- apply(name.issues, 1L, any, na.rm = TRUE)
# eval(parse(text=paste0("MERGE$",NAMES[i],"_ISSUE<- apply(name.issues,1,function(x){any(x,na.rm=T)})")))
#apply(name.issues2,1,function(x){any(x,na.rm=T)})
}
stopCluster(cl) ## This should be done after concluding the parallelization process
}else{ print("You didn't specify any NAMES variables, so names will be ignored for verification")}
### Remove all the sub-issue variables
#MERGE <- MERGE[,!(grepl("ISSUE",names(MERGE)) & grepl("[0-9]$",names(MERGE)))]
MERGE <- MERGE[,order(names(MERGE))]
if(!all(is.na(VARS))){
MERGE <- MERGE[,c(names(MERGE)[grepl(paste0(VARS,collapse = "|"),names(MERGE))],names(MERGE)[!grepl(paste0(VARS,collapse = "|"),names(MERGE))])]
}
if(!all(is.na(NAMES))){
MERGE <- MERGE[,c(names(MERGE)[grepl(paste0(NAMES,collapse = "|"),names(MERGE))],names(MERGE)[!grepl(paste0(NAMES,collapse = "|"),names(MERGE))])]
}
MERGE <- MERGE[,c(names(MERGE)[grepl(paste0(ID),names(MERGE))],names(MERGE)[!grepl(paste0(ID),names(MERGE))])]
### Add a few empty columns at the start that will be needed for verification
MERGE$Problem <- rep("",nrow(MERGE))
MERGE$WHAT <- rep("",nrow(MERGE))
MERGE$change.from <- rep("",nrow(MERGE))
MERGE$change.to <- rep("",nrow(MERGE))
MERGE$DATABASE <- rep("",nrow(MERGE))
MERGE$Note <- rep("",nrow(MERGE))
MERGE$X <- NULL
### Move the verification columns to the front
Verif <- "^Problem$|^WHAT$|^change.to$|^change.from$|^DATABASE$|^Note$"
MERGE <- MERGE[,c(names(MERGE)[grepl(Verif,names(MERGE))],names(MERGE)[!grepl(Verif,names(MERGE))])]
row.names(MERGE) <- NULL
### Run the basic cleaning function to get rid of empty columns
# Basic Cleaning
source(paste0(FNPATH,"Basic Cleaning.R"))
MERGE <- CLEAN(MERGE)
### Create a column that counts the number of issues
MERGE$N_ISSUES <- apply(MERGE[, grepl("ISSUE",names(MERGE))],1, function(x) sum(as.logical(x), na.rm = TRUE)) # CLEAN converts all logicals to characters, so convert back before counting
############## <Ugly chunk> can be written better
### Purge unnecessary &s from the DUPLICATED variable
MERGE$DUPLICATED[grepl("(&[[:space:]]+&)|&$",MERGE$DUPLICATED)] <-
gsub("(&[[:space:]]+&)|&$", "",MERGE$DUPLICATED[grepl("(&[[:space:]]+&)|&$",MERGE$DUPLICATED)])
MERGE$DUPLICATED[grepl("[[:space:]]+",MERGE$DUPLICATED)] <- gsub("[[:space:]]+&[[:space:]]+"," & ",MERGE$DUPLICATED[grepl("[[:space:]]+",MERGE$DUPLICATED)])
MERGE$DUPLICATED[grepl("[[:space:]]+$",MERGE$DUPLICATED)] <- gsub("[[:space:]]+$","",MERGE$DUPLICATED[grepl("[[:space:]]+$",MERGE$DUPLICATED)])
MERGE$DUPLICATED[grepl("O[[:space:]]+I",MERGE$DUPLICATED)] <- gsub("O[[:space:]]+I","O & I",MERGE$DUPLICATED[grepl("O[[:space:]]+I",MERGE$DUPLICATED)])
MERGE$DUPLICATED[grepl("O&",MERGE$DUPLICATED)] <- gsub("O&","O &",MERGE$DUPLICATED[grepl("O&",MERGE$DUPLICATED)])
### Create a dup flag and compare with the original duplicate column
MERGE$DUPS <- duplicated(MERGE[,ID])|duplicated(MERGE[,ID],fromLast = T)
names(MERGE)[names(MERGE)=="DUPLICATED"] <- "DUPLICATED IN"
### Create the verification columns and move the N_ISSUES to the front
MERGE$Problem <- rep("",nrow(MERGE))
MERGE$WHAT <- rep("",nrow(MERGE))
MERGE$change.from <- rep("",nrow(MERGE))
MERGE$change.to <- rep("",nrow(MERGE))
MERGE$DATABASE <- rep("",nrow(MERGE))
MERGE$Note <- rep("",nrow(MERGE))
### Reorder the ISSUES variables
source(paste0(FNPATH, "Reorder.R"))
MERGE <- ReOrder(MERGE,"Note",ID)
MERGE <- ReOrder(MERGE,"DATABASE",ID)
MERGE <- ReOrder(MERGE,"change.to",ID)
MERGE <- ReOrder(MERGE,"change.from",ID)
MERGE <- ReOrder(MERGE,"WHAT",ID)
MERGE <- ReOrder(MERGE,"Problem",ID)
MERGE <- ReOrder(MERGE,"DUPS","Note")
MERGE <- ReOrder(MERGE,"DUPLICATED IN","Note")
MERGE <- ReOrder(MERGE,"N_ISSUES","Note")
if(!all(is.na(NAMES))){
for( n in 1:length(NAMES)){
## Position of first column that contains the name of each variable checked
first <- grep(paste0("^",NAMES[n],"_"),names(MERGE))[1]
previous <- names(MERGE)[first-1]
print(paste0("Moving: ",NAMES[n],"_ISSUE, before: ",names(MERGE)[first]))
## Move the ISSUE column to the beginning of the group
MERGE <- ReOrder(MERGE,paste0(NAMES[n],"_ISSUE"),previous)
}
}
### Temporary chunk for workshop - data security ----------------------------------------
### -------------------------------------------------------------------------------------
# namevars <- names(MERGE)[grepl("NAME|INITIAL",names(MERGE),ignore.case = T)]
# if(length(namevars)>0){
# MERGE <- encrypt(MERGE,namevars,public_key_path=file.path(folder,"Code/id_workshop.pub"))
# }
# MERGE <- merge(MERGE, STIDS,by="STID_Y1",all.x=TRUE,all.y=FALSE)
# MERGE$STID_Y1[grepl("^AR|^BR",MERGE$STID_Y1)] <- NA
# MERGE <- ReOrder(MERGE,"STID_Y2","STID_Y1")
### -------------------------------------------------------------------------------------
### -------------------------------------------------------------------------------------
### Export file for mismatches verification
write.csv(MERGE,paste0(OUTPATH,"ALL_MISMATCHES.csv"),row.names = F)
}