-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBasic Cleaning.R
351 lines (287 loc) · 14.3 KB
/
Basic Cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
### BASIC & GENERAL CLEANING ###
##Old code kept for reference
CLEAN <- function(DATA,DATEVAR=NULL,DATEFORMAT=NULL,BEGINDATE=NULL){
## Turn all non-numeric variables into character class variables:
###################################################################
## Create a function that applies the character conversion (sapply is not working well for this)
convert.char<- function(Data,Types){
for (i in 1:length(Data)){
FUN <- switch(Types[i],character = as.character,
numeric = as.numeric,
factor = as.factor)
Data[,i] <- FUN(Data[,i])
}
Data
}
###################################################################################################
#Do some general cleaning:
###################################################################################################
## Remove -99s & spaces from ID variables
if( class(DATA[,grepl("ID$|ID..",names(DATA))])=="data.frame"){
DATA[,grepl("ID|ID..",names(DATA))]<-as.data.frame(sapply(DATA[,grepl("ID|ID..",names(DATA))],function(x){
gsub("^-99$|^99$|^[[:space:]]+-99[[:space:]]+$|^[[:space:]]+-99[[:space:]]+$|^[[:space:]]+-99$|^[[:space:]]+99$|^-99[[:space:]]+$|^99[[:space:]]+$",NA,x)
}))
DATA[,grepl("ID$|ID..",names(DATA))]<-as.data.frame(sapply(DATA[,grepl("ID$|ID..",names(DATA))],function(x){
gsub("[[:space:]]+","",x)}))
}else{
DATA[,grepl("ID$|ID..",names(DATA))] <- gsub("[[:space:]]+","",DATA[,grepl("ID$|ID..",names(DATA))])
DATA[,grepl("ID$|ID..",names(DATA))] <- gsub("^-99$|^99$|^[[:space:]]+-99[[:space:]]+$|^[[:space:]]+-99[[:space:]]+$|^[[:space:]]+-99$|^[[:space:]]+99$|^-99[[:space:]]+$|^99[[:space:]]+$",NA,DATA[,grepl("ID$|ID..",names(DATA))])
}
#Remove spaces
DATA<-as.data.frame(sapply(DATA,function(x){gsub("^[[:space:]]+$|^[[:space:]]+|[[:space:]]+$","",x)}))
##**change character NAs to missing in order to be able to keep the levels in a numeric format.**
DATA<-as.data.frame(sapply(DATA,function(x){gsub("^Na$|^NA$|^N A$|n/a|#N/A|#DIV/0!|#VALUE!",NA,x)}))
#Delete empty variables
## First, identify the empty variables
Empty <- sapply(DATA,function(x){sum(is.na(x)|grepl("^$|^[[:space:]]+$",x))==length(x)})
## Keep only the non-empty variables
DATA <- DATA[,!Empty]
## Delete empty observations
empty <- as.vector(apply(DATA,1,function(x){sum(is.na(x)|grepl("^$|^[[:space:]]+$",x))}))==length(DATA)
DATA <- DATA[!empty,]
## Turn variables that have only numeric values to numeric class
#################################################################
## If all levels of a variable are numeric
for(v in 1:length(DATA)){
if (sum(!is.na(as.integer(names(table(DATA[,v])))))==length(names(table(DATA[,v])))){
#Then turn the variable into numeric type:
DATA[,v]<-as.numeric(as.character(DATA[,v]))
}
}
## If more than one string variable in the dataset:
strings <- sapply(DATA,function(x){is.character(x)|is.factor(x)})
if(sum(strings)>1){
## If character variable, then turn it to upper case
DATA[,strings] <- as.data.frame(sapply(DATA[,strings],toupper))
## Apply the function that converts factors into character to the factor subset:
DATA[,strings]<-convert.char(DATA[,strings],rep("character",length(DATA[,strings])))
}else{
if(sum(strings)==1){
## If character variable, then turn it to upper case
DATA[,strings] <- toupper(DATA[,strings])
## Apply the function that converts factors into character to the factor subset:
DATA[,strings] <- as.character(DATA[,strings])
}
}
## If a start of data collection date was specified then delete observations
if(all(c(!is.null(DATEVAR), !is.null(DATEFORMAT), !is.null(BEGINDATE)))){
DATES <- as.POSIXlt(DATA [,DATEVAR], format=DATEFORMAT)
DATA <- DATA[DATES>=BEGINDATE,]
}else{
if(sum(c(!is.null(DATEVAR), !is.null(DATEFORMAT), !is.null(BEGINDATE)))>0){
print(paste0("You didn't specify ", which(c(is.null(DATEVAR), is.null(DATEFORMAT), is.null(BEGINDATE)))))
}}
## Delete empty observations
empty <- as.vector(apply(DATA,1,function(x){sum(is.na(x))}))==length(DATA)
DATA <- DATA[!empty,]
return(DATA)
}
#' Perform an initial cleaning pass
#'
#' This removes simple artifacts commonly found in raw data (-99 flags, strange NA values, and factors). Some features may already be handled by data import, but this attempts to catch the rest. Furthermore, all character data gets capitalized; this is known to clash with some multibyte Unicode strings. Inspect your raw data before using this!
#'
#' @param dat A `data.frame` to be cleaned
#' @param remove_accents If you wish accented characters to be "down-converted" to their Latin base, set this to `TRUE`.
#' @param date_cols Any columns whose data are considered dates that could be coerced to POSIXct objects
#' @param date_format A character vector (length 1 or `length(date_cols)`) that contains the desired date format for `date_cols`
#' @param begin_date A `POSIXct` date that is a reference date. If `date_cols` is length 1, all rows with a date less
#' than `begin_date` will be dropped.
#' @param factors_to_chars If `TRUE`, then all factors will be converted to character vectors
#' @return A cleaned data.frame
#'
#' @export
basic_clean <- function(dat,
remove_accents = TRUE,
date_cols = NULL,
date_format = NULL,
begin_date = NULL,
factors_to_chars = TRUE,
drop_id_ws = TRUE,
id_cols = NULL) {
stopifnot(inherits(dat, "data.frame"))
datl <- as.list(dat)
empty_cols <- rep(FALSE, length(datl))
detected_id_cols <- rep(FALSE, length(datl))
for (j in seq_along(datl)) {
current_class <- class(datl[[j]])
if ("factor" %in% current_class && isTRUE(factors_to_chars)) {
datl[[j]] <- as.character(datl[[j]])
}
if ("character" %in% current_class) {
datl[[j]] <- tryCatch({
toupper(datl[[j]])
}, error = function(e) {
warning(paste0("Multibyte string failed to make column all uppercase: ", names(datl)[[j]]), immediate. = TRUE, call. = FALSE)
datl[[j]]
})
datl[[j]] <- cleanenv$clear_artifacts(datl[[j]])
datl[[j]] <- cleanenv$coerce_class(datl[[j]])
current_class <- class(datl[[j]])
}
if (!is.null(date_cols) && names(datl)[[j]] %in% date_cols) {
if (is.null(date_format)) {
stop("Date format must be defined", call. = FALSE)
} else if (!length(date_format) %in% c(1L, length(date_cols))) {
warning("Used the first date format provided. This may not be correct. Ensure the number of formats matches the number of date columns", call. = FALSE, immediate. = TRUE)
date_format <- date_format[[1L]]
}
if (length(date_format) == 1L) {
date_format <- rep(date_format, length(date_cols))
}
datl[[j]] <- unlist(Map(cleanenv$to_date, datl[[j]], date_format))
}
if ("character" %in% current_class) {
# Final pass to get rid of dummy 99 and -99s in non-numeric data
datl[[j]][grepl("^\\s*-?99\\s*$", datl[[j]])] <- NA
if (isTRUE(remove_accents)) {
datl[[j]] <- cleanenv$remove_accents(datl[[j]])
}
if (isTRUE(drop_id_ws)) {
if (!is.null(id_cols) && names(datl)[[j]] %in% id_cols) {
datl[[j]] <- vapply(datl[[j]], function(x) gsub("\\s+", "", x), character(1L))
} else if (is.null(id_cols) & grepl(cleanenv$ID_WS, names(datl)[[j]])) {
detected_id_cols[[j]] <- TRUE
datl[[j]] <- vapply(datl[[j]], function(x) gsub("\\s+", "", x), character(1L))
}
}
}
empty_cols[[j]] <- all(is.na(datl[[j]]) | datl[[j]] == "")
}
if (isTRUE(drop_id_ws)) {
columns <- names(datl)[detected_id_cols]
if (!is.null(id_cols)) {
columns <- c(id_cols, columns)
}
message(paste0("ALL whitespace removed in ID columns: ", paste0(columns, collapse = ", ")))
}
datl <- datl[!empty_cols]
datl <- cleanenv$quick_df(datl)
if (length(date_cols) == 1L && !is.null(begin_date)) {
datl <- datl[datl[[date_cols]] >= begin_date, ]
}
empty_rows <- unlist(apply(datl, 1, function(dat_row) all(is.na(dat_row) | dat_row == "")))
datl[!empty_rows, ]
}
# Function name to be deprecated once the cleaning toolkit is converted to a package
# Private variable scoping
cleanenv <- new.env()
# Using an environment to take advantage of the hashmap implementation of environments to drop the O(N) behavior
# of lookup to O(1). chartr was being slow for some reason. This implementation is now just kinda slow.
cleanenv$BAD_CHARS <- list('a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'c', 'C', 'o', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'n', 'N', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 's', 'S', 'S', 'u', 'U', 'u', 'U', 'u', 'U', 'U', 'y', 'y', 'Y', 'y', 'z', 'Z', 'b', 'B')
names(cleanenv$BAD_CHARS) <- c('á', 'Á', 'à', 'À', 'â', 'Â', 'å', 'Å', 'ä', 'Ä', 'ã', 'Ã', 'æ', 'Æ', 'ç', 'Ç', 'ð', 'é', 'É', 'è', 'È', 'ê', 'Ê', 'ë', 'Ë', 'í', 'Í', 'ì', 'Ì', 'î', 'Î', 'ï', 'Ï', 'ñ', 'Ñ', 'ó', 'Ó', 'ò', 'Ò', 'ô', 'Ô', 'ö', 'Ö', 'õ', 'Õ', 'ø', 'Ø', 'š', 'Š', 'ß', 'ú', 'Ú', 'ù', 'Ù', 'û', 'Û', 'Ü', 'ý', 'ý', 'Ý', 'ÿ', 'ž', 'Ž', 'þ', 'Þ')
cleanenv$BAD_CHARS <- list2env(cleanenv$BAD_CHARS)
# Pattern to test if column is ID. Preferably, supply ID columns via id_cols
cleanenv$ID_WS <- "\\w+ID[^[:alpha:]]|ID$|ID.."
#' Quickly convert a list to a dataframe
#'
#' For lists that have a structure like a `data.frame` (constant number of rows in each column and each column is an atomic vector), add the necessary metadata to the list to transform to a `data.frame` without any checks and change the class. This is a much quicker form of `as.data.frame` when reasonable assumptions of the input list's structure are in place. [Written by Hadley Wickham](http://adv-r.had.co.nz/Profiling.html#be-lazy)
#'
#' @param l A list whose structure mirrors that of a `data.frame`
#' @return A `data.frame` form of the list
#'
#' @name cleanenv$quick_df
#' @keywords internal
cleanenv$quick_df <- function(l) {
class(l) <- "data.frame"
attr(l, "row.names") <- .set_row_names(length(l[[1]]))
l
}
#' Remove whitespace and common NA values
#'
#' If not already taken care of during file reading using \code{\link[data.table]{fread}}, \code{\link[readr]{read_csv}} (and relatives), or \code{\link{read.table}} (and relatives), get rid of excess whitespace in cells and convert extra common "NA" strings to `NA`. To prevent excess class coercion, this only works for character vectors.
#'
#' @param vec A character vector
#' @return A cleared character vector
#'
#' @name cleanenv$clear_artifacts
#' @keywords internal
cleanenv$clear_artifacts <- function(vec) {
stopifnot(is.character(vec))
not_na <- !is.na(vec)
ws_pattern <- "^\\s+|\\s+$"
na_pattern <- "^N\\s?\\/?A$|^#N/A$|^#DIV/0!$|^#VALUE!$"
ws_match <- not_na & grepl(ws_pattern, vec)
na_match <- not_na & grepl(na_pattern, vec, ignore.case = T)
vec[ws_match] <- gsub(ws_pattern, "", vec[ws_match])
vec[na_match] <- gsub(na_pattern, "", vec[na_match], ignore.case = T)
vec[not_na & (vec == "")] <- NA
vec
}
#' Safely coerce character vectors into other classes
#'
#' Converts a character vector to other classes. This tries to coerce each element to a `integer`, `double`, or `complex` (in that order), and if all non-`NA` elements can be coerced, it will execute the coercion.
#'
#' @param vec A character vector to be coerced
#' @return The possibly-coerced vector
#'
#' @name cleanenv$coerce_class
#' @keywords internal
cleanenv$coerce_class <- function(vec) {
stopifnot(is.character(vec))
tryCatch({
as.integer(vec)
}, warning = function(warn) {
tryCatch({
as.double(vec)
}, warning = function(warn) {
tryCatch({
as.complex(vec)
}, warning = function(warn) {
vec
})
})
}, error = function(e) {
vec
})
}
#' Convert integral or character data to dates
#'
#' Essentially, this is a wrapper to as.POSIXct with known pre-defined parameters because as.POSIXct is a pain.
#'
#' @param vec A vector that represents temporal information
#' @return A `POSIXct` vector
#'
#' @name cleanenv$to_date
#' @keywords internal
cleanenv$to_date <- function(vec, format) {
if (is.numeric(vec)) {
as.POSIXct(vec, format = format, origin = "1960-01-01")
} else {
as.POSIXct(vec, format = format)
}
}
#' Drop accented characters
#'
#' Maps known accented characters to their English counterparts.
#'
#' If `stringi` is installed, this will use stringi::stri_trans_general() convert to Latin-ASCII.
#' Otherwise, it will use a hard-coded transformation (please install stringi).
#'
#' @param vec A character vector which may have "accented" characters (Latin/Germanic)
#' @return A character vector without accented characters
#'
#' @name cleanenv$remove_accents
#' @keywords internal
cleanenv$remove_accents <- function(vec) {
stopifnot(is.character(vec))
if (requireNamespace("stringi", quietly = TRUE)) {
return(stringi::stri_trans_general(vec, id = "Latin-ASCII"))
}
message("Using the hard-coded character translations to remove \"accented\" characters. Please install `stringi` to use a better engine.")
vec[!is.na(vec)] <- vapply(vec[!is.na(vec)], function(x) {
tryCatch({
chars <- strsplit(x, "")[[1L]]
for (.c in chars) {
if (!is.null(cleanenv$BAD_CHARS[[.c]])) {
x <- gsub(.c, cleanenv$BAD_CHARS[[.c]], x)
}
}
x
}, error = function(e) {
browser()
warning(paste0("Bad string encountered. Leaving as-is in accent removal..."), call. = FALSE, immediate. = TRUE)
x
})
}, character(1L))
vec
}