From 4f31123a1c41380d0713a89484ef07b639390def Mon Sep 17 00:00:00 2001 From: Gregor de Cillia Date: Fri, 15 Jan 2021 15:01:51 +0100 Subject: [PATCH] hoteck: cleanup and document --- R/hotdeck.R | 61 ++++++++++++++++++++++++++------------------------ man/hotdeck.Rd | 29 +++++++++++++++--------- 2 files changed, 50 insertions(+), 40 deletions(-) diff --git a/R/hotdeck.R b/R/hotdeck.R index 350d52d..f632c8c 100644 --- a/R/hotdeck.R +++ b/R/hotdeck.R @@ -15,15 +15,15 @@ #' Hot-Deck Imputation -#' +#' #' Implementation of the popular Sequential, Random (within a domain) hot-deck #' algorithm for imputation. -#' -#' +#' +#' #' @param data data.frame or matrix #' @param variable variables where missing values should be imputed (not overlapping with ord_var) #' @param ord_var variables for sorting the data set before imputation (not overlapping with variable) -#' @param domain_var variables for building domains and impute within these +#' @param domain_var variables for building domains and impute within these #' domains #' @param makeNA list of length equal to the number of variables, with values, that should be converted to NA for each variable #' @param NAcond list of length equal to the number of variables, with a condition for imputing a NA @@ -43,30 +43,37 @@ #' @keywords manip #' @family imputation methods #' @examples -#' +#' #' data(sleep) #' sleepI <- hotdeck(sleep) #' sleepI2 <- hotdeck(sleep,ord_var="BodyWgt",domain_var="Pred") -#' +#' #' # Usage of donorcond in a simple example -#' sleepI3 <- hotdeck(sleep,variable=c("NonD","Dream","Sleep","Span","Gest"), -#' ord_var="BodyWgt",domain_var="Pred", -#' donorcond = list(">4","<17",">1.5","%between%c(8,13)",">5")) -#' +#' sleepI3 <- hotdeck( +#' sleep, +#' variable = c("NonD", "Dream", "Sleep", "Span", "Gest"), +#' ord_var = "BodyWgt", domain_var = "Pred", +#' donorcond = list(">4", "<17", ">1.5", "%between%c(8,13)", ">5") +#' ) +#' #' set.seed(132) #' nRows <- 1e3 #' # Generate a data set with nRows rows and several variables -#' x<-data.frame(x=rnorm(nRows),y=rnorm(nRows),z=sample(LETTERS,nRows,replace=TRUE), -#' d1=sample(LETTERS[1:3],nRows,replace=TRUE),d2=sample(LETTERS[1:2],nRows,replace=TRUE), -#' o1=rnorm(nRows),o2=rnorm(nRows),o3=rnorm(100)) +#' x <- data.frame( +#' x = rnorm(nRows), y = rnorm(nRows), +#' z = sample(LETTERS, nRows, replace = TRUE), +#' d1 = sample(LETTERS[1:3], nRows, replace = TRUE), +#' d2 = sample(LETTERS[1:2], nRows, replace = TRUE), +#' o1 = rnorm(nRows), o2 = rnorm(nRows), o3 = rnorm(100) +#' ) #' origX <- x -#' x[sample(1:nRows,nRows/10),1] <- NA -#' x[sample(1:nRows,nRows/10),2] <- NA -#' x[sample(1:nRows,nRows/10),3] <- NA -#' x[sample(1:nRows,nRows/10),4] <- NA -#' xImp <- hotdeck(x,ord_var = c("o1","o2","o3"),domain_var="d2") -#' -#' +#' x[sample(1:nRows,nRows/10), 1] <- NA +#' x[sample(1:nRows,nRows/10), 2] <- NA +#' x[sample(1:nRows,nRows/10), 3] <- NA +#' x[sample(1:nRows,nRows/10), 4] <- NA +#' xImp <- hotdeck(x,ord_var = c("o1", "o2", "o3"), domain_var = "d2") +#' +#' #' @export hotdeck <- function(data , variable=NULL, ord_var=NULL,domain_var=NULL, makeNA=NULL,NAcond=NULL,impNA=TRUE,donorcond=NULL, @@ -179,7 +186,7 @@ imputeHD <- function(xx,variableX,varTypeX,imp_varX,imp_suffixX, setkeyv(xx,prevKey) xx$UniqueIdForImputation <- 1:nrow(xx) } - + if(!is.null(makeNAX)){ # eval(parse(text="xx[xx>1]")) setnames(xx,v,"weirdandlongname") @@ -188,13 +195,9 @@ imputeHD <- function(xx,variableX,varTypeX,imp_varX,imp_suffixX, } setkeyv(xx,v) if(varTypeX[v]%in%c("numeric","integer")){ - setnames(xx,v,"VariableWhichIsCurrentlyImputed") impPart <- xx[J(NA_real_),UniqueIdForImputation,nomatch=FALSE]#$UniqueIdForImputation - setnames(xx,"VariableWhichIsCurrentlyImputed",v) }else{ - setnames(xx,v,"VariableWhichIsCurrentlyImputed") impPart <- xx[J(NA_character_),UniqueIdForImputation,nomatch=FALSE]#$UniqueIdForImputation - setnames(xx,"VariableWhichIsCurrentlyImputed",v) } if((length(impPart)>0)&&(length(impPart)50){ + if(add>min(50, nrow(xx))){ TF <- FALSE # remaining missing values will be set to a random value from the group @@ -263,9 +266,9 @@ imputeHD <- function(xx,variableX,varTypeX,imp_varX,imp_suffixX, # print(v) # print(identical(xImp[,v],xImp1[,v])) #} -# -# -# +# +# +# #require(microbenchmark) #res <- microbenchmark(xImp <- hd(x,ord_var = c("o1","o2","o3"),domain_var="d2"),times=10) diff --git a/man/hotdeck.Rd b/man/hotdeck.Rd index 337f4bf..e9c87c3 100644 --- a/man/hotdeck.Rd +++ b/man/hotdeck.Rd @@ -59,22 +59,29 @@ sleepI <- hotdeck(sleep) sleepI2 <- hotdeck(sleep,ord_var="BodyWgt",domain_var="Pred") # Usage of donorcond in a simple example -sleepI3 <- hotdeck(sleep,variable=c("NonD","Dream","Sleep","Span","Gest"), -ord_var="BodyWgt",domain_var="Pred", -donorcond = list(">4","<17",">1.5","\%between\%c(8,13)",">5")) +sleepI3 <- hotdeck( + sleep, + variable = c("NonD", "Dream", "Sleep", "Span", "Gest"), + ord_var = "BodyWgt", domain_var = "Pred", + donorcond = list(">4", "<17", ">1.5", "\%between\%c(8,13)", ">5") +) set.seed(132) nRows <- 1e3 # Generate a data set with nRows rows and several variables -x<-data.frame(x=rnorm(nRows),y=rnorm(nRows),z=sample(LETTERS,nRows,replace=TRUE), - d1=sample(LETTERS[1:3],nRows,replace=TRUE),d2=sample(LETTERS[1:2],nRows,replace=TRUE), - o1=rnorm(nRows),o2=rnorm(nRows),o3=rnorm(100)) +x <- data.frame( + x = rnorm(nRows), y = rnorm(nRows), + z = sample(LETTERS, nRows, replace = TRUE), + d1 = sample(LETTERS[1:3], nRows, replace = TRUE), + d2 = sample(LETTERS[1:2], nRows, replace = TRUE), + o1 = rnorm(nRows), o2 = rnorm(nRows), o3 = rnorm(100) +) origX <- x -x[sample(1:nRows,nRows/10),1] <- NA -x[sample(1:nRows,nRows/10),2] <- NA -x[sample(1:nRows,nRows/10),3] <- NA -x[sample(1:nRows,nRows/10),4] <- NA -xImp <- hotdeck(x,ord_var = c("o1","o2","o3"),domain_var="d2") +x[sample(1:nRows,nRows/10), 1] <- NA +x[sample(1:nRows,nRows/10), 2] <- NA +x[sample(1:nRows,nRows/10), 3] <- NA +x[sample(1:nRows,nRows/10), 4] <- NA +xImp <- hotdeck(x,ord_var = c("o1", "o2", "o3"), domain_var = "d2") }