BS 24/02/2023
Evaluation of the three missing value imputation method 1. Random forest 2. KNN 3. Left censored data imputation
all three imputation will be performed row-wise (proteins as rows) and column-wise (proteins as columns)
https://rformassspectrometry.github.io/MsCoreUtils/reference/imputation.html
library(tidyverse)
library(missForest)
library(impute)
can be downloaded from ProteomeXchange Consortium via the PRIDE partner repository, http://proteomecentral.proteomexchange.org; dataset id: PXD040305 (not public yet - 24.02.2023)
data_raw <- read.delim("proteingroups.txt") %>%
select(Genes, starts_with("LFQ")) %>%
rename(Variable = Genes) %>%
column_to_rownames("Variable") %>%
log2
# count missingness
n_total <- nrow(data_raw) * ncol(data_raw)
n_missing <- sum(colSums(is.na(data_raw)))
perc_missing <- (n_total/n_missing)
cat(paste0(round(perc_missing), " %"), "of the data is missing")
## 18 % of the data is missing
data_filtered <- data_raw %>%
mutate(n_valid = 100 - (100 * rowSums(is.na(.))/ncol(data_raw))) %>%
filter(n_valid >= 60) %>%
select(-n_valid)
# count percentage of missing values after filtering
n_total_afterfiltering <- nrow(data_filtered) * ncol(data_filtered)
n_missing_afterfiltering <- sum(colSums(is.na(data_filtered)))
perc_missing_afterfiltering <- (n_missing_afterfiltering/n_total_afterfiltering)
cat(paste0(perc_missing_afterfiltering, " %"), "of the data is missing after filtering")
## 0.0232595062316425 % of the data is missing after filtering
rm(n_missing, n_total, n_total_afterfiltering, perc_missing, n_missing_afterfiltering)
RF_function <- function(data) {
data_imputed <- missForest::missForest(as.matrix(data))$ximp
data_imputed <- data_imputed %>%
as.data.frame()
return(data_imputed)
}
knn_function <- function(data) {
data_imputed <- impute.knn(as.matrix(data), rng.seed = NULL)$data
data_imputed <- data_imputed %>%
as.data.frame()
return(data_imputed)
}
impute_left_fn <- function(data, downshift = 1.8, width = 0.3) {
# obtain statistics from the data to build a distribution from which values will be imputed
valid_data_descriptives <- data %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity", -Variable) %>%
group_by(Bioreplicate) %>%
summarise(mean_valid = mean(Intensity, na.rm = T),
median_valid = median(Intensity, na.rm = T),
sd_valid = sd(Intensity, na.rm = T),
n_valid = sum(!is.na(Intensity)),
n_missing = nrow(data) - n_valid) %>%
ungroup()
# impute missing values
# imputation
column_indices <- 1:nrow(valid_data_descriptives)
random_data <- list()
# makes the list which contains as many elements as the samples are and as many random values as the
# missing values are in each sample
for (i in column_indices) {
random_data[[i]] <- rnorm(n = valid_data_descriptives$n_missing[i],
mean = valid_data_descriptives$median_valid[i] - (downshift * valid_data_descriptives$sd_valid[i]),
sd = valid_data_descriptives$sd_valid[i] * width)
# impute the missing values
data[is.na(data)[, valid_data_descriptives$Bioreplicate[i]],
valid_data_descriptives$Bioreplicate[i]] <- random_data[[i]]}
return(data)
}
leftcensored_function <- function(data) {
data_imputed <- impute_left_fn(data = data)
return(data_imputed)
}
# n_variable -> how many genes will be randomly taken from an inital expression data which contains 5000 genes without missing values
# imp_fn -> which imputation function
# varialbe_in -> variables in rows or in columns (rows = data is learned for each sample, columns=data is learned for each gene)
# perc_na -> what should be total percantage of missing values in a simulated data (which originally contained only valid values)
RMSE_fn_generic <- function(raw_data=data_raw, n_variable = 4500, imp_fn, variable_in, perc_na = perc_missing_afterfiltering){
# select matrix with valid values only
data_valid <- raw_data %>%
drop_na() %>%
slice(sample(1:5000, n_variable, replace = F))
# randomly introduce NA values which mimics original data after filtering
data_simulated_mis <- prodNA(data_valid, noNA = perc_na)
na_count <- sum(is.na(data_simulated_mis) == TRUE)
# variables are rows
data_variable_in_row <- data_simulated_mis
# variables are columns
data_variable_in_col <- data_simulated_mis %>%
t() %>%
as.data.frame()
# case when variables are rows
if (variable_in == "row") {
# simulated data where missing values are imputed
data_simulated_imputed <- imp_fn(data=data_variable_in_row) %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_with_imp", -Variable)
# original data with all valid values
data_original <- data_valid %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_valid", -Variable)
# combine two data
data_combined <- data_simulated_imputed %>%
left_join(data_original)
}
# case when variables are columns
if (variable_in == "column") {
# simulated data where missing values are imputed
data_simulated_imputed <- imp_fn(data=data_variable_in_col) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_with_imp", -Variable)
# original data with all valid values
data_original <- data_valid %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_valid", -Variable)
# combine two data
data_combined <- data_simulated_imputed %>%
left_join(data_original)
}
# calculate RMSE
RMSE <- sqrt(sum((data_combined$Intensity_with_imp - data_combined$Intensity_valid)^2)/na_count)
return(RMSE)
}
# number of replication
n_iteration <- 5
# Random forest
rmse_rf_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = RF_function, variable_in = "row"))
rmse_rf_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = RF_function, variable_in = "column"))
# KNN
rmse_knn_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = knn_function, variable_in = "row"))
rmse_knn_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = knn_function, variable_in = "column"))
# left censored data imputation
rmse_left_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = leftcensored_function, variable_in = "row"))
rmse_left_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = leftcensored_function, variable_in = "column"))
data.frame(rmse_rf_row, rmse_rf_col, rmse_knn_row, rmse_knn_col, rmse_left_row, rmse_left_col) %>%
mutate(iteration = seq_along(1:5)) %>%
rename_all(~str_replace(., "rmse_", "")) %>%
pivot_longer(names_to = "imputation", values_to = "RMSE", -iteration) %>%
mutate(imputation_name = case_when(imputation == "knn_col" | imputation == "knn_row" ~ "KNN",
imputation == "rf_col" | imputation == "rf_row" ~ "RF",
imputation == "left_col"| imputation == "left_row" ~ "Left-censored")) %>%
ggplot(aes(x=imputation, y = RMSE)) +
geom_point(size = 3) +
theme_bw()+
stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") +
facet_wrap(~imputation_name, scales = "free")
data.frame(rmse_rf_row, rmse_rf_col, rmse_knn_row, rmse_knn_col, rmse_left_row, rmse_left_col) %>%
rownames_to_column("row") %>%
pivot_longer(names_to = "imputation", values_to = "rmse", -row) %>%
group_by(imputation) %>%
summarise(mean_rmse = mean(rmse)) %>%
arrange(-desc(mean_rmse)) %>%
mutate(imputation = str_remove(imputation, "rmse_"))
## # A tibble: 6 × 2
## imputation mean_rmse
## <chr> <dbl>
## 1 rf_col 0.245
## 2 rf_row 0.248
## 3 knn_col 0.273
## 4 knn_row 0.527
## 5 left_col 0.614
## 6 left_row 4.48