-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissimpute.Rmd
257 lines (199 loc) · 8.66 KB
/
missimpute.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
---
title: "missing value imputation (proteomics)"
author: "BS"
date: "24/02/2023"
output: github_document
---
Evaluation of the three missing value imputation method
1. Random forest
2. KNN
3. Left censored data imputation
all three imputation will be performed row-wise (proteins as rows) and column-wise (proteins as columns)
https://rformassspectrometry.github.io/MsCoreUtils/reference/imputation.html
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## load libraries
```{r message=FALSE, warning=FALSE}
library(tidyverse)
library(missForest)
library(impute)
```
## read data and tidy (this is protein groups file from data-independent acquisition (DIA) proteomics)
can be downloaded from ProteomeXchange Consortium via the PRIDE partner
repository, http://proteomecentral.proteomexchange.org; dataset id: PXD040305 (not public yet - 24.02.2023)
```{r}
data_raw <- read.delim("proteingroups.txt") %>%
select(Genes, starts_with("LFQ")) %>%
rename(Variable = Genes) %>%
column_to_rownames("Variable") %>%
log2
```
## count missingness in original data
```{r}
# count missingness
n_total <- nrow(data_raw) * ncol(data_raw)
n_missing <- sum(colSums(is.na(data_raw)))
perc_missing <- (n_total/n_missing)
cat(paste0(round(perc_missing), " %"), "of the data is missing")
```
## before imputation data will be prefiltered (60 % valid values out of all replicates -12/20)
```{r}
data_filtered <- data_raw %>%
mutate(n_valid = 100 - (100 * rowSums(is.na(.))/ncol(data_raw))) %>%
filter(n_valid >= 60) %>%
select(-n_valid)
# count percentage of missing values after filtering
n_total_afterfiltering <- nrow(data_filtered) * ncol(data_filtered)
n_missing_afterfiltering <- sum(colSums(is.na(data_filtered)))
perc_missing_afterfiltering <- (n_missing_afterfiltering/n_total_afterfiltering)
cat(paste0(perc_missing_afterfiltering, " %"), "of the data is missing after filtering")
rm(n_missing, n_total, n_total_afterfiltering, perc_missing, n_missing_afterfiltering)
```
## define functions which perform different types of imputation
### imputation with random forest (RF)
```{r}
RF_function <- function(data) {
data_imputed <- missForest::missForest(as.matrix(data))$ximp
data_imputed <- data_imputed %>%
as.data.frame()
return(data_imputed)
}
```
### imputation with KNN
```{r}
knn_function <- function(data) {
data_imputed <- impute.knn(as.matrix(data), rng.seed = NULL)$data
data_imputed <- data_imputed %>%
as.data.frame()
return(data_imputed)
}
```
### left censored missing data imputation (similar to Perseus, or DEP R package)
```{r}
impute_left_fn <- function(data, downshift = 1.8, width = 0.3) {
# obtain statistics from the data to build a distribution from which values will be imputed
valid_data_descriptives <- data %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity", -Variable) %>%
group_by(Bioreplicate) %>%
summarise(mean_valid = mean(Intensity, na.rm = T),
median_valid = median(Intensity, na.rm = T),
sd_valid = sd(Intensity, na.rm = T),
n_valid = sum(!is.na(Intensity)),
n_missing = nrow(data) - n_valid) %>%
ungroup()
# impute missing values
# imputation
column_indices <- 1:nrow(valid_data_descriptives)
random_data <- list()
# makes the list which contains as many elements as the samples are and as many random values as the
# missing values are in each sample
for (i in column_indices) {
random_data[[i]] <- rnorm(n = valid_data_descriptives$n_missing[i],
mean = valid_data_descriptives$median_valid[i] - (downshift * valid_data_descriptives$sd_valid[i]),
sd = valid_data_descriptives$sd_valid[i] * width)
# impute the missing values
data[is.na(data)[, valid_data_descriptives$Bioreplicate[i]],
valid_data_descriptives$Bioreplicate[i]] <- random_data[[i]]}
return(data)
}
leftcensored_function <- function(data) {
data_imputed <- impute_left_fn(data = data)
return(data_imputed)
}
```
## function which calculates RMSE
```{r}
# n_variable -> how many genes will be randomly taken from an inital expression data which contains 5000 genes without missing values
# imp_fn -> which imputation function
# varialbe_in -> variables in rows or in columns (rows = data is learned for each sample, columns=data is learned for each gene)
# perc_na -> what should be total percantage of missing values in a simulated data (which originally contained only valid values)
RMSE_fn_generic <- function(raw_data=data_raw, n_variable = 4500, imp_fn, variable_in, perc_na = perc_missing_afterfiltering){
# select matrix with valid values only
data_valid <- raw_data %>%
drop_na() %>%
slice(sample(1:5000, n_variable, replace = F))
# randomly introduce NA values which mimics original data after filtering
data_simulated_mis <- prodNA(data_valid, noNA = perc_na)
na_count <- sum(is.na(data_simulated_mis) == TRUE)
# variables are rows
data_variable_in_row <- data_simulated_mis
# variables are columns
data_variable_in_col <- data_simulated_mis %>%
t() %>%
as.data.frame()
# case when variables are rows
if (variable_in == "row") {
# simulated data where missing values are imputed
data_simulated_imputed <- imp_fn(data=data_variable_in_row) %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_with_imp", -Variable)
# original data with all valid values
data_original <- data_valid %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_valid", -Variable)
# combine two data
data_combined <- data_simulated_imputed %>%
left_join(data_original)
}
# case when variables are columns
if (variable_in == "column") {
# simulated data where missing values are imputed
data_simulated_imputed <- imp_fn(data=data_variable_in_col) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_with_imp", -Variable)
# original data with all valid values
data_original <- data_valid %>%
rownames_to_column("Variable") %>%
pivot_longer(names_to = "Bioreplicate", values_to = "Intensity_valid", -Variable)
# combine two data
data_combined <- data_simulated_imputed %>%
left_join(data_original)
}
# calculate RMSE
RMSE <- sqrt(sum((data_combined$Intensity_with_imp - data_combined$Intensity_valid)^2)/na_count)
return(RMSE)
}
```
## calculate RMSE for all three imputation methods (also row- and column-wise)
```{r message=FALSE, warning=FALSE}
# number of replication
n_iteration <- 5
# Random forest
rmse_rf_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = RF_function, variable_in = "row"))
rmse_rf_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = RF_function, variable_in = "column"))
# KNN
rmse_knn_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = knn_function, variable_in = "row"))
rmse_knn_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = knn_function, variable_in = "column"))
# left censored data imputation
rmse_left_row <- replicate(n_iteration, RMSE_fn_generic(imp_fn = leftcensored_function, variable_in = "row"))
rmse_left_col <- replicate(n_iteration, RMSE_fn_generic(imp_fn = leftcensored_function, variable_in = "column"))
```
## combine imputation results and visualize
```{r}
data.frame(rmse_rf_row, rmse_rf_col, rmse_knn_row, rmse_knn_col, rmse_left_row, rmse_left_col) %>%
mutate(iteration = seq_along(1:5)) %>%
rename_all(~str_replace(., "rmse_", "")) %>%
pivot_longer(names_to = "imputation", values_to = "RMSE", -iteration) %>%
mutate(imputation_name = case_when(imputation == "knn_col" | imputation == "knn_row" ~ "KNN",
imputation == "rf_col" | imputation == "rf_row" ~ "RF",
imputation == "left_col"| imputation == "left_row" ~ "Left-censored")) %>%
ggplot(aes(x=imputation, y = RMSE)) +
geom_point(size = 3) +
theme_bw()+
stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") +
facet_wrap(~imputation_name, scales = "free")
```
## RMSE
```{r}
data.frame(rmse_rf_row, rmse_rf_col, rmse_knn_row, rmse_knn_col, rmse_left_row, rmse_left_col) %>%
rownames_to_column("row") %>%
pivot_longer(names_to = "imputation", values_to = "rmse", -row) %>%
group_by(imputation) %>%
summarise(mean_rmse = mean(rmse)) %>%
arrange(-desc(mean_rmse)) %>%
mutate(imputation = str_remove(imputation, "rmse_"))
```