-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrepurposing_pipeline_general.Rmd
231 lines (180 loc) · 6.71 KB
/
repurposing_pipeline_general.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
---
title: "repurposing dose selection"
output: html_document
---
```{r setup, include=FALSE}
library(magrittr)
library(dplyr)
library(stringr)
library(reshape2)
library(ggplot2)
```
The data are already preprossed (feature selection with findCorrelation was performed).
52223 observations, 414 features, 22 metadata
625 different MOA
1552 different Metadata_pert_iname
1571 different Metadata_broad_sample
6 doses for each compounds
136 plates, 384 wells in each
12, 24 or 30 replicates for each compounds (be carefull without looking at doses...)
for each doses: 2, 4, 5 (loosing some replicates in cell profiler if blank, etc.)
3264 DMSO
```{r data}
filename <- "../../input/repurposing/repurposing_normalized.rds"
Pf <- readRDS(filename)
metadata <-
colnames(Pf) %>% str_subset("^Metadata_")
variables <-
colnames(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_")
# remove features with zero variance
#Pf %<>%
# cytominer::select(
# sample =
# Pf %>%
# filter(Metadata_broad_sample %in% "DMSO"),
# variables = variables,
# operation = "variance_threshold"
# )
#variables <-
# names(profiles) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_") # 1784 to 1784 (no change)
```
```{r filtering}
# select dose at 10um
Pf %<>% filter(Metadata_mmoles_per_liter == 10)
# there are two compounds that are removed because no MOA are associated and also DMSO
Pf %<>% filter(!is.na(Metadata_moa))
# remove "BRD-K60230970-001-10-0@19.9991253536431" because it is a toxic compounds
Pf %<>% filter(Metadata_broad_sample != "BRD-K60230970-001-10-0@19.9991253536431")
# remove features with NA
Pf <- Pf[ , colSums(is.na(Pf)) == 0] # from 1784 to 1615
metadata <-
colnames(Pf) %>% str_subset("^Metadata_") # loosing 1 metadata.
variables <-
colnames(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_")
```
1531 unique compounds after filetering only doses at 10um
```{r feature selection}
######################
# Features Selection #
######################
#find Correlation
# remove features that are too correlated (threshold of .9)
Pf %<>%
cytominer::select(
sample = Pf,
variables = variables,
operation = "correlation_threshold"
)
variables <-
names(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_") # 794 features (74441x726)
#########################
# End of feat selection #
#########################
Pf %>%
saveRDS("../../input/repurposing/profile/findCorrelation/repurposing_fc.rds")
```
```{r hit selection}
#################
# Hit Selection #
#################
# uplodaing functions
source("hit_selection_correlation_function.R")
source("hit_selection_jaccard_function.R")
start.time <- Sys.time()
# reproductibility
set.seed(42)
N <- 10
seeds <- sample(1:10000, N, replace=F)
# directory:::
#single cell -> findCorrelation, SVD
#profile -> random, findCorrelation, SVD, all
dir.save <- "repurposing/profile/findCorrelation"
n.rep <- 5
hit.ratio.p <- c()
i <- 0
for(s in seeds){
print(i)
i <- i + 1
hit <- hit_selection_correlation(Pf,
n.replicate = n.rep,
#filename = filenames[n],
cor.method = "pearson",
seed = s,
nCPU = detectCores(),
N = 5000,
dir.save = dir.save,
dir.save.plus = "/hit_selected/Pearson/")
hit.ratio.p <- cbind(hit.ratio.p, hit)
}
print("mean hit ratio: ")
print(mean(hit.ratio.p))
print("standard deviation hit ratio: ")
print(sd(hit.ratio.p)/N)
hit.ratio.j <- c()
num.feat <- round(0.05*length(variables))
i <- 0
for(s in seeds){
print(i)
i <- i + 1
hit <- hit_selection_jaccard(Pf,
n.replicate = n.rep,
#filename = filenames[n],
n.feat = num.feat,
seed = s,
nCPU = detectCores(),
N = 5000,
dir.save = dir.save,
dir.save.plus = "/hit_selected/Jaccard/")
hit.ratio.j <- cbind(hit.ratio.j, hit)
}
print("mean hit ratio: ")
print(mean(hit.ratio.j))
print("standard deviation hit ratio: ")
print(sd(hit.ratio.j)/N)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
```
```{r enrichment ratio}
source("enrichment_ratio_function_repurposing.R")
data.filenames.p <- list.files(path = paste("../../input/", dir.save, "/hit_selected/Pearson/", sep = ""))
data.filenames.j <- list.files(path = paste("../../input/", dir.save, "/hit_selected/Jaccard/", sep = ""))
# dataframe of result
enrichment.ratio <- data.frame(mean = numeric(0), quant = numeric(0), percent = numeric(0), filename = character(0), method = character(0))
for(i in 1:length(data.filenames.p)){
print(i)
seed <- str_split(data.filenames.p[i], "_") %>% unlist
seed <- str_split(seed, ".rds") %>% unlist
seed <- seed[length(seed)-1]
pf.p <- readRDS(paste("../../input/", dir.save, "/hit_selected/Pearson/", data.filenames.p[i], sep = ""))
e.r.p <- enrichment_ratio(pf.p, top.x = 0.02, seed = seed,
nCPU = detectCores(), ##### TODO: change that!!!
N = 1000, filename = data.filenames.p[i], method = "Pearson")
print("enrichment ratio pearson")
print(e.r.p$percent/e.r.p$mean)### TODO: test that this is working
enrichment.ratio <- bind_rows(enrichment.ratio, e.r.p)
pf.j <- readRDS(paste("../../input/", dir.save, "/hit_selected/Jaccard/", data.filenames.j[i], sep = ""))
e.r.j <- enrichment_ratio(pf.j, top.x = 0.02, seed = seed,
nCPU = detectCores(), ##### TODO: change that!!!
N = 1000, filename = data.filenames.j[i], method = "Jaccard")
print("enrichment ratio jaccard")
print(e.r.j$percent/e.r.j$mean) ### TODO: test that
enrichment.ratio <- bind_rows(enrichment.ratio, e.r.j)
}
###### result
enr.ratio <- enrichment.ratio %>% mutate(ratio = percent/mean)
enr.ratio %<>%
group_by(method) %>%
summarise(ratio.mean = mean(ratio),
ratio.sd = sd(ratio)/sqrt(n()))
enr.ratio
enri.save.name <- "enr_ratio_all.Rda" #"enr_ratio_feat_sel_findCorr_200000.Rda"
filename.enr.ratio <- paste("../../input/", dir.save, "/enrichment_ratio/",
enri.save.name, #data.filenames.p[i],
#".Rda",
sep = "")
enr.ratio[enr.ratio=="Jaccard"] <- "Jaccard_profile_all"
enr.ratio[enr.ratio=="Pearson"] <- "Pearson_profile_all"
enr.ratio %<>% mutate(n.feat = length(variables)) #dim(Pf)[2])
save(enr.ratio, file=filename.enr.ratio)
```