repurposing_pipeline_general.Rmd

---
title: "repurposing dose selection"
output: html_document
---

```{r setup, include=FALSE}
library(magrittr)
library(dplyr)
library(stringr)
library(reshape2)
library(ggplot2)
```

The data are already preprossed (feature selection with findCorrelation was performed).
52223 observations, 414 features, 22 metadata
625 different MOA
1552 different Metadata_pert_iname
1571 different Metadata_broad_sample
6 doses for each compounds
136 plates, 384 wells in each
12, 24 or 30 replicates for each compounds (be carefull without looking at doses...)
for each doses: 2, 4, 5 (loosing some replicates in cell profiler if blank, etc.)
3264 DMSO


```{r data}
filename <- "../../input/repurposing/repurposing_normalized.rds"
Pf <- readRDS(filename)

metadata <-
  colnames(Pf) %>% str_subset("^Metadata_")

variables <-
  colnames(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_")

# remove features with zero variance
#Pf %<>%
#  cytominer::select(
#    sample = 
#      Pf %>% 
#      filter(Metadata_broad_sample %in% "DMSO"),
#    variables = variables,
#    operation = "variance_threshold"
#  )

#variables <-
#  names(profiles) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_") # 1784 to 1784 (no change) 
```

```{r filtering}
# select dose at 10um
Pf %<>% filter(Metadata_mmoles_per_liter == 10)

# there are two compounds that are removed because no MOA are associated and also DMSO
Pf %<>% filter(!is.na(Metadata_moa))

# remove "BRD-K60230970-001-10-0@19.9991253536431" because it is a toxic compounds
Pf %<>% filter(Metadata_broad_sample != "BRD-K60230970-001-10-0@19.9991253536431")

# remove features with NA
Pf <- Pf[ , colSums(is.na(Pf)) == 0] # from 1784 to 1615

metadata <-
  colnames(Pf) %>% str_subset("^Metadata_") # loosing 1 metadata.

variables <-
  colnames(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_")

```

1531 unique compounds after filetering only doses at 10um

```{r feature selection}
######################
# Features Selection #
######################
#find Correlation
# remove features that are too correlated (threshold of .9)
Pf %<>%
  cytominer::select(
    sample = Pf,
    variables = variables,
    operation = "correlation_threshold"
  )

variables <-
  names(Pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_") # 794 features (74441x726)
#########################
# End of feat selection #
#########################
Pf %>%
  saveRDS("../../input/repurposing/profile/findCorrelation/repurposing_fc.rds")

```

```{r hit selection}
#################
# Hit Selection #
#################

# uplodaing functions
source("hit_selection_correlation_function.R")
source("hit_selection_jaccard_function.R")

start.time <- Sys.time()

# reproductibility
set.seed(42)
N <- 10
seeds <- sample(1:10000, N, replace=F)
# directory:::
#single cell -> findCorrelation, SVD
#profile -> random, findCorrelation, SVD, all
dir.save <- "repurposing/profile/findCorrelation"
n.rep <- 5

hit.ratio.p <- c()
i <- 0
for(s in seeds){
  print(i)
  i <- i + 1
  hit <- hit_selection_correlation(Pf, 
                                   n.replicate = n.rep, 
                                   #filename = filenames[n],
                                   cor.method = "pearson", 
                                   seed = s, 
                                   nCPU = detectCores(),
                                   N = 5000, 
                                   dir.save = dir.save,
                                   dir.save.plus = "/hit_selected/Pearson/")
  hit.ratio.p <- cbind(hit.ratio.p, hit)
}
print("mean hit ratio: ")
print(mean(hit.ratio.p))
print("standard deviation hit ratio: ")
print(sd(hit.ratio.p)/N)

hit.ratio.j <- c()

num.feat <- round(0.05*length(variables))

i <- 0
for(s in seeds){
  print(i)
  i <- i + 1
  hit <- hit_selection_jaccard(Pf, 
                               n.replicate = n.rep,
                               #filename = filenames[n],
                               n.feat = num.feat,
                               seed = s, 
                               nCPU = detectCores(),
                               N = 5000,
                               dir.save = dir.save,
                               dir.save.plus = "/hit_selected/Jaccard/")
  hit.ratio.j <- cbind(hit.ratio.j, hit)
}
print("mean hit ratio: ")
print(mean(hit.ratio.j))
print("standard deviation hit ratio: ")
print(sd(hit.ratio.j)/N)

end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken

```

```{r enrichment ratio}
source("enrichment_ratio_function_repurposing.R")

data.filenames.p <- list.files(path = paste("../../input/", dir.save, "/hit_selected/Pearson/", sep = ""))
data.filenames.j <- list.files(path = paste("../../input/", dir.save, "/hit_selected/Jaccard/", sep = ""))

# dataframe of result
enrichment.ratio <- data.frame(mean = numeric(0), quant = numeric(0), percent = numeric(0), filename = character(0), method = character(0))

for(i in 1:length(data.filenames.p)){
  print(i)
  seed <- str_split(data.filenames.p[i], "_") %>% unlist
  seed <- str_split(seed, ".rds") %>% unlist
  seed <- seed[length(seed)-1]
  
  pf.p <- readRDS(paste("../../input/", dir.save, "/hit_selected/Pearson/", data.filenames.p[i], sep = ""))
  e.r.p <- enrichment_ratio(pf.p, top.x = 0.02, seed = seed,
                                                 nCPU = detectCores(), ##### TODO: change that!!!
                                                 N = 1000, filename = data.filenames.p[i], method = "Pearson")
  print("enrichment ratio pearson")
  print(e.r.p$percent/e.r.p$mean)### TODO: test that this is working
  enrichment.ratio <- bind_rows(enrichment.ratio, e.r.p)
  
  
  pf.j <- readRDS(paste("../../input/", dir.save, "/hit_selected/Jaccard/", data.filenames.j[i], sep = ""))  
  e.r.j <- enrichment_ratio(pf.j, top.x = 0.02, seed = seed, 
                                                 nCPU = detectCores(), ##### TODO: change that!!!
                                                 N = 1000, filename = data.filenames.j[i], method = "Jaccard")
  print("enrichment ratio jaccard")
  print(e.r.j$percent/e.r.j$mean) ### TODO: test that
  enrichment.ratio <- bind_rows(enrichment.ratio, e.r.j)
}


###### result
enr.ratio <- enrichment.ratio %>% mutate(ratio = percent/mean)

enr.ratio %<>%
  group_by(method) %>%
  summarise(ratio.mean = mean(ratio), 
            ratio.sd = sd(ratio)/sqrt(n()))
enr.ratio

enri.save.name <- "enr_ratio_all.Rda" #"enr_ratio_feat_sel_findCorr_200000.Rda"

filename.enr.ratio <- paste("../../input/", dir.save, "/enrichment_ratio/", 
                  enri.save.name, #data.filenames.p[i], 
                  #".Rda",
                  sep = "")

enr.ratio[enr.ratio=="Jaccard"] <- "Jaccard_profile_all"
enr.ratio[enr.ratio=="Pearson"] <- "Pearson_profile_all"

enr.ratio %<>% mutate(n.feat = length(variables)) #dim(Pf)[2])

save(enr.ratio, file=filename.enr.ratio)


```