-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRunningCogena.R
102 lines (78 loc) · 4.74 KB
/
RunningCogena.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Astrid M Manuel
# Runing Cogena for drug repositioning strategies
# Last Update 04/05/2021
library(cogena)
setwd("C:/Users/amanuel1/Dev/ZhaoLab/Cogena")
RNAseq <- read.table("C:/users/amanuel1/Dev/ZhaoLab/MS/GSE111972_norm_data.txt", header=T, https://urldefense.proofpoint.com/v2/url?u=http-3A__as.is&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=O_BK_nzMn3aoOgKF9Hm5nHcz7EiQitAVpsLNjBlwgzs&e= = T)
row.names(RNAseq) <- RNAseq[,1]
RNAseq <- RNAseq[,-c(1)]
RNAseq <- data.matrix(RNAseq)
#quantile normalization by Dave Tang, source: https://urldefense.proofpoint.com/v2/url?u=https-3A__davetang.org_muse_2014_07_07_quantile-2Dnormalisation-2Din-2Dr_&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=0kYakmXsG_91I29y3gdYAXO6xpzH9gStvGXALbEZzrY&e=
quantile_normalisation <- function(df){
df_rank <- apply(df,2,rank,ties.method="min")
df_sorted <- data.frame(apply(df, 2, sort))
df_mean <- apply(df_sorted, 1, mean)
index_to_mean <- function(my_index, my_mean){
return(my_mean[my_index])
}
df_final <- apply(df_rank, 2, index_to_mean, my_mean=df_mean)
rownames(df_final) <- rownames(df)
return(df_final)
}
#Log 2 transform, this is common for RNA seq analysis:
RNA_log2T <- (log2(RNAseq+1))
#performing quantile normalization, this is not common for RNA seq analysis, but may help for our analysis:
RNA_normal <- quantile_normalisation(RNA_log2T)
#plotting distributions to show the log2 transform, and normalized distributions:
par(mfrow=c(2,1))
boxplot(RNA_log2T, main="After log2 Tranformation")
boxplot(RNA_normal, main= "After log2 Transform & Quantile Normalization")
#separating cases vs control
GE.control <- RNA_normal[,c(seq(1,16))]
https://urldefense.proofpoint.com/v2/url?u=http-3A__GE.case&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=SQ5bRIcImHZJoMdCdfONRZmgYNna3l9WjvJ7OMUkrIQ&e= <- RNA_normal[,c(seq(17,31))]
#seperating sub-phenotypes (white matter vs grey matter samples)
GE.control.WM <- GE.control[,c(2,4,6,8,10,11,12,13,14,15,16)]
https://urldefense.proofpoint.com/v2/url?u=http-3A__GE.control.GM&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=Vn0OuJBfmWLTeyxriwQPzWZAdj3t1xKoJ0Aiu7IctPo&e= <- GE.control[,-c(2,4,6,8,10,11,12,13,14,15,16)]
GE.case.WM <- GE.case[,c(2,4,6,8,10,11,12,13,14,15)]
https://urldefense.proofpoint.com/v2/url?u=http-3A__GE.case.GM&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=MFG5E0226a2B6fH3pd8zB-L-pVczFmYdMqYTcCK1q1U&e= <- GE.case[,-c(2,4,6,8,10,11,12,13,14,15)]
#Only getting the MS-PRGs from Bayesian framework (Andi's work)
MS_PRGs <- read.table("MS_Risk_GeneList_methy.txt", https://urldefense.proofpoint.com/v2/url?u=http-3A__as.is&d=DwIGAg&c=bKRySV-ouEg_AT-w2QWsTdd9X__KYh9Eq2fdmQDVZgw&r=0_Ey8ObnWR6RQ9TmFjrMS3UcaG_BibJ1VjW_J_E1t10&m=HvT-jxUKm5XwhenTSUiJK6EXJFG6QxqRyB1KepUeEek&s=O_BK_nzMn3aoOgKF9Hm5nHcz7EiQitAVpsLNjBlwgzs&e= = T, header = T)
MS_PRGs <- MS_PRGs$x
ExMatrix <- cbind(GE.case.WM,GE.control.WM)
idx <- which(rownames(ExMatrix) %in% MS_PRGs)
ExMatrix <- ExMatrix[idx,]
sampleLabel <- colnames(ExMatrix)
labels <- c(rep("MS",10), rep("control",11))
names(sampleLabel) <- labels
sampleLabel <- factor(sampleLabel)
#### Running Cogena
devtools::load_all("./")
# KEGG Pathway gene set
annoGMT <- "c2.cp.kegg.v7.01.symbols.gmt.xz"
# GO biological process gene set
# annoGMT <- "c5.bp.v7.0.symbols.gmt.xz"
annofile <- system.file("extdata", annoGMT, package="cogena")
# the number of clusters. It can be a vector.
# nClust <- 2:20
nClust <- 10
# Making factor "Psoriasis" behind factor "ct" means Psoriasis Vs Control
# is up-regualted
sampleLabel <- factor(sampleLabel, levels=c("ct", "Psoriasis"))
# the number of cores.
# ncore <- 8
ncore <- 2
# the clustering methods
# clMethods <- c("hierarchical","kmeans","diana","fanny","som","model",
# "sota","pam","clara","agnes") # All the methods can be used together.
clMethods <- c("hierarchical","pam")
# the distance metric
metric <- "correlation"
# the agglomeration method used for hierarchical clustering
# (hierarchical and agnes)
method <- "complete"
#### Pathway Analysis
# Co-expression Analysis
genecl_result <- coExp(ExMatrix, nClust=nClust, clMethods=clMethods,
metric=metric, method=method, ncore=ncore)
# Enrichment (Pathway) analysis for the co-expressed genes
clen_res <- clEnrich(genecl_result, annofile=annofile, sampleLabel=sampleLabel)