forked from reubenthomas/PFOCRInPathwayAnalyses
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPlotDistributionsOfTDPs.R
168 lines (146 loc) · 8.42 KB
/
PlotDistributionsOfTDPs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
rm(list = ls())
setwd("~/Dropbox (Gladstone)/PFOCRInPathwayAnalyses")
load(paste0(getwd(), "/PFOCRInPathwayAnalyses/databases.RData"))
ocr_length <- sapply(pfocr_list, length)
go_length <- sapply(go_list, length)
wp_length <- sapply(wp_list, length)
GSEid <- list.files(path = "./PFOCRInPathwayAnalyses/GSE/")
minSig <- 5
minSize <- 10
theta <- 0.9
thetaL <- as.character(100*theta)
ChooseGSE <- 1
getTDPQuantiles <- function(ChooseGSE, GSEid, ocr_length, go_length, wp_length, minSig, minSize, theta) {
print(ChooseGSE)
q_ocr <- NA
q_wp <- NA
q_go <- NA
ocr_result <- read.table(paste0(getwd(), "/PFOCRInPathwayAnalyses/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_3sets/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 & temp_ocr_length >= minSize, na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 & temp_ocr_length >= minSize, ], database=rep("ocr", Nocr))
q_ocr <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
wp_result <- read.table(paste0(getwd(), "/PFOCRInPathwayAnalyses/GSE/",GSEid[ChooseGSE],"/rSEA/WP/result.txt"), header = T)
tempIndices <- match(wp_result$set_id, names(wp_length))
temp_wp_length <- wp_length[tempIndices]
Nwp <- sum(!is.na(wp_result$Comp.adjP) & wp_result$Comp.adjP < 0.05 & temp_wp_length >= minSize, na.rm = T)
if(Nwp > minSig) {
sig_wp_result <- data.frame(wp_result[!is.na(wp_result$Comp.adjP) & wp_result$Comp.adjP < 0.05 & temp_wp_length >= minSize, ], database=rep("wp", Nwp))
q_wp <- quantile(sig_wp_result$TDP.estimate, theta, na.rm = T)
}
go_result <- read.table(paste0(getwd(), "/PFOCRInPathwayAnalyses/GSE/",GSEid[ChooseGSE],"/rSEA/GO/result.txt"), header = T)
tempIndices <- match(go_result$set_id, names(go_length))
temp_go_length <- go_length[tempIndices]
Ngo <- sum(!is.na(go_result$Comp.adjP) & go_result$Comp.adjP < 0.05 & temp_go_length >= minSize & temp_go_length < 500, na.rm = T)
if(Ngo > minSig) {
sig_go_result <- data.frame(go_result[!is.na(go_result$Comp.adjP) & go_result$Comp.adjP < 0.05 & temp_go_length >= minSize & temp_go_length < 500, ], database=rep("go", Ngo))
q_go <- quantile(sig_go_result$TDP.estimate, theta, na.rm = T)
}
# PlotData <- rbind(sig_ocr_result, sig_wp_result)
# PlotData <- data.frame(rbind(PlotData, sig_go_result))
# ggplot(PlotData, aes(x=TDP.estimate, color=database)) + stat_ecdf() + geom_vline(xintercept = q_go,lty=2)
# ggplot(PlotData, aes(y=TDP.estimate, x=database)) + geom_boxplot()
return(c(q_ocr, q_wp, q_go))
}
q_results <- t(sapply(1:length(GSEid), getTDPQuantiles, GSEid, ocr_length, go_length, wp_length, minSig, minSize, theta))
q_results <- data.frame(q_results)
colnames(q_results) <- c("median_tdp_ocr_3sets", "median_tdp_wp", "median_tdp_go")
require(ggplot2)
pdf(paste0("pathway_databases_w_pfocr_3sets_comparison_theta_level_", thetaL,"_percent.pdf"))
for(i in 1:(ncol(q_results) - 1)) {
for(j in (i+1):ncol(q_results)) {
databases <- names(q_results)
x <- databases[i]
y <- databases[j]
tempRes <- t.test(q_results[,i], q_results[,j], paired = T)
print(ggplot(q_results, aes(x=q_results[,i], y=q_results[,j]))
+ geom_point()
+ geom_abline(slope = 1, intercept = 0, lty=2, col="red")
+ xlab(x) + ylab(y)
+ ggtitle(paste0("Mean difference = ", round(1e4*tempRes$estimate)/1e4, ";", "pvalue = ", round(1e4*tempRes$p.value)/1e4)))
}
}
dev.off()
get_pfocr_TDPQuantiles <- function(ChooseGSE, GSEid, ocr_length, go_length, wp_length, minSig, minSize, theta) {
print(ChooseGSE)
q_ocr_orig <- NA
q_ocr_noalias <- NA
q_ocr_nobe0 <- NA
q_ocr_nobe2 <- NA
q_ocr_nobe3 <- NA
q_ocr_noprev <- NA
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_noalias", Nocr))
q_ocr_orig <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_noalias/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_noalias", Nocr))
q_ocr_noalias <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_nobe0/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_nobe0", Nocr))
q_ocr_nobe0 <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_nobe2/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_nobe2", Nocr))
q_ocr_nobe2 <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_nobe3/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_nobe3", Nocr))
q_ocr_nobe3 <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
ocr_result <- read.table(paste0(getwd(), "/GSE/",GSEid[ChooseGSE],"/rSEA/PFOCR_noprev/result.txt"), header = T)
tempIndices <- match(ocr_result$set_id, names(ocr_length))
temp_ocr_length <- ocr_length[tempIndices]
Nocr <- sum(!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , na.rm = T)
if(Nocr > minSig) {
sig_ocr_result <- data.frame(ocr_result[!is.na(ocr_result$Comp.adjP) & ocr_result$Comp.adjP < 0.05 , ], database=rep("ocr_noprev", Nocr))
q_ocr_noprev <- quantile(sig_ocr_result$TDP.estimate, theta, na.rm = T)
}
# PlotData <- rbind(sig_ocr_result, sig_wp_result)
# PlotData <- data.frame(rbind(PlotData, sig_go_result))
# ggplot(PlotData, aes(x=TDP.estimate, color=database)) + stat_ecdf() + geom_vline(xintercept = q_go,lty=2)
# ggplot(PlotData, aes(y=TDP.estimate, x=database)) + geom_boxplot()
return(c(q_ocr_orig, q_ocr_noalias, q_ocr_nobe0, q_ocr_nobe2, q_ocr_nobe3, q_ocr_noprev))
}
ocr_q_results <- t(sapply(1:length(GSEid), get_pfocr_TDPQuantiles, GSEid, ocr_length, go_length, wp_length, minSig, minSize, theta))
ocr_q_results <- data.frame(ocr_q_results)
colnames(ocr_q_results) <- c("median_tdp_ocr_orig","median_tdp_ocr_noalias", "median_tdp_ocr_nobe0", "median_tdp_ocr_nobe2", "median_tdp_ocr_nobe3", "median_tdp_ocr_noprev")
pdf(paste0("ocr_databases_comparison_theta_level_", thetaL,"_percent.pdf"))
for(i in 1:(ncol(ocr_q_results) - 1)) {
for(j in (i+1):ncol(ocr_q_results)) {
databases <- names(ocr_q_results)
x <- databases[i]
y <- databases[j]
tempRes <- t.test(ocr_q_results[,i], ocr_q_results[,j], paired = T)
print(ggplot(ocr_q_results, aes(x=ocr_q_results[,i], y=ocr_q_results[,j]))
+ geom_point()
+ geom_abline(slope = 1, intercept = 0, lty=2, col="red")
+ xlab(x) + ylab(y)
+ ggtitle(paste0("Mean difference = ", round(1e4*tempRes$estimate)/1e4, ";", "pvalue = ", round(1e4*tempRes$p.value)/1e4)))
}
}
dev.off()