-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathde_functions.R
230 lines (181 loc) · 6.75 KB
/
de_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
suppressPackageStartupMessages({
# cran
library("devtools")
library("ActivePathways")
library("gplots")
# bioconductor
library("getDEE2")
library("DESeq2")
library("edgeR")
})
# tool to write gmt file
# let's assume species is human for now
SPECIES = "hsapiens"
main<-function(x1,mdat) {
message(x1)
# define the SRP
SRP = strsplit(x1,":")[[1]][1]
# define contrast name
contrast_name = strsplit(x1,":")[[1]][2]
# get control group info
control = strsplit(x1,":")[[1]][3]
# get control group name
control_name = strsplit(control,";")[[1]][1]
# get control groups samples
# note that we are extracting SRX data not SRR
control_data = strsplit(control,";")[[1]][2]
control_data <- unlist(strsplit(control_data, ","))
control_data <- gsub(" ","",control_data)
control_data <- unique(control_data)
# get case group info
case = strsplit(x1,":")[[1]][4]
# get case group name
case_name = strsplit(case,";")[[1]][1]
# get case groups samples
case_data = strsplit(case,";")[[1]][2]
case_data <- unlist(strsplit(case_data, ","))
case_data <- gsub(" ","",case_data)
case_data <- unique(case_data)
# Check that control and case samples do not have any common samples
itx <- length(intersect(control_data,case_data))
if (itx>0) {
return("Error: there are some samples marked as both controls and treatments")
}
# set up the sample sheet starting with a list of samples
ss <- as.data.frame(paste(c(control_data,case_data),sep=","),stringsAsFactors=FALSE)
colnames(ss) <- SRP
# define case group
ss$case <- as.numeric(ss[,1] %in% case_data)
ss
# set up the model
mm <- model.matrix(~case,ss)
rownames(mm) <- ss[,1]
mm
# filter for samples in this study
mdat1 <- mdat[which(mdat$SRP_accession==SRP),]
# filter for only the relevant samples
mdat1 <- mdat1[which(mdat1$SRX_accession %in% rownames(mm)),]
# show the file
mdat1
# fetch the data
y <- getDEE2(SRRvec = mdat1$SRR_accession, species = SPECIES, metadata = mdat, legacy = TRUE)
# look at the data structure
str(y)
head(y$GeneCounts)
# remove failed runs - gotta be removed from metadata as well
pass <- y$GeneCounts[,grep("FAIL",y$MetadataSummary$QC_summary,invert = TRUE)]
dim(pass)
if (is.null(ncol(pass))) {
return(paste("Only one sample available",SRP))
}
if (ncol(pass)==0) {
return(paste("No runs passing QC for",SRP))
} else {
y$GeneCounts <- pass
y$MetadataSummary <- y$MetadataSummary[which(rownames(y$MetadataSummary) %in% colnames(pass)),]
}
# aggregate to experiment level
yy <- getDEE2::srx_agg(y,counts="GeneCounts")
# show the count matrix
head(yy)
# stick on the gene names
rownames(yy) <- paste(rownames(yy),as.character(y$GeneInfo$GeneSymbol),sep=" ")
# remove genes with fewer than 10 reads
yy <- yy[which(rowMeans(yy)>10),]
# if after filtering there are no genes left
if (is.null(dim(yy))) { return("There no detected genes after filtering.") }
# remove samples with fewer than 1000 expressed genes
n_expressed_genes <- apply(yy,2,function(x) { length(which(x>10)) } )
yy <- yy[,which(n_expressed_genes > 1000)]
dim(yy)
# remove data from sampleshet and modelmatrix
ss <- ss[which(ss[,1] %in% colnames(yy)),]
mm <- mm[which(rownames(mm) %in% colnames(yy)),]
# make sure that control and treatment have 2 or more replicates
n_ctrls=length(which(ss$case==0))
n_cases=length(which(ss$case==1))
if (n_ctrls<2) { return("There are fewer than two samples in the control group") }
if (n_cases<2) { return("There are fewer than two samples in the case group") }
MDS <- function(x,ss, ...) {
mydist <- cmdscale(dist(t(x)))
myrange <- range(mydist[,1])*1.3
colour_palette <- c("blue","red")
colours <- colour_palette[as.integer(factor(ss$case))]
plot(mydist, xlab="Coordinate 1", ylab="Coordinate 2",
type = "n", main=colnames(ss)[1], xlim=myrange, ...)
text(mydist, labels=ss[,1], cex=0.9, col=colours)
}
MDS(yy, ss )
dds=NULL
dds <- DESeqDataSetFromMatrix(countData = yy, colData = ss, design= mm )
dds <- DESeq(dds)
de <- results(dds)
# RPM
yyy <- yy/colMeans(yy) * 1000000
res <- cbind(de,yyy,yy)
res <- res[order(res$pvalue),]
res[1:10,1:6]
# define up and down-regulated gene lists
up <- head(rownames(subset(de, log2FoldChange>0 & padj<0.05 )),500)
dn <- head(rownames(subset(de, log2FoldChange<0 & padj<0.05 )),500)
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main=contrast_name, cex.main=0.7)
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER,cex = 0.7)
top <- res[1:40,7:ncol(res)]
top <- top[,1:(ncol(top)/2)]
colfunc <- colorRampPalette(c("blue", "white", "red"))
colCols <- as.character(ss$case)
colCols <- gsub("1","orange",colCols)
colCols <- gsub("0","yellow",colCols)
heatmap.2( as.matrix(top), col=colfunc(25),scale="row", trace="none",
margins = c(6,6), cexRow=.4, cexCol = 0.6, main=SRP,
ColSideColors = colCols )
# curate the gene sets (ensembl accessions)
if(length(up)>9) {
upg <- unique(sapply(strsplit(up," "),"[[",1))
setid = paste(SRP, contrast_name," upregulated",sep=":")
setname = paste("GeneSetCommons",SRP,as.integer(as.numeric(Sys.time())),sep=" ")
upes <- list("id"=setid,"name"=setname,"genes"=upg)
} else {
upes=NULL
}
if(length(dn)>9) {
dng <- unique(sapply(strsplit(dn," "),"[[",1))
setid = paste(SRP, contrast_name," downregulated",sep=":")
setname = paste("GeneSetCommons",SRP,as.integer(as.numeric(Sys.time())),sep=" ")
dnes <- list("id"=setid,"name"=setname,"genes"=dng)
} else {
dnes=NULL
}
# curate the gene sets (gene symbols)
if(length(up)>9) {
upg <- unique(sapply(strsplit(up," "),"[[",2))
setid = paste(SRP, contrast_name," upregulated",sep=":")
setname = paste("GeneSetCommons",SRP,as.integer(as.numeric(Sys.time())),sep=" ")
upgs <- list("id"=setid,"name"=setname,"genes"=upg)
} else {
upgs=NULL
}
if(length(dn)>9) {
dng <- unique(sapply(strsplit(dn," "),"[[",2))
setid = paste(SRP, contrast_name," downregulated",sep=":")
setname = paste("GeneSetCommons",SRP,as.integer(as.numeric(Sys.time())),sep=" ")
dngs <- list("id"=setid,"name"=setname,"genes"=dng)
} else {
dngs=NULL
}
list("counts"=yy,"samplesheet"=ss,"design"=mm,
"de"=de,
"ens_up"=upes, "ens_dn"=dnes,
"gs_up"=upgs, "gs_dn"=dngs)
}