-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathx3_rnaseq_limma.R
307 lines (240 loc) · 11.6 KB
/
x3_rnaseq_limma.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
### A PARTIR DEL PROTOCOL DEL MASTER UPF: http://functionalgenomics.upf.edu/courses/IEO/DEanalysis2/index.html#21
### i HTSCOUNTS
# setwd("/home/mtormo/ssh_gpfs_projects_lab_genomica/Externs/180413_ACCC6EANXX_ccorral")
out <- "test_DEG/"
###################################################################
### 1. PREPARE THE DATA
### load data:
# library(SummarizedExperiment)
# se <- readRDS("countsSOunionFrg.STAR.GRCm38.84.gtf.ignore-true.rds")
library(DESeq2)
### get counts from htseq-count
htseq.path <- "htscount/"
### phenotype data
pfile <- read.csv(file = "pheno.csv",sep = "\t",header = T)
sampleFiles <- grep("csv",list.files(htseq.path),value=TRUE)
sampleTable <- data.frame(sampleName = pfile$iden,
fileName = sampleFiles,
condition = pfile$sample)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = htseq.path,
design= ~ condition)
ddsHTSeq
### Add gene metadata: get gene information from GTF
require(rtracklayer)
gtf_tags <- readGFF("gtf/Schizosaccharomyces_pombe.ASM294v2.35.chr.gtf", tags = c("gene_id", "gene_name", "gene_biotype"))
genemdata <- data.frame(gtf_tags$gene_id,gtf_tags$gene_name,gtf_tags$gene_biotype)
### get uniques
genemdata <- unique(genemdata)
rownames(genemdata) <- genemdata[,1]
### assign rownames (gene_id)
genemdata[,1] <- NULL
genemdata <- genemdata[ order(rownames(genemdata)) , ]
head(genemdata)
nrow(ddsHTSeq)
nrow(genemdata)
mcols(rowRanges(ddsHTSeq)) <- genemdata
### create a dgelist object, including metadata
library(edgeR)
dge <- DGEList(counts = assays(ddsHTSeq)$counts, group = ddsHTSeq$condition, genes = as.data.frame(mcols(ddsHTSeq)))
dim(dge)
names(dge)
### 2. QUALITY ASSESSMENT
###################################################################
### Sequencing across lanes (ONLY WITH NEXTSEQ)
# png(sprintf("%s1_lanes_distribution.png",out))
# plotMDS(dge, labels=colnames(ddsHTSeq), col=as.integer(ddsHTSeq$condition)+1, cex=0.8, las=1, main ="Distribution of samples across lanes")
#
# # plotMDS(dge, labels=as.integer(as.factor(colnames(ddsHTSeq))), col=as.integer(ddsHTSeq$condition)+1, cex=0.8, las=1, main ="Distribution of samples across lanes")
# # plotMDS(dge, labels=as.integer(se$PatientId), col=as.integer(se$Lane)+1, cex=0.8, las=1)
# # legend("topleft", levels(se$Lane), fill=2:(nlevels(se$Lane)+1))
# # legend("topleft", colnames(ddsHTSeq), fill=2:(nlevels(ddsHTSeq$condition)+1))
# dev.off()
#
# ### they cluster together, collapse replicates then
# dds <- collapseReplicates(ddsHTSeq,groupby = ddsHTSeq$Lane)
#### with summarized experiment
# ### they cluster together by ID, let's sum counts per ID
# ### create unique names (without lane id)
# f <- factor(sub("\\_L00[0-9]", "", colnames(se)))
# ### create a matrix with indexes
# idx <- matrix(rep(f, times=nrow(se)), ncol=ncol(se), byrow=TRUE)
# ### get index of first's occurence sample and create colnames
# mt <- which(!duplicated(idx[1, ]))
# names(mt) <- idx[1, !duplicated(idx[1, ])]
# ### count by sample: create a matrix with all values of each sample, sum counts, and put the name
# cntby <- split(assays(se)$counts, idx)
# cntby <- lapply(cntby, matrix, nrow=nrow(se))
# cntby <- sapply(cntby, rowSums)
# rownames(cntby) <- rownames(se)
# cntby <- cntby[, names(mt)]
# ### get the colData without Lane
# cdata <- colData(se)[mt, -match("Lane", colnames(colData(se)))]
# rownames(cdata) <- names(mt)
# ### get the new summarizedExperiment
# se2 <- SummarizedExperiment(assays=list(counts=cntby),
# rowRanges=rowRanges(se),
# colData=cdata)
# se2
# se <- se2
# rm(se2,cdata,cntby,idx,pdata,pfile)
#
# ### create a dgelist object
# dge <- DGEList(counts = assays(se)$counts)
# dim(dge)
# ### Calculate log2-CPM values and store them into the SummarizedExperiment object.
# assays(se)$logCPM <- cpm(dge, log = TRUE, prior.count=0.5)
# ### create a dgelist object
# dge <- DGEList(counts = assays(dds)$counts, genes = as.data.frame(mcols(dds)))
# dim(dge)
### without lane concatenation, impute dds object
dds <- ddsHTSeq
### Calculate log2-CPM values and store them into the DDS object.
assays(dds)$logCPM <- cpm(dge, log = TRUE, prior.count=0.5)
###################################################################
### Library sizes (total number of sequence read counts per sample)
count_per_sample <- as.data.frame(colSums(assays(dds)$counts))
colnames(count_per_sample) <- "counts"
png(sprintf("%s2_reads_per_sample.png",out))
barplot(sort(count_per_sample$counts/1000000), xlab = "Samples", ylab = "M Reads",main = "Reads per sample", names.arg = rownames(count_per_sample),cex.names = 0.6,las=2)
dev.off()
###################################################################
### Distribution of expression levels among samples
library(lattice)
library(reshape2)
cpm_log <- melt(assays(dds)$logCPM)
png(sprintf("%s3_expression_samples.png",out))
densityplot(~ value, groups = Var2, data = cpm_log, xlab="log2-CPM",main="Expression among samples",plot.points=FALSE,auto.key=TRUE)
dev.off()
###################################################################
### Filtering of lowly-expressed genes
avgexp <- rowMeans(assays(dds)$logCPM)
png(sprintf("%s4_expression_genes.png",out))
hist(avgexp, xlab="log2 CPM", main="Distribution of average expression level per gene", las=1)
abline(v=1, col="red", lwd=2)
dev.off()
### Filter out lowly-expressed genes.
mask <- avgexp > 1
# dim(dds)
# [1] 47729 20
dds.filt <- dds[mask, ]
# dim(dds.filt)
# [1] 13342 20
# dim(dge)
# [1] 47729 20
dge.filt <- dge[mask, ]
# dim(dge.filt)
# [1] 13342 20
###################################################################
### 3. NORMALIZATION
### calculate normalization factors
### The Trimmed Mean of M-values (TMM) method addresses the issue of the different RNA composition of the samples by estimating a scaling factor for each library
dge.filt <- calcNormFactors(dge.filt)
### and replace cpm by normalized cpm
assays(dds.filt)$logCPM <- cpm(dge.filt, normalized.lib.sizes=TRUE,log=TRUE, prior.count=0.5)
###################################################################
### MA-plots: one condition VS another
png(sprintf("%s5_MA-plots.png",out))
par(mfrow = c(1, 2), oma=c(0,0,1,0))
### first: between samples
dge.filt$samples$group <- dds.filt$condition
table(dge.filt$samples$group)
plotSmear(dge.filt, lowess = TRUE, las = 1, cex.lab = 1.5, cex.axis = 1.2)
abline(h = 0, col = "blue", lwd = 2)
# ### second: between treatments
# dge.filt$samples$group <- dds.filt$treatment
# table(dge.filt$samples$group)
# plotSmear(dge.filt, lowess = TRUE, las = 1, cex.lab = 1.5, cex.axis = 1.2)
# abline(h = 0, col = "blue", lwd = 2)
title("MA plots", outer = TRUE)
dev.off()
###################################################################
### Batch identification
### EDITAR!!!
png(sprintf("%s6_batch_identification1.png",out))
par(mfrow = c(1, 2), oma=c(0,0,1,0))
plotMDS(dge.filt, labels=colnames(dds.filt), col=as.integer(dds.filt$condition)+1, cex=0.8, las=1)
legend("topleft", levels(dds.filt$treatment), fill=2:(nlevels(dds.filt$treatment)+1))
# plotMDS(dge.filt, labels=dds.filt$condition, col=as.integer(dds.filt$sample)+1, cex=0.8, las=1)
# legend("topleft", levels(se.filt$sample), fill=2:(nlevels(se.filt$sample)+1))
title("Batch identification", outer = TRUE)
dev.off()
png(sprintf("%s6_batch_identification2.png",out))
par(mfrow = c(1, 1), oma=c(0,0,1,0))
plotMDS(dge.filt, labels=colnames(dds.filt), col=as.integer(dds.filt$condition)+1, cex=0.8, las=1, main="Batch identification by sample & treatment")
legend("topleft", levels(dds.filt$samp_treat), fill=2:(nlevels(dds.filt$samp_treat)+1),cex=0.6)
dev.off()
### cluster dendrogram
tcounts <- t(assays(dds.filt)$logCPM)
clusters <- hclust(dist(tcounts))
plot(clusters)
###################################################################
### 4. DIFFERENTIAL EXPRESSION
###################################################################
### FACTORIAL DESIGNS
### Factorial designs, also known as factorial experiments, are experimental designs where the outcome of interest consists of two or more factors.
### The limma Users's Guide, Section 9.5 and elsewhere, provides guidance and specific examples on how to analyze a factorial design.
### There are several ways to approach this kind of analysis. One that facilitates the interpretation is building a single factor out of the combination
### of the factors under study and set the intercept term to zero.
### Fit a linear regression model using this new factor variable without intercept term.
mod <- model.matrix(~0 + dds.filt$condition, colData(dds.filt))
colnames(mod) <- levels(dds.filt$condition)
mod0 <- model.matrix(~1, colData(dds.filt))
### We can adjust for unkown covariates using surrogate variable analysis (SVA).
### First, estimate surrogate variables (SVs) from the log-CPM values calculated by voom:
library(sva)
sv <- sva(assays(dds.filt)$logCPM, mod = mod, mod0 = mod0)
### Second, add these (SVs) to the design matrix:
mod <- cbind(mod, sv$sv)
colnames(mod) <- c(colnames(mod)[1:3], paste0("SV", 1:sv$n))
### Third, calculate again mean-variance weights:
v <- voom(dge.filt, mod, plot = FALSE)
### adjust for correlations with technical repilcats: NO!
# corfit <- duplicateCorrelation(v, mod, block = se.filt$Iden)
# v <- voom(dge, mod, block = se$celline, correlation = corfit$consensus, plot = FALSE)
# fit6 <- lmFit(v, mod, block = se$cellline, correlation = corfit$consensus)
### Fourth, fit again the linear models for each gene with the updated design matrix:
fit6 <- lmFit(v, mod)
### EDITAR!!
### Build the contrast matrix that specifies the contrasts of interest between a set of coefficients estimated from a linear regression model.
#cont.matrix <- makeContrasts(wt2=wt_rim-wt_v, ko2=fx_rim-fx_v, treat=fx_rim-wt_rim, untreated=fx_v-wt_v, levels=mod)
cont.matrix <- makeContrasts(can=CAN, hs=HS, mm=MM, levels=mod)
### Estimate coefficients for a given set of contrasts.
fit6 <- contrasts.fit(fit6, cont.matrix)
### Calculate moderated t-statistics.
fit6 <- eBayes(fit6)
head(fit6$t)
### Fetch table of results for each coefficient.
# wt2 = topTable(fit6, coef = "wt2", n=Inf)
# treated = topTable(fit6, coef = "treat", n=Inf)
# untreated = topTable(fit6, coef = "untreated", n=Inf)
cantt = topTable(fit6, coef = "can", n=Inf)
# nrow(wt2[wt2$adj.P.Val<0.1,])
# nrow(ko2[ko2$adj.P.Val<0.1,])
# nrow(treated[treated$adj.P.Val<0.1,])
# nrow(untreated[untreated$adj.P.Val<0.1,])
nrow(cantt[cantt$adj.P.Val<0.05,])
### Explore graphically the overlap of DE genes between contrasts of interest.
res <- decideTests(fit6, p.value = 0.1)
png(sprintf("%s7_pvalues_distribution.png",out))
par(mfrow = c(2, 2), oma=c(0,0,1,0))
# hist(wt2$P.Value, xlab = "Raw P-values", main = "WT", las = 1)
# hist(ko2$P.Value, xlab = "Raw P-values", main = "KO", las = 1)
# hist(treated$P.Value, xlab = "Raw P-values", main = "TREATED", las = 1)
# hist(untreated$P.Value, xlab = "Raw P-values", main = "UNTREATED", las = 1)
hist(cantt$P.Value, xlab = "Raw P-values", main = "CAN", las = 1)
title("Raw P-values distribution",outer=T)
dev.off()
summary(res)
head(fit6$t["ENSMUSG00000000838",])
png(sprintf("%s8_venn_diagram.png",out))
vennDiagram(res)
dev.off()
### volcanoplot
# volcanoplot(fit6, coef = 2, highlight = 7, names = rownames(fit6$t), main = "DE genes",las = 1)
# ###
# top <- order(fit6$lods[, 2], decreasing = TRUE)[1:7]
# limma::plotMA(fit6, coef = 2, status = rownames(fit6$lods), legend = FALSE,main = "Model 5", hl.pch = 46, hl.cex = 4, bg.pch = 46, bg.cex = 3, las = 1)
# text(fit6$Amean[top], fit6$coef[top, 2], cex = 0.5, pos = 4)
save.image()
load(".RData")