-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02_deseq.Rmd
360 lines (266 loc) · 14.8 KB
/
02_deseq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
---
title: "DESeq analysis"
author: "Natalia Andrade and Ira Cooke"
date: "07/08/2017"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.width=12, fig.height=8,
echo=FALSE, warning=FALSE, message=FALSE)
knitr::opts_chunk$set(cache=TRUE)
options(width = 60)
# Run if needed to install ComplexHeatmap
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("ComplexHeatmap")
library(knitr)
library(DESeq2)
library(tidyverse)
library(ggrepel)
library(ComplexHeatmap)
library(ggpubr)
library(circlize)
library(colorspace)
library(cowplot)
```
Counts obtained from Corset were analysed with DESeq to identify differentially expressed genes between HardCoral treatments.
```{r, include=FALSE}
#First we prepare the data.
#This involves reading in a counts matrix from corset.
#Then we extract the sample names from the columns and match those to conditions in an excel file that describes the experimental conditions for each sample.
counts <- read_tsv("hpc/corset/03-counts.txt") %>%
column_to_rownames(var="X1") %>%
as.matrix()
sample_data <- read_csv("raw_data/Samples_data.csv",trim_ws = TRUE)
colnames(sample_data) <- c("ID","Tank","HardCoral",'treat',"SoftCoralControl","PdvsL_Other")
sample_data_row_order <- match(colnames(counts),sample_data$ID)
```
```{r}
clusters <- read_tsv("hpc/corset/03-clusters.txt",col_names = c("transcript_id","cluster_id"))
symb_barnacle_transcripts <- read_tsv("hpc/minimap/transcriptome2ref_nonplut.tsv", col_names = c("transcript_id","hit_id","percent_identity","num_matches","score"))
barnacle_transcripts <- read_tsv("hpc/minimap/transcriptome2ref_aa.tsv", col_names = c("transcript_id","hit_id","percent_identity","num_matches","score"))
symb_barnacle_clusters <- clusters %>% filter(transcript_id %in% symb_barnacle_transcripts$transcript_id)
barnacle_clusters <- clusters %>% filter(transcript_id %in% barnacle_transcripts$transcript_id)
counts_plut <- counts[which(!(rownames(counts) %in% symb_barnacle_clusters$cluster_id)),]
counts_barnacle <- counts[which((rownames(counts) %in% barnacle_clusters$cluster_id)),]
```
Next we identify cluster ids that correspond to Symbiont or Barnacle transcripts. A total of `r barnacle_transcripts %>% nrow()` barnacle transcripts from `r nrow(barnacle_clusters)` were excluded as a result.
Initial data exploration with PCA revealed that PdLd is an extreme outlier. We therefore excluded this sample from further analysis. We also can see from the Barnacle counts that these were heavily concentrated in three samples suggesting that only these three samples were infected with coral-inhabiting barnacles. It is also clear from this figure that a small fraction of transcripts identified as potentially of barnacle origin were found across all samples. Although these transcripts are are most likely not of barnacle origin they were excluded from analysis because their status (coral or barnacle) was ambiguous.
```{r}
col_fun_barn = colorRamp2(c(0, 9), c("white", "red"))
hm <- Heatmap(log2(counts_barnacle+1),
col = col_fun_barn,
show_row_names = FALSE,
heatmap_legend_param = list(title="Log Count", legend_width = unit(10,"cm")))
hm_plot <- grid.grabExpr(draw(hm))
plot_grid(hm_plot)
ggsave2("figures/barnacle_hm.png", width = 18,height = 10, units = "cm", dpi = 300)
ggsave2("figures/barnacle_hm.pdf", width = 18,height = 10, units = "cm", dpi = 300)
```
```{r, include=FALSE, eval=FALSE}
dds <- DESeqDataSetFromMatrix(countData = counts_plut,colData=sample_data[sample_data_row_order,],design = ~ treat+ HardCoral)
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
pca_data <- plotPCA(vsd,intgroup=c("HardCoral","SoftCoralControl"),returnData=TRUE)
percentVar <- round(100 * attr(pca_data, "percentVar"))
pca_data %>%
ggplot(aes(x=PC1,y=PC2)) +
geom_point(aes(color=SoftCoralControl,shape=HardCoral),size=4) +
geom_label_repel(aes(label=group)) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme(text = element_text(size=20))
```
Differential Expression Analysis with the outlier excluded was then performed based on a model with no intercept and a separate coefficient for each level of the variable `HardCoralTrt`. This model gives us maximum flexibility for statistical testing using different contrasts.
> ~ 0+HardCoralTrt
One issue with the above analysis is that it estimates a separate value for each of the control samples, meaning that each such estimate will be done without replicates. Perhaps a better model to capture this experiment would be to estimate a treatment effect, a hard coral effect, and their interaction
```{r}
#counts2 <- data.frame(counts_plut) %>% select(-PdLd,-PfLc,-PdLe,-PdLa) %>% as.matrix()
counts2 <- data.frame(counts_plut) %>% select(-PdLd) %>% as.matrix()
sample_data2 <- sample_data[match(colnames(counts2),sample_data$ID),] %>%
unite("HardCoralTrt",HardCoral,treat,remove = FALSE) %>%
mutate(HardCoralTrt = ifelse(ID %in% c("PfC","PdC"),"C",HardCoralTrt))
dds2_raw <- DESeqDataSetFromMatrix(countData = counts2,colData=sample_data2,design = ~ 0+HardCoralTrt)
# Exclude samples where one of the controls (but not both) have zero counts
# This is because;
# A. Cook's distance filtering doesn't work for factors with 2 replicates
# B. If only one of the controls is 0 then it will be an outlier. But if they both are we will assume this is consistent low counts
#
keep <- rowSums(counts(dds2_raw)[,c("PfC","PdC")]<5) != 1
dds2 <- dds2_raw[keep,]
if (file.exists("cache/dds2.rds")) {
dds2 <- read_rds("cache/dds2.rds")
} else {
dds2 <- DESeq(dds2, parallel = TRUE)
write_rds(dds2,"cache/dds2.rds")
}
resultsNames(dds2)
```
Before proceeding with contrasts we first check a PCA with the outlier excluded. This shows very clearly that Porites genotype is the dominant source of variation in the data, accounting for 89%. PC2 accounts for just 5% of variation and captures differences due to the competing Lobophytum colony and whether competition exists at all (Control).
```{r}
vsd2 <- varianceStabilizingTransformation(dds2, blind=FALSE)
pca_data2 <- plotPCA(vsd2,intgroup=c("HardCoral","SoftCoralControl"),returnData=TRUE)
percentVar2 <- round(100 * attr(pca_data2, "percentVar"))
pca_data2 %>%
ggplot(aes(x=PC1,y=PC2)) +
geom_point(aes(color=SoftCoralControl,shape=HardCoral),size=4) +
geom_label_repel(aes(label=group)) +
xlab(paste0("PC1: ",percentVar2[1],"% variance")) +
ylab(paste0("PC2: ",percentVar2[2],"% variance")) +
theme(text = element_text(size=20)) +
theme_pubclean() +
theme(legend.position = "right")
ggsave("figures/pca.png", width = 12, height = 8)
```
To set up contrasts we first use `resultsNames()` to extract a list of fitted coefficients in the model
```{r}
resultsNames(dds2)
```
Based on this the following contrast should highlight genes consistently different between control and treatment
`contrast_ct <- c(1,-0.5,-0.5)` which captures genes consistently different between treatment and control across both Porites colonies
```{r}
res_ct <- results(dds2,contrast = list(c("HardCoralTrtC"),c("HardCoralTrtPd_T","HardCoralTrtPf_T")),listValues=c(1,-1/2), tidy = TRUE) %>%
arrange(padj) %>%
filter(padj<0.1)
quantifiable_clusters <- results(dds2,contrast = list(c("HardCoralTrtC"),c("HardCoralTrtPd_T","HardCoralTrtPf_T")),listValues=c(1,
-1/2), tidy = TRUE) %>%
filter(!is.na(padj)) %>%
select(cluster_id=row)
```
```{r, eval=FALSE}
# This is to support later analysis that draws on the DE results
write_rds(res_ct,"cache/res_ct.rds")
write_rds(quantifiable_clusters,"cache/quantifiable_clusters.rds")
write_rds(vsd2,"cache/vsd2.rds")
write_rds(pca_data2,"cache/pca_data2.rds")
```
For the top genes differentially expressed between control and treatment scatterplots of the raw data provide a useful check that the statistical analysis identifies genuine differentially expressed transcripts.
```{r}
# For genes DE ONLY in contrast btw Control and treatment
top_genes <- res_ct %>% arrange(padj) %>% top_n(12) %>% pull(row)
ct_genes_tidy <- assay(vsd2)[which(rownames(vsd2) %in% res_ct$row),] %>%
as.data.frame() %>%
rownames_to_column("cluster_id")
ct_genes_tidy %>%
filter(cluster_id %in% top_genes) %>%
pivot_longer(-cluster_id,names_to = "ID",values_to="count") %>%
left_join(sample_data) %>%
ggplot(aes(x=SoftCoralControl, y=log(count), colour=HardCoral)) +
geom_point(aes(shape=treat)) +
facet_wrap(~cluster_id,ncol = 3,scales = "free_y")
```
Now a heatmap for all the DE genes between treatment and control. For this the relative change in expression is plotted (relative to the mean for a gene) so that clustering is meaningful. The clusters reveal some interesting patterns in terms of samples (matching the PCA) and in terms of genes (identifying alternative types of molecular response to competition).
```{r}
# PCA Plot
#
hm_col_groups <- list(blue = c("Pd-Lb","Pf-Ld","Pf-La"), grey = c("Pd-C","Pf-C"),white=c("Pf-Lc","Pf-Le","Pf-Lb"), red = c("Pd-Le","Pd-La","Pd-Lc"))
pca_data3 <- pca_data2 %>%
mutate(name = str_replace(group,":","-")) %>%
mutate(colgroup = "grey") %>%
mutate(colgroup = ifelse(name %in% hm_col_groups$red,"red",colgroup)) %>%
mutate(colgroup = ifelse(name %in% hm_col_groups$white,"white",colgroup)) %>%
mutate(colgroup = ifelse(name %in% hm_col_groups$blue,"blue",colgroup))
diverging_hcl(5, palette = "Blue-Red 2")
#"#4A6FE3" "#9DA8E2" "#E2E2E2" "#E495A5" "#D33F6A"
#"#4A6FE3" "#9DA8E2" "#E2E2E2" "#E495A5" "#D33F6A"
colgroup_colors <- c(blue="#4A6FE3",white="#9DA8E2", grey="#E2E2E2", red="#D33F6A")
pca_plot <- pca_data3 %>%
ggplot(aes(y=PC1,x=PC2)) +
# geom_point(aes(x=-1,y=PC2, color=colgroup),size=4)+
geom_label_repel(aes(label=name),point.padding = unit(0.5,"cm"), min.segment.length = 0.1, size=0.5) +
geom_point(aes(shape=HardCoral, fill=colgroup),size=4) +
ylab(paste0("PC1: ",percentVar2[1],"% variance")) +
xlab(paste0("PC2: ",percentVar2[2],"% variance")) +
scale_fill_manual(values=colgroup_colors) +
scale_shape_manual(values=c("Pd"=22,"Pf"=24))+
# scale_color_discrete_qualitative(palette = "Dark 3") +
theme(text = element_text(size=20)) +
theme_pubr() +
guides(fill=FALSE,col=FALSE) +
theme(legend.position = "right", legend.title = element_blank())
```
```{r, include=FALSE}
library(ampir)
rd <- readxl::read_excel("raw_data/annotated_DEG_hm_ira.xlsx",na = "NA") %>%
select(cluster_id,peptide) %>%
filter(peptide!=".") %>%
filter(!is.na(peptide)) %>%
mutate(peptide=str_replace(peptide,pattern="\\*",""))
rdp <- predict_amps(rd) %>% filter(prob_AMP>0.8)
```
```{r}
# Setup Data for the Heatmap itself and for the various annotations
#
ct_genes_tidy <- assay(vsd2)[res_ct$row,] %>%
as.data.frame() %>%
rownames_to_column("cluster_id")
ct_heatmap_data <- ct_genes_tidy %>%
as.data.frame() %>%
column_to_rownames("cluster_id")
ct_heatmap_row_counts <- counts(dds2, normalized=TRUE)[res_ct$row,] %>%
as.data.frame() %>% rowMeans()
mesenteries_data <- readxl::read_excel("raw_data/Mesentries Obsv_Bothcorals.xlsx") %>%
dplyr::rename(HARD=`Hard corasls`) %>%
filter(HARD %in% c("Pa","Pc")) %>%
mutate(HARD = ifelse(HARD=="Pa","Pd","Pf"))
# You can eliminate the grey bar on PCA for: PdLc , PfLe and PfLd.
mesenteries_groups <- mesenteries_data %>%
unite("pair",HARD,Softies,sep="-") %>%
filter(!(pair %in% c("Pd-Lc","Pf-Le","Pf-Ld"))) %>%
pull(pair) %>% unique()
ct_heatmap_data_relative <- sweep(ct_heatmap_data, MARGIN=1, STATS= rowMeans(ct_heatmap_data)) %>% as.matrix()
colnames(ct_heatmap_data_relative) <- colnames(ct_heatmap_data_relative) %>% str_replace("Pd","Pd-") %>% str_replace("Pf","Pf-")
row_anno_data <- readxl::read_excel("raw_data/annotated_DEG_hm_ira.xlsx",na = "NA") %>%
mutate(Secreted = ifelse(!is.na(secreted),"SE","FALSE")) %>%
select(cluster_id,Secreted,log2FoldChange,evidence_level,display_name) %>% as.data.frame()
#write_rds(row_anno_data,"cache/row_anno_data.rds")
col_anno_data <- data.frame(Mesenteries = colnames(ct_heatmap_data_relative) %in% mesenteries_groups)
row_anno_data_m <- row_anno_data[match(rownames(ct_heatmap_data_relative),row_anno_data$cluster_id),] %>%
select(-cluster_id,-log2FoldChange) %>%
select(Secreted)
```
```{r}
# Perform kmeans clustering and save the result in cache.
# There is some randomness in this so we save it for consistency
# This also allows transcripts within cluster to be extracted in later scripts
#
if ( file.exists("cache/row_km.rds") & file.exists("cache/col_km.rds")){
col_kmeans <- read_rds("cache/col_km.rds")
row_kmeans <- read_rds("cache/row_km.rds")
} else {
row_kmeans <- kmeans(ct_heatmap_data_relative,centers = 5,nstart = 100)
col_kmeans <- kmeans(t(ct_heatmap_data_relative), centers = 4, nstart = 100)
write_rds(row_kmeans,"cache/row_km.rds")
write_rds(col_kmeans,"cache/col_km.rds")
}
col_fun = colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))
anno_colors <- c(qualitative_hcl(7, palette = "Dark 3"),"white")
names(anno_colors) <- c("SE","I","SR","CC","A","T","G", "FALSE")
row_anno_r <- HeatmapAnnotation(abund = anno_barplot(log2(ct_heatmap_row_counts)), which = "row")
row_anno_l <- HeatmapAnnotation(info = as.matrix(row_anno_data_m),which = "row",
col = list(info=anno_colors), show_legend = FALSE)
col_anno <- HeatmapAnnotation(Mesenteries = col_anno_data$Mesenteries, col = list(Mesenteries = c("TRUE"="grey","FALSE"="white")),show_legend = FALSE, show_annotation_name = FALSE)
hm <- Heatmap(ct_heatmap_data_relative,
col = col_fun,
column_split = factor(col_kmeans$cluster,levels = c(3,1,4,2)), #
cluster_column_slices = FALSE,
row_split = row_kmeans$cluster,
show_row_dend = FALSE,
show_row_names = FALSE,
show_column_dend = FALSE,
column_title_gp = gpar(fill = colgroup_colors[c(1,3,4,2)]),
heatmap_legend_param = list(title="LogFC", legend_width = unit(10,"cm")),
# right_annotation = row_anno_l,
left_annotation = row_anno_l,
bottom_annotation = col_anno
)
hm_plot <- grid.grabExpr(draw(hm))
plot_grid(pca_plot + coord_cartesian(ylim = c(-28,25)),
hm_plot ,
ncol = 1,
rel_heights = c(0.4,0.6),
labels = c("A","B"),
label_x = 0, label_y = 0,
hjust = -0.5, vjust = -0.5)
ggsave2("figures/pca_hm.png", width = 18,height = 15, units = "cm", dpi = 300)
ggsave2("figures/pca_hm.pdf", width = 18,height = 15, units = "cm", dpi = 300)
```