02_deseq.Rmd

---
title: "DESeq analysis"
author: "Natalia Andrade and Ira Cooke"
date: "07/08/2017"
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.width=12, fig.height=8,
                      echo=FALSE, warning=FALSE, message=FALSE)
knitr::opts_chunk$set(cache=TRUE)
options(width = 60)

# Run if needed to install ComplexHeatmap
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("ComplexHeatmap")

library(knitr)
library(DESeq2)
library(tidyverse)
library(ggrepel)
library(ComplexHeatmap)
library(ggpubr)
library(circlize)
library(colorspace)
library(cowplot)
```

Counts obtained from Corset were analysed with DESeq to identify differentially expressed genes between HardCoral treatments. 

```{r, include=FALSE}
#First we prepare the data.  
#This involves reading in a counts matrix from corset.  
#Then we extract the sample names from the columns and match those to conditions in an excel file that describes the experimental conditions for each sample.
counts <- read_tsv("hpc/corset/03-counts.txt") %>% 
  column_to_rownames(var="X1") %>% 
  as.matrix()

sample_data <- read_csv("raw_data/Samples_data.csv",trim_ws = TRUE)
colnames(sample_data) <- c("ID","Tank","HardCoral",'treat',"SoftCoralControl","PdvsL_Other")

sample_data_row_order <- match(colnames(counts),sample_data$ID)
```


```{r}
clusters <- read_tsv("hpc/corset/03-clusters.txt",col_names = c("transcript_id","cluster_id"))
symb_barnacle_transcripts <- read_tsv("hpc/minimap/transcriptome2ref_nonplut.tsv", col_names = c("transcript_id","hit_id","percent_identity","num_matches","score"))
barnacle_transcripts <- read_tsv("hpc/minimap/transcriptome2ref_aa.tsv", col_names = c("transcript_id","hit_id","percent_identity","num_matches","score"))
symb_barnacle_clusters <- clusters %>% filter(transcript_id %in% symb_barnacle_transcripts$transcript_id)
barnacle_clusters <- clusters %>% filter(transcript_id %in% barnacle_transcripts$transcript_id)

counts_plut <- counts[which(!(rownames(counts) %in% symb_barnacle_clusters$cluster_id)),]
counts_barnacle <- counts[which((rownames(counts) %in% barnacle_clusters$cluster_id)),]
```

Next we identify cluster ids that correspond to Symbiont or Barnacle transcripts.  A total of `r barnacle_transcripts %>% nrow()` barnacle transcripts from `r nrow(barnacle_clusters)` were excluded as a result.

Initial data exploration with PCA revealed that PdLd is an extreme outlier.  We therefore excluded this sample from further analysis. We also can see from the Barnacle counts that these were heavily concentrated in three samples suggesting that only these three samples were infected with coral-inhabiting barnacles.  It is also clear from this figure that a small fraction of transcripts identified as potentially of barnacle origin were found across all samples.  Although these transcripts are are most likely not of barnacle origin they were excluded from analysis because their status (coral or barnacle) was ambiguous. 

```{r}
col_fun_barn = colorRamp2(c(0, 9), c("white", "red"))

hm <- Heatmap(log2(counts_barnacle+1), 
        col = col_fun_barn, 
        show_row_names = FALSE, 
        heatmap_legend_param = list(title="Log Count", legend_width = unit(10,"cm")))
hm_plot <- grid.grabExpr(draw(hm))

plot_grid(hm_plot)
ggsave2("figures/barnacle_hm.png", width = 18,height = 10, units = "cm", dpi = 300)
ggsave2("figures/barnacle_hm.pdf", width = 18,height = 10, units = "cm", dpi = 300)
```


```{r, include=FALSE, eval=FALSE}
dds <- DESeqDataSetFromMatrix(countData = counts_plut,colData=sample_data[sample_data_row_order,],design = ~  treat+ HardCoral)

vsd <- varianceStabilizingTransformation(dds, blind=FALSE)

pca_data <- plotPCA(vsd,intgroup=c("HardCoral","SoftCoralControl"),returnData=TRUE)
percentVar <- round(100 * attr(pca_data, "percentVar"))

pca_data %>% 
  ggplot(aes(x=PC1,y=PC2)) + 
  geom_point(aes(color=SoftCoralControl,shape=HardCoral),size=4) + 
  geom_label_repel(aes(label=group)) + 
  xlab(paste0("PC1: ",percentVar[1],"% variance")) + 
  ylab(paste0("PC2: ",percentVar[2],"% variance"))  +
  theme(text = element_text(size=20))
```

Differential Expression Analysis with the outlier excluded was then performed based on a model with no intercept and a separate coefficient for each level of the variable `HardCoralTrt`.  This model gives us maximum flexibility for statistical testing using different contrasts.

> ~ 0+HardCoralTrt

One issue with the above analysis is that it estimates a separate value for each of the control samples, meaning that each such estimate will be done without replicates. Perhaps a better model to capture this experiment would be to estimate a treatment effect, a hard coral effect, and their interaction


```{r}
#counts2 <- data.frame(counts_plut) %>% select(-PdLd,-PfLc,-PdLe,-PdLa) %>% as.matrix()
counts2 <- data.frame(counts_plut) %>% select(-PdLd) %>% as.matrix()

sample_data2 <- sample_data[match(colnames(counts2),sample_data$ID),] %>%
  unite("HardCoralTrt",HardCoral,treat,remove = FALSE) %>% 
  mutate(HardCoralTrt = ifelse(ID %in% c("PfC","PdC"),"C",HardCoralTrt))


dds2_raw <- DESeqDataSetFromMatrix(countData = counts2,colData=sample_data2,design = ~  0+HardCoralTrt)
# Exclude samples where one of the controls (but not both) have zero counts
# This is because;
# A. Cook's distance filtering doesn't work for factors with 2 replicates
# B. If only one of the controls is 0 then it will be an outlier. But if they both are we will assume this is consistent low counts
#
keep <- rowSums(counts(dds2_raw)[,c("PfC","PdC")]<5) != 1

dds2 <- dds2_raw[keep,]


if (file.exists("cache/dds2.rds")) {
    dds2 <- read_rds("cache/dds2.rds")
} else {
  dds2 <- DESeq(dds2, parallel = TRUE)
  write_rds(dds2,"cache/dds2.rds")
}

resultsNames(dds2)
```

Before proceeding with contrasts we first check a PCA with the outlier excluded. This shows very clearly that Porites genotype is the dominant source of variation in the data, accounting for 89%. PC2 accounts for just 5% of variation and captures differences due to the competing Lobophytum colony and whether competition exists at all (Control).

```{r}
vsd2 <- varianceStabilizingTransformation(dds2, blind=FALSE)
pca_data2 <- plotPCA(vsd2,intgroup=c("HardCoral","SoftCoralControl"),returnData=TRUE)
percentVar2 <- round(100 * attr(pca_data2, "percentVar"))

pca_data2 %>% 
  ggplot(aes(x=PC1,y=PC2)) + 
  geom_point(aes(color=SoftCoralControl,shape=HardCoral),size=4) + 
  geom_label_repel(aes(label=group))  + 
  xlab(paste0("PC1: ",percentVar2[1],"% variance")) + 
  ylab(paste0("PC2: ",percentVar2[2],"% variance"))  +  
  theme(text = element_text(size=20)) + 
  theme_pubclean() +
  theme(legend.position = "right")
ggsave("figures/pca.png", width = 12, height = 8)
```

To set up contrasts we first use `resultsNames()` to extract a list of fitted coefficients in the model

```{r}
resultsNames(dds2)
```

Based on this the following contrast should highlight genes consistently different between control and treatment

`contrast_ct <- c(1,-0.5,-0.5)` which captures genes consistently different between treatment and control across both Porites colonies

```{r}
res_ct <- results(dds2,contrast = list(c("HardCoralTrtC"),c("HardCoralTrtPd_T","HardCoralTrtPf_T")),listValues=c(1,-1/2), tidy = TRUE) %>% 
  arrange(padj) %>% 
  filter(padj<0.1)

quantifiable_clusters <- results(dds2,contrast = list(c("HardCoralTrtC"),c("HardCoralTrtPd_T","HardCoralTrtPf_T")),listValues=c(1,
 -1/2), tidy = TRUE) %>% 
  filter(!is.na(padj)) %>% 
  select(cluster_id=row)

```


```{r, eval=FALSE}
# This is to support later analysis that draws on the DE results
write_rds(res_ct,"cache/res_ct.rds")
write_rds(quantifiable_clusters,"cache/quantifiable_clusters.rds")
write_rds(vsd2,"cache/vsd2.rds")
write_rds(pca_data2,"cache/pca_data2.rds")
```


For the top genes differentially expressed between control and treatment scatterplots of the raw data provide a useful check that the statistical analysis identifies genuine differentially expressed transcripts.  

```{r}
# For genes DE ONLY in  contrast btw Control and treatment

top_genes <- res_ct %>% arrange(padj) %>% top_n(12) %>% pull(row)

ct_genes_tidy <- assay(vsd2)[which(rownames(vsd2) %in% res_ct$row),] %>%
  as.data.frame() %>%
  rownames_to_column("cluster_id")

ct_genes_tidy %>%
  filter(cluster_id %in% top_genes) %>% 
  pivot_longer(-cluster_id,names_to = "ID",values_to="count") %>%
  left_join(sample_data) %>%
  ggplot(aes(x=SoftCoralControl, y=log(count), colour=HardCoral)) +
  geom_point(aes(shape=treat)) +
  facet_wrap(~cluster_id,ncol = 3,scales = "free_y")
```

Now a heatmap for all the DE genes between treatment and control. For this the relative change in expression is plotted (relative to the mean for a gene) so that clustering is meaningful. The clusters reveal some interesting patterns in terms of samples (matching the PCA) and in terms of genes (identifying alternative types of molecular response to competition).  


```{r}
# PCA Plot
#
hm_col_groups <- list(blue = c("Pd-Lb","Pf-Ld","Pf-La"), grey = c("Pd-C","Pf-C"),white=c("Pf-Lc","Pf-Le","Pf-Lb"), red = c("Pd-Le","Pd-La","Pd-Lc"))

pca_data3 <- pca_data2 %>% 
  mutate(name = str_replace(group,":","-")) %>% 
  mutate(colgroup = "grey") %>% 
  mutate(colgroup = ifelse(name %in% hm_col_groups$red,"red",colgroup)) %>% 
  mutate(colgroup = ifelse(name %in% hm_col_groups$white,"white",colgroup)) %>% 
  mutate(colgroup = ifelse(name %in% hm_col_groups$blue,"blue",colgroup)) 

diverging_hcl(5, palette = "Blue-Red 2")
#"#4A6FE3" "#9DA8E2" "#E2E2E2" "#E495A5" "#D33F6A"
#"#4A6FE3" "#9DA8E2" "#E2E2E2" "#E495A5" "#D33F6A"
colgroup_colors <- c(blue="#4A6FE3",white="#9DA8E2", grey="#E2E2E2", red="#D33F6A")

pca_plot <- pca_data3 %>% 
  ggplot(aes(y=PC1,x=PC2)) + 
#  geom_point(aes(x=-1,y=PC2, color=colgroup),size=4)+
  geom_label_repel(aes(label=name),point.padding = unit(0.5,"cm"), min.segment.length = 0.1, size=0.5)  + 
  geom_point(aes(shape=HardCoral, fill=colgroup),size=4) + 
  ylab(paste0("PC1: ",percentVar2[1],"% variance")) + 
  xlab(paste0("PC2: ",percentVar2[2],"% variance"))  +  
  scale_fill_manual(values=colgroup_colors) +
  scale_shape_manual(values=c("Pd"=22,"Pf"=24))+
#  scale_color_discrete_qualitative(palette = "Dark 3") +
  theme(text = element_text(size=20)) + 
  theme_pubr() +
  guides(fill=FALSE,col=FALSE) +
  theme(legend.position = "right", legend.title = element_blank())
```

```{r, include=FALSE}
library(ampir)

rd <- readxl::read_excel("raw_data/annotated_DEG_hm_ira.xlsx",na = "NA") %>% 
  select(cluster_id,peptide) %>% 
  filter(peptide!=".") %>% 
  filter(!is.na(peptide)) %>% 
  mutate(peptide=str_replace(peptide,pattern="\\*",""))

rdp <- predict_amps(rd) %>% filter(prob_AMP>0.8)
```


```{r}
# Setup Data for the Heatmap itself and for the various annotations
#
ct_genes_tidy <- assay(vsd2)[res_ct$row,] %>%
  as.data.frame() %>%
  rownames_to_column("cluster_id")

ct_heatmap_data <- ct_genes_tidy %>%
  as.data.frame() %>%
  column_to_rownames("cluster_id")


ct_heatmap_row_counts <- counts(dds2, normalized=TRUE)[res_ct$row,] %>%
  as.data.frame() %>% rowMeans()

mesenteries_data <- readxl::read_excel("raw_data/Mesentries Obsv_Bothcorals.xlsx") %>%
  dplyr::rename(HARD=`Hard corasls`) %>% 
  filter(HARD %in% c("Pa","Pc")) %>% 
  mutate(HARD = ifelse(HARD=="Pa","Pd","Pf"))


# You can eliminate the grey bar on PCA for:  PdLc , PfLe and PfLd.
mesenteries_groups <- mesenteries_data %>% 
  unite("pair",HARD,Softies,sep="-") %>%
  filter(!(pair %in% c("Pd-Lc","Pf-Le","Pf-Ld"))) %>% 
  pull(pair) %>% unique()


ct_heatmap_data_relative <- sweep(ct_heatmap_data, MARGIN=1, STATS= rowMeans(ct_heatmap_data)) %>% as.matrix()
colnames(ct_heatmap_data_relative) <- colnames(ct_heatmap_data_relative) %>% str_replace("Pd","Pd-") %>% str_replace("Pf","Pf-")

row_anno_data <- readxl::read_excel("raw_data/annotated_DEG_hm_ira.xlsx",na = "NA") %>% 
  mutate(Secreted = ifelse(!is.na(secreted),"SE","FALSE")) %>% 
  select(cluster_id,Secreted,log2FoldChange,evidence_level,display_name) %>% as.data.frame()

#write_rds(row_anno_data,"cache/row_anno_data.rds")

col_anno_data <- data.frame(Mesenteries = colnames(ct_heatmap_data_relative) %in% mesenteries_groups)


row_anno_data_m <- row_anno_data[match(rownames(ct_heatmap_data_relative),row_anno_data$cluster_id),] %>% 
  select(-cluster_id,-log2FoldChange) %>% 
  select(Secreted)
```


```{r}
# Perform kmeans clustering and save the result in cache.
# There is some randomness in this so we save it for consistency
# This also allows transcripts within cluster to be extracted in later scripts
#
if ( file.exists("cache/row_km.rds") & file.exists("cache/col_km.rds")){
 col_kmeans <- read_rds("cache/col_km.rds")
 row_kmeans <- read_rds("cache/row_km.rds")
} else {

  row_kmeans <- kmeans(ct_heatmap_data_relative,centers = 5,nstart = 100)
  col_kmeans <- kmeans(t(ct_heatmap_data_relative), centers = 4, nstart = 100)

  write_rds(row_kmeans,"cache/row_km.rds")
  write_rds(col_kmeans,"cache/col_km.rds")
}

col_fun = colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))

anno_colors <- c(qualitative_hcl(7, palette = "Dark 3"),"white")
names(anno_colors) <- c("SE","I","SR","CC","A","T","G", "FALSE")

row_anno_r <- HeatmapAnnotation(abund = anno_barplot(log2(ct_heatmap_row_counts)), which = "row")
row_anno_l <- HeatmapAnnotation(info = as.matrix(row_anno_data_m),which = "row", 
                              col = list(info=anno_colors), show_legend = FALSE)

col_anno <- HeatmapAnnotation(Mesenteries = col_anno_data$Mesenteries, col = list(Mesenteries = c("TRUE"="grey","FALSE"="white")),show_legend = FALSE, show_annotation_name = FALSE)

hm <- Heatmap(ct_heatmap_data_relative, 
                                      col = col_fun,
                                      column_split = factor(col_kmeans$cluster,levels = c(3,1,4,2)),  #
                                      cluster_column_slices = FALSE,
                                      row_split = row_kmeans$cluster,
                                      show_row_dend = FALSE,
                                      show_row_names = FALSE,
                                      show_column_dend = FALSE,
                                      column_title_gp = gpar(fill = colgroup_colors[c(1,3,4,2)]),
                                      heatmap_legend_param = list(title="LogFC", legend_width = unit(10,"cm")),
#                                      right_annotation = row_anno_l, 
                                      left_annotation = row_anno_l,
                                      bottom_annotation = col_anno
  )

hm_plot <- grid.grabExpr(draw(hm))

plot_grid(pca_plot + coord_cartesian(ylim = c(-28,25)),
          hm_plot ,
          ncol = 1, 
          rel_heights = c(0.4,0.6), 
          labels = c("A","B"),
          label_x = 0, label_y = 0,
          hjust = -0.5, vjust = -0.5)

ggsave2("figures/pca_hm.png", width = 18,height = 15, units = "cm", dpi = 300)
ggsave2("figures/pca_hm.pdf", width = 18,height = 15, units = "cm", dpi = 300)
```