Merge pull request #65 from MacoskoLab/plot-impute

Add plotGeneLoadings function and documentation
welch-lab · Jun 4, 2019 · 2f4a927 · 2f4a927
2 parents 2e98b75 + 0e52860
commit 2f4a927
Show file tree

Hide file tree

Showing 6 changed files with 316 additions and 9 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -27,10 +27,13 @@ Imports: Rcpp (>= 0.12.10),
 	 foreach,
 	 snow,
 	 doSNOW,
-	 mclust
+	 mclust,
+	 patchwork
+Remotes: thomasp85/patchwork
 LinkingTo: Rcpp, RcppArmadillo
 Depends: cowplot,
-         Matrix	 
+         Matrix,
+	 patchwork
 RoxygenNote: 6.0.1
 Suggests: Seurat,
 	  knitr,

diff --git a/R/liger.R b/R/liger.R
@@ -2458,6 +2458,193 @@ plotWordClouds <- function(object, dataset1 = NULL, dataset2 = NULL, num.genes =
   }
 }
 
+#' Generate t-SNE plots and gene loading plots
+#' 
+#' @description 
+#' Plots t-SNE coordinates of all cells by their loadings on each factor. Underneath it displays the
+#' most highly loading shared and dataset-specific genes, along with the overall gene loadings 
+#' for each dataset. 
+#' 
+#' It is recommended to call this function into a PDF due to the large number of 
+#' plots produced.
+#'
+#' @param object \code{liger} object. Should call runTSNE before calling.
+#' @param dataset1 Name of first dataset (by default takes first two datasets for dataset1 and 2)
+#' @param dataset2 Name of second dataset
+#' @param num.genes Number of genes to show in word clouds (default 30).
+#' @param mark.top.genes Plot points corresponding to top loading genes in different color (default
+#'   TRUE).
+#' @param factor.share.thresh Use only factors with a dataset specificity less than or equal to 
+#'   threshold (default 10).
+#' @param log.fc.thresh Lower log-fold change threshold for differential expression in markers
+#'   (default 1).
+#' @param umi.thresh Lower UMI threshold for markers (default 30).
+#' @param frac.thresh Lower threshold for fraction of cells expressing marker (default 0).
+#' @param pval.thresh Upper p-value threshold for Wilcoxon rank test for gene expression
+#'   (default 0.05).
+#' @param do.spec.plot Include dataset specificity plot in printout (default TRUE).
+#' @param max.val Value between 0 and 1 at which color gradient should saturate to max color. Set to
+#'   NULL to revert to default gradient scaling. (default 0.1)
+#' @inheritParams plotGene
+#' @param return.plots Return ggplot objects instead of printing directly (default FALSE).
+#'   
+#' @importFrom grid gpar
+#' @export
+#' @examples
+#' \dontrun{
+#' # liger object, factorization complete 
+#' ligerex
+#' ligerex <- quantileAlignSNF(ligerex)
+#' ligerex <- runTSNE(ligerex)
+#' pdf('gene_loadings.pdf')
+#' plotGeneLoadings(ligerex, num.genes = 20)
+#' dev.off()
+#' }
+
+plotGeneLoadings <- function(object, dataset1 = NULL, dataset2 = NULL, num.genes.show = 12,
+                             num.genes = 30, mark.top.genes = T, factor.share.thresh = 10,
+                             log.fc.thresh = 1, umi.thresh = 30, frac.thresh = 0,
+                             pval.thresh = 0.05, do.spec.plot = T, max.val = 0.1, pt.size = 0.1,
+                             option = "plasma", zero.color = "#F5F5F5", return.plots = F) {
+  if (is.null(dataset1) | is.null(dataset2)) {
+    dataset1 <- names(object@H)[1]
+    dataset2 <- names(object@H)[2]
+  }
+
+  H_aligned <- object@H.norm
+  W_orig <- t(object@W)
+  V1 <- t(object@V[[dataset1]])
+  V2 <- t(object@V[[dataset2]])
+  W <- pmin(W_orig + V1, W_orig + V2)
+
+  dataset.specificity <- calcDatasetSpecificity(object,
+                                                dataset1 = dataset1,
+                                                dataset2 = dataset2, do.plot = do.spec.plot
+  )
+
+  factors.use <- which(abs(dataset.specificity[[3]]) <= factor.share.thresh)
+
+
+  markers <- getFactorMarkers(object,
+                              dataset1 = dataset1, dataset2 = dataset2,
+                              factor.share.thresh = factor.share.thresh,
+                              num.genes = num.genes, log.fc.thresh = log.fc.thresh,
+                              umi.thresh = umi.thresh, frac.thresh = frac.thresh,
+                              pval.thresh = pval.thresh,
+                              dataset.specificity = dataset.specificity
+  )
+
+  rownames(W) <- rownames(V1) <- rownames(V2) <- rownames(W_orig) <- object@var.genes
+  loadings_list <- list(V1, W, V2)
+  names_list <- list(dataset1, "Shared", dataset2)
+  tsne_coords <- object@tsne.coords
+  pb <- txtProgressBar(min = 0, max = length(factors.use), style = 3)
+  return_plots <- list()
+  for (i in factors.use) {
+    tsne_df <- data.frame(H_aligned[, i], tsne_coords)
+    factorlab <- paste("Factor", i, sep = "")
+    colnames(tsne_df) <- c(factorlab, "tSNE1", "tSNE2")
+    tsne_df[[factorlab]][tsne_df[[factorlab]] == 0] <- NA
+    factor_ds <- paste("Factor", i, "Dataset Specificity:", dataset.specificity[[3]][i])
+    data.max <- max(object@H.norm[, i])
+    # plot TSNE
+    if (!is.null(max.val)) {
+      values <- c(0, max.val, 1)
+    } else {
+      values <- NULL
+    }
+    p1 <- ggplot(tsne_df, aes_string(x = "tSNE1", y = "tSNE2", color = factorlab)) +
+      geom_point(size = pt.size) +
+      scale_color_viridis_c(
+        option = option,
+        direction = -1,
+        na.value = zero.color, values = values
+      ) +
+      ggtitle(label = factor_ds)
+
+    # subset to specific factor and sort by p-value
+    top_genes_V1 <- markers[[1]][markers[[1]]$factor_num == i, ]
+    top_genes_V1 <- top_genes_V1[order(top_genes_V1$p_value), ]$gene
+    # don't sort for W
+    top_genes_W <- markers[[2]][markers[[2]]$factor_num == i, ]$gene
+    top_genes_V2 <- markers[[3]][markers[[3]]$factor_num == i, ]
+    top_genes_V2 <- top_genes_V2[order(top_genes_V2$p_value), ]$gene
+
+    top_genes_list <- list(top_genes_V1, top_genes_W, top_genes_V2)
+    # subset down to those which will be shown if sorting by p-val
+
+    top_genes_list <- lapply(top_genes_list, function(x) {
+      if (length(x) > num.genes.show) {
+        # to avoid subset warning
+        x <- x[1:num.genes.show]
+      }
+      x
+    })
+
+    plot_list <- lapply(seq_along(top_genes_list), function(x) {
+      top_genes <- top_genes_list[[x]]
+      # make dataframe for cum gene loadings plot
+      sorted <- sort(loadings_list[[x]][, i])
+      # sort by loadings instead - still only showing num.genes.show
+      # look through top num.genes in loadings
+      top_loaded <- names(rev(sorted[(length(sorted) - num.genes + 1):length(sorted)]))
+      top_genes <- top_loaded[which(top_loaded %in% top_genes)]
+      if (length(top_genes) == 0) {
+        top_genes <- c("no genes")
+      }
+
+      gene_df <- data.frame(
+        loadings = sorted,
+        xpos = seq(0, 1, length.out = length(sorted)),
+        top_k = names(sorted) %in% top_genes
+      )
+      y_lim_text <- max(gene_df$loadings)
+      # plot and annotate with top genes
+      out_plot <- ggplot(gene_df, aes(x = xpos, y = loadings)) + geom_point(size = 0.4) +
+        theme_bw() +
+        theme(
+          axis.ticks.x = element_blank(),
+          axis.line.x = element_blank(),
+          axis.title = element_blank(),
+          axis.text.x = element_blank(),
+          panel.grid.major.x = element_blank(),
+          panel.grid.minor.x = element_blank()
+        ) + ggtitle(label = names_list[[x]]) +
+        annotate("text",
+                 x = 1.1,
+                 y = seq(y_lim_text, 0, length.out = num.genes.show)[1:length(top_genes)],
+                 label = top_genes, hjust = 0, col = "#8227A0"
+        ) +
+        coord_cartesian(
+          xlim = c(0, 1), # This focuses the x-axis on the range of interest
+          clip = "off"
+        ) +
+        theme(plot.margin = unit(c(1, 4, 1, 1), "lines"))
+      if (mark.top.genes) {
+        out_plot <- out_plot + geom_point(
+          data = subset(gene_df, top_k == TRUE),
+          aes(xpos, loadings),
+          col = "#8227A0", size = 0.5
+        )
+      }
+      return(out_plot)
+    })
+
+    # p2 <- plot_grid(plotlist = plot_list, nrow = 1)
+
+    return_plots[[i]] <- p1 / (plot_list[[1]] | plot_list[[2]] | plot_list[[3]])
+    # if can figure out how to make cowplot work, might bring this back
+    # return_plots[[i]] <- plot_grid(p1, p2, nrow = 2, align = "h")
+    if (!return.plots) {
+      print(return_plots[[i]])
+    }
+    setTxtProgressBar(pb, i)
+  }
+  if (return.plots) {
+    return(return_plots)
+  }
+}
+
 #' Plot violin plots for gene expression
 #' 
 #' Generates violin plots of expression of specified gene for each dataset.

diff --git a/man/plotGeneLoadings.Rd b/man/plotGeneLoadings.Rd
diff --git a/...s/testthat/test_factorization_alignment.R → tests/testthat/test_post_factorization.R b/...s/testthat/test_factorization_alignment.R → tests/testthat/test_post_factorization.R
@@ -93,6 +93,8 @@ test_that("New alignment and clustering are correct", {
 # Tests for dimensional reduction
 # These are included here because these functions are object dependent,
 # to avoid recalculating factorization for fresh object as it's time-consuming
+# TODO: Add smaller example object (with H, H.norm included) that could be loaded
+# to make tests more modular 
 ####################################################################################
 context("Dimensional reduction")
 
@@ -103,3 +105,16 @@ test_that("Dimensions are correct", {
   expect_equal(dim(ligex@tsne.coords), c(494, 2))
   expect_equal(dim(ligex_rawtsne@tsne.coords), c(494, 2))
 })
+
+
+# Tests for plotting functions
+# Again, included here because these functions are object dependent (see above)
+####################################################################################
+context("Plotting")
+
+geneloadings_plots <- plotGeneLoadings(ligex, return.plots = T)
+
+test_that("Returns correct number of assembled ggplot objects", {
+  expect_equal(length(geneloadings_plots), ncol(ligex@H[[1]]))
+  expect_is(geneloadings_plots[[1]], class = c("ggassemble", "gg", "ggplot"))
+})
diff --git a/vignettes/liger-vignette.Rmd b/vignettes/liger-vignette.Rmd
@@ -64,8 +64,7 @@ ligerex = quantileAlignSNF(ligerex) #SNF clustering and quantile alignment
 ## Visualizing the results
 We can visualize the results by using dimensionality reduction techniques like t-SNE or UMAP (recommended 
 for larger datasets). Visualizations can be colored by dataset of origin or cluster assignment. 
-`plotWordClouds` is a useful way to visualize the most highly loading genes (both shared and dataset
-specific) for each factor, in conjunction with the factor loadings across cells.  
+`plotWordClouds` and `plotGeneLoadings` are useful ways to visualize the most highly loading genes (both shared and dataset specific) for each factor, in conjunction with the factor loadings across cells.  
 ```r
 ligerex = runTSNE(ligerex)
 # for larger datasets, may want to use UMAP instead
@@ -76,6 +75,10 @@ pdf("word_clouds.pdf")
 plotWordClouds(ligerex)
 dev.off()
 
+pdf("gene_loadings.pdf")
+plotGeneLoadings(ligerex)
+dev.off()
+
 ```
 
 ## Exploring factors and clusters