Merge pull request #1803 from lldelisle/addmotifs

Better control the number of GRanges
stuart-lab · Jan 16, 2025 · 51ceab1 · 51ceab1
2 parents 4fe0f8b + bdfd632
commit 51ceab1
Show file tree

Hide file tree

Showing 14 changed files with 200,210 additions and 34 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: Signac
 Title: Analysis of Single-Cell Chromatin Data
-Version: 1.14.9001
-Date: 2024-10-21
+Version: 1.14.9002
+Date: 2025-01-16
 Authors@R: c(
   person(given = 'Tim', family = 'Stuart', email = 'stuartt@gis.a-star.edu.sg', role = c('aut', 'cre'), comment = c(ORCID = '0000-0002-3044-0897')),
   person(given = 'Avi', family = 'Srivastava', email = 'asrivastava@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0001-9798-2079')),

diff --git a/NAMESPACE b/NAMESPACE
@@ -312,6 +312,7 @@ importFrom(SeuratObject,"VariableFeatures<-")
 importFrom(SeuratObject,AddMetaData)
 importFrom(SeuratObject,Assays)
 importFrom(SeuratObject,Cells)
+importFrom(SeuratObject,CheckFeaturesNames)
 importFrom(SeuratObject,CreateAssayObject)
 importFrom(SeuratObject,CreateDimReducObject)
 importFrom(SeuratObject,CreateSeuratObject)

diff --git a/R/objects.R b/R/objects.R
@@ -225,12 +225,15 @@ CreateChromatinAssay <- function(
   if (!is.null(x = annotation) & !inherits(x = annotation, what = "GRanges")) {
     stop("Annotation must be a GRanges object.")
   }
-  if (!any(c("tx_id", "transcript_id") %in% colnames(x = mcols(x = annotation)))) {
-    stop("Annotation must have transcript id stored in `tx_id` or `transcript_id`.")
-  }
-  if (any(!c("gene_name", "gene_id", "gene_biotype", "type") %in% colnames(x = mcols(x = annotation)))) {
-    stop("Annotation must have `gene_name`, `gene_id`, `gene_biotype` and `type`.")
+  if (!is.null(x = annotation)) {
+    if (!any(c("tx_id", "transcript_id") %in% colnames(x = mcols(x = annotation)))) {
+      stop("Annotation must have transcript id stored in `tx_id` or `transcript_id`.")
+    }
+    if (any(!c("gene_name", "gene_id", "gene_biotype", "type") %in% colnames(x = mcols(x = annotation)))) {
+      stop("Annotation must have `gene_name`, `gene_id`, `gene_biotype` and `type`.")
+    }
   }
+
   # remove low-count cells
   ncount.cell <- colSums(x = data.use > 0)
   data.use <- data.use[, ncount.cell >= min.features]
@@ -808,7 +811,7 @@ RenameCells.Fragment <- function(object, new.names, ...) {
   return(object)
 }
 
-#' @importFrom SeuratObject SetAssayData
+#' @importFrom SeuratObject SetAssayData CheckFeaturesNames
 #' @importFrom GenomeInfoDb genome Seqinfo
 #' @importFrom lifecycle deprecated is_present
 #' @importFrom S4Vectors mcols
@@ -957,9 +960,19 @@ SetAssayData.ChromatinAssay <- function(
     if (!inherits(x = new.data, what = "Motif")) {
       stop("Must provide a Motif class object")
     }
+    # Set the feature names compatible with Seurat
+    new.data <- SetMotifData(
+      object = new.data,
+      slot = "data",
+      new.data = CheckFeaturesNames(
+        data = GetMotifData(object = new.data, slot = "data")
+        )
+    )
+
     # TODO allow mismatching row names, but check that the genomic ranges
     # are equivalent. Requires adding a granges slot to the motif class
-    if (!all(rownames(x = object) == rownames(x = new.data))) {
+    if (nrow(x = object) != nrow(x = new.data) ||
+        !all(rownames(x = object) == rownames(x = new.data))) {
       keep.features <- intersect(x = rownames(x = new.data),
                                  y = rownames(x = object))
       if (length(x = keep.features) == 0) {
@@ -968,8 +981,17 @@ SetAssayData.ChromatinAssay <- function(
       }
       else {
         warning("Features do not match in ChromatinAssay and Motif object.
-                Subsetting the Motif object.")
+                Subsetting/Filling the Motif object.")
         new.data <- new.data[keep.features, ]
+
+        new.data <- SetMotifData(
+          object = new.data,
+          slot = "data",
+          new.data = AddMissing(
+            GetMotifData(object = new.data, slot = "data"),
+            features = rownames(x = object)
+          )
+        )
       }
     }
     methods::slot(object = object, name = layer) <- new.data
@@ -1023,8 +1045,11 @@ SetMotifData.Motif <- function(object, slot, new.data, ...) {
 #' @export
 #' @concept motifs
 #' @examples
+#' new.data <- matrix(sample(c(0, 1), size = nrow(atac_small[["peaks"]]) * 10,
+#'                    replace = TRUE), nrow = nrow(atac_small[["peaks"]]))
+#' rownames(new.data) <- rownames(atac_small[["peaks"]])
 #' SetMotifData(
-#'   object = atac_small[['peaks']], slot = 'data', new.data = matrix(1:2)
+#'   object = atac_small[['peaks']], slot = 'data', new.data = new.data
 #' )
 #' @method SetMotifData ChromatinAssay
 SetMotifData.ChromatinAssay <- function(object, slot, new.data, ...) {

diff --git a/R/quantification.R b/R/quantification.R
@@ -191,9 +191,13 @@ GenomeBinMatrix <- function(
 #' the fragment object will still be searched.
 #' @param features A GRanges object containing a set of genomic intervals.
 #' These will form the rows of the matrix, with each entry recording the number
-#' of unique reads falling in the genomic region for each cell. If a genomic 
-#' region provided is on a chromosome that is not present in the fragment file,
-#' it will not be included in the returned matrix.
+#' of unique reads falling in the genomic region for each cell.
+#' @param keep_all_features By default, if a genomic region provided is on a
+#' chromosome that is not present in the fragment file,
+#' it will not be included in the returned matrix. Set `keep_all_features` to
+#' TRUE to force output to include all features in the input ranges. Note that
+#' features on chromosomes that are not present in the fragment file will be 
+#' filled with zero counts.
 #' @param cells Vector of cells to include. If NULL, include all cells found
 #' in the fragments file
 #' @param process_n Number of regions to load into memory at a time, per thread.
@@ -217,6 +221,7 @@ GenomeBinMatrix <- function(
 FeatureMatrix <- function(
   fragments,
   features,
+  keep_all_features = FALSE,
   cells = NULL,
   process_n = 2000,
   sep = c("-", "-"),
@@ -261,6 +266,7 @@ FeatureMatrix <- function(
       SingleFeatureMatrix(
         fragment = fragments[[x]],
         features = features,
+        keep_all_features = keep_all_features,
         cells = cells,
         sep = sep,
         verbose = verbose,
@@ -326,6 +332,7 @@ CombineTiles <- function(bins) {
 SingleFeatureMatrix <- function(
   fragment,
   features,
+  keep_all_features = FALSE,
   cells = NULL,
   process_n = 2000,
   sep = c("-", "-"),
@@ -359,6 +366,11 @@ SingleFeatureMatrix <- function(
     index = GetIndexFile(fragment = fragment.path, verbose = FALSE)
   )
   n_feat_start <- length(x = features)
+  if (keep_all_features) {
+    features_to_get <- GRangesToString(grange = features, sep = sep)
+  } else {
+    features_to_get <- NULL
+  }
   features <- keepSeqlevels(
     x = features,
     value = intersect(
@@ -371,7 +383,7 @@ SingleFeatureMatrix <- function(
     stop("No matching chromosomes found in fragment file.")
   }
   n_removed <- n_feat_start - length(x = features)
-  if (n_removed > 0) {
+  if (n_removed > 0 && ! keep_all_features) {
     if (n_removed == 1) {
       warning(n_removed, " feature is on a seqname not present in ",
               "the fragment file. This will be removed.")
@@ -418,7 +430,14 @@ SingleFeatureMatrix <- function(
       X = matrix.parts,
       FUN = AddMissing,
       cells = all.cells,
-      features = NULL
+      features = features_to_get
+    )
+  } else if (keep_all_features) {
+    matrix.parts <- lapply(
+      X = matrix.parts,
+      FUN = AddMissing,
+      cells = NULL,
+      features = features_to_get
     )
   }
   featmat <- do.call(what = rbind, args = matrix.parts)
@@ -429,7 +448,11 @@ SingleFeatureMatrix <- function(
     colnames(x = featmat) <- unname(obj = cell.convert[colnames(x = featmat)])
   }
   # reorder features
-  feat.str <- GRangesToString(grange = features, sep = sep)
+  if (keep_all_features) {
+    feat.str <- features_to_get
+  } else {
+    feat.str <- GRangesToString(grange = features, sep = sep)
+  }
   featmat <- featmat[feat.str, , drop=FALSE]
   return(featmat)
 }
diff --git a/R/utilities.R b/R/utilities.R
@@ -1220,20 +1220,22 @@ NonOverlapping <- function(x, all.features) {
 }
 
 #' @importFrom Matrix sparseMatrix
-AddMissing <- function(x, cells, features = NULL) {
+AddMissing <- function(x, cells = NULL, features = NULL) {
   # add columns with zeros for cells or features not in matrix
-  missing.cells <- setdiff(x = cells, y = colnames(x = x))
-  if (!(length(x = missing.cells) == 0)) {
-    null.mat <- sparseMatrix(
-      i = c(),
-      j = c(),
-      dims = c(nrow(x = x), length(x = missing.cells))
-    )
-    rownames(x = null.mat) <- rownames(x = x)
-    colnames(x = null.mat) <- missing.cells
-    x <- cbind(x, null.mat)
+  if (!is.null(cells)) {
+    missing.cells <- setdiff(x = cells, y = colnames(x = x))
+    if (!(length(x = missing.cells) == 0)) {
+      null.mat <- sparseMatrix(
+        i = c(),
+        j = c(),
+        dims = c(nrow(x = x), length(x = missing.cells))
+      )
+      rownames(x = null.mat) <- rownames(x = x)
+      colnames(x = null.mat) <- missing.cells
+      x <- cbind(x, null.mat)
+    }
+    x <- x[, cells, drop = FALSE]
   }
-  x <- x[, cells, drop = FALSE]
   if (!is.null(x = features)) {
     missing.features <- setdiff(x = features, y = rownames(x = x))
     if (!(length(x = missing.features) == 0)) {