Commit

add h5ad
nikellepetrillo committed Aug 23, 2024
1 parent 5d8a84c commit 6b938c6
Showing 1 changed file with 257 additions and 0 deletions.
tools/scripts/create_h5ad_snss2.py
@@ -0,0 +1,257 @@
import argparse
import csv
import os
import numpy as np
import scipy as sc
import scipy.sparse  # make sure the sparse submodule is available as sc.sparse
import anndata as ad
import pandas as pd

def generate_col_attr(args):
"""Converts the QC of Smart Seq2 gene file pipeline outputs to loom file
Args:
qc_path (str): path to the QCs csv
Retruns:
col_attrs: dict
"""
# read the QC values
qc_path = [p for p in args.qc_files if p.endswith(".csv")][0]
with open(qc_path, 'r') as f:
qc_values = [row for row in csv.reader(f)]
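    # qc_values[0] holds the metric labels (first column is skipped); rows from index 2
    # onward hold one row of values per grouped QC file, joined per metric below
    # (empty entries are dropped)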
n_metrics_files = len(qc_values)-2
metadata_labels = qc_values[0][1:]
metadata_values = []
for index_val in range(len(metadata_labels)):
arr_metric_val = []
for index_file in range(n_metrics_files):
arr_metric_val.append(qc_values[index_file+2][index_val+1])
metadata_values.append(','.join(s for s in arr_metric_val if s))

cell_id = args.input_id

string_metadata = {}
numeric_metadata = {}

for label, value in zip(metadata_labels, metadata_values):

# See if this is numeric or string
numeric_value = None
try:
numeric_value = float(value)
except ValueError:
try:
numeric_value = float(value.strip("%"))/100
except ValueError:
pass

if numeric_value is not None:
numeric_metadata[label] = numeric_value
else:
string_metadata[label] = value

# Metrics
# Write the string and numeric metadata separately
sorted_string_labels = sorted(string_metadata.keys())
sorted_string_values = [string_metadata[m] for m in sorted_string_labels]
sorted_numeric_labels = sorted(numeric_metadata.keys())
sorted_numeric_values = [numeric_metadata[m] for m in sorted_numeric_labels]

# Column attributes
col_attrs = dict()
col_attrs["cell_names"] = [cell_id]
col_attrs["CellID"] = [cell_id]
col_attrs['input_id'] = [args.input_id]

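    # each QC metric becomes its own single-element column attribute (one value for this cell)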
numeric_field_names = np.array(sorted_numeric_labels[:])

for i in range(0, numeric_field_names.shape[0]):
name = numeric_field_names[i]
data = np.array([sorted_numeric_values])[:,i]
col_attrs[name] = data
string_field_names = np.array(sorted_string_labels)
for i in range(0, string_field_names.shape[0]):
name = string_field_names[i]
data = np.array([sorted_string_values])[:,i]
col_attrs[name] = data

return col_attrs


def generate_csr_sparse_coo(expr):
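    """Converts a dense expression matrix to a transposed sparse COO matrix
    Args:
        expr: 2D array-like of counts (cells x genes)
    Returns:
        expr_sp_t: scipy.sparse.coo_matrix of shape (genes, cells)
    """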

nrows, ncols = np.shape(expr)
expr_coo = sc.sparse.coo_matrix(expr[:])
xcoord = []
ycoord = []
value = []

for k in range(0, expr_coo.data.shape[0]):
xcoord.append(expr_coo.row[k])
ycoord.append(expr_coo.col[k])
value.append(expr_coo.data[k])

xcoord = np.asarray(xcoord)
ycoord = np.asarray(ycoord)
value = np.asarray(value)

expr_sp_t = sc.sparse.coo_matrix((value, (ycoord, xcoord)), shape=(ncols, nrows))
return expr_sp_t


def generate_row_attr_and_matrix(count_results_path):
"""Converts the Smart Seq2 intron and exon counts file to
csr_coos and row attribute dictionary
Args:
input_path (str): file where the SS2 pipeline expression counts are
Return:
row_attrs: dict of attributes
intron_expression_csr_coo: crs coo matrix for introns
exon_expression_csr_coo: crs coo matrix for exons
"""
reader = csv.DictReader(open(count_results_path), delimiter="\t")

intron_expression_values = {}
exon_expression_values = {}
intron_lengths = {}
exon_lengths = {}

gene_ids = []
gene_names = []
count_values = {}
for row in reader:
intron_expression_values[row["gene_id"]] = int(row["introns"])
exon_expression_values[row["gene_id"]] = int(row["exons"])
intron_lengths[row["gene_id"]] = int(row["intron_length"])
exon_lengths[row["gene_id"]] = int(row["exon_length"])
gene_ids.append(row['gene_id'])
gene_names.append(row['gene_name'])

intron_counts = [intron_expression_values[g] for g in gene_ids]
exon_counts = [exon_expression_values[g] for g in gene_ids]
intron_lengths = [intron_lengths[g] for g in gene_ids]
exon_lengths = [exon_lengths[g] for g in gene_ids]

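    # build 1 x n_genes matrices; generate_csr_sparse_coo returns them transposed (n_genes x 1)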
intron_expression_csr_coo = generate_csr_sparse_coo([intron_counts])
exon_expression_csr_coo = generate_csr_sparse_coo([exon_counts])

row_attrs = { "ensembl_ids" : np.array(gene_ids),
"gene_names" : np.array(gene_names),
"Gene" : np.array(gene_ids),
"intron_lengths": np.array(intron_lengths),
"exon_lengths" : np.array(exon_lengths)
}

return row_attrs, intron_expression_csr_coo, exon_expression_csr_coo


def create_h5ad_files(args):
"""This function creates the loom file or folder structure in output_loom_path in
format file_format, with input_id from the input folder analysis_output_path
Args:
input_id (str): sample or cell id
qc_analysis_output_files_string (str): a string with the file names in the QCGroup of SS2
pipeline output, separated by commas
rsem_genes_results_file (str): the file for the expression count
output_loom_path (str): location of the output loom
"""
# generate a dictionary of column attributes
col_attrs = generate_col_attr(args)

# add the expression count matrix data
# generate a dictionary of row attributes
row_attrs, intron_expr_csr_coo, exon_expr_csr_coo = generate_row_attr_and_matrix(args.count_results_file)

# Convert the matrices to CSR format
exon_counts = exon_expr_csr_coo.transpose().tocsr()
intron_counts = intron_expr_csr_coo.transpose().tocsr()

gene_attrs = row_attrs
cell_attrs = col_attrs
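    # the exon counts become the primary AnnData matrix X (cells x genes)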
adata = ad.AnnData(exon_counts)
print(adata.X)


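    # file-level attributes, stored in adata.uns below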
attrDict = dict()
attrDict['pipeline_version'] = args.pipeline_version

# importing the cell and gene metrics as dataframes
dc = pd.DataFrame(cell_attrs)
dg = pd.DataFrame(gene_attrs)

# assign the cell and gene attrs to the obs and var field respectively
adata.obs = dc
adata.var = dg

# assign global attrs to unstructured data
adata.uns = attrDict
adata.obs_names = [x for x in adata.obs['CellID']]

# set variable names
adata.var_names = [x for x in adata.var['Gene']]

# set the layers to the intron counts
adata.layers['intron_counts'] = intron_counts

# write the h5ad file
adata.write_h5ad(args.output_h5ad_path + ".h5ad")


#generate loom file
#loompy.create(args.output_loom_path, exon_expr_csr_coo, row_attrs, col_attrs, file_attrs=attrDict)

#ds = loompy.connect(args.output_loom_path)

#ds.layers['intron_counts'] = intron_expr_csr_coo
#ds.close()


def main():
description = """This script converts the some of the SmartSeq2 pipeline outputs in to
loom format (https://linnarssonlab.org/loompy/format/index.html) relevant output.
This script can be used as a module or run as a command line script."""

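    # Example invocation (file names below are illustrative, not produced by this script):
    #   python create_h5ad_snss2.py \
    #       --qc_files group_qc_outputs.csv \
    #       --count_results sample.exon_intron_counts.tsv \
    #       --output_h5ad_path sample \
    #       --input_id sample_id \
    #       --pipeline_version SmartSeq2SingleSample_v1.0.0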
parser = argparse.ArgumentParser(description=description)
parser.add_argument('--qc_files',
dest="qc_files",
nargs = "+",
help=('the grouped QC files from the GroupQCOutputs task of SS2 '
'Single Sample workflow'))

parser.add_argument('--count_results',
dest="count_results_file",
                        help='path to the intronic and exonic count and FPKM file to be added to the h5ad')

parser.add_argument('--output_h5ad_path',
dest="output_h5ad_path",
help='path where the h5ad file is to be created')

parser.add_argument('--input_id',
dest="input_id",
help='the sample name in the bundle')

parser.add_argument("--input_name",
dest="input_name",
help= "sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user",
)

parser.add_argument("--input_id_metadata_field",
dest="input_id_metadata_field",
help= "sequencing_process.provenance.document_id: [UUID] defined by the user",
)

parser.add_argument("--input_name_metadata_field",
dest="input_name_metadata_field",
help= "sequencing_input.biomaterial_core.biomaterial_id defined by the user",
)

parser.add_argument('--pipeline_version',
default="Unknown sample",
help='the version of SS2 used to generate data')

args = parser.parse_args()

create_h5ad_files(args)


if __name__ == '__main__':
main()
