diff --git a/tools/TagSort/src/metricgatherer.cpp b/tools/TagSort/src/metricgatherer.cpp index 73172ae0..7d8d25e7 100644 --- a/tools/TagSort/src/metricgatherer.cpp +++ b/tools/TagSort/src/metricgatherer.cpp @@ -11,8 +11,19 @@ std::string to_nan(float x) return x==-1 ? "nan" : s.str(); } -MetricGatherer::MetricGatherer(std::string metric_output_file) +MetricGatherer::MetricGatherer(std::string metric_output_file, + TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename) + : tag_order_(tag_order) { + if (gtf_file.empty()) + crash("MetricGatherer needs a non-empty gtf_file name!"); + // it's ok if mitochondrial_gene_names_filename is empty; + // getInterestingMitochondrialGenes() has logic to handle that case. + mito_genes_ = getInterestingMitochondrialGenes( + gtf_file, mitochondrial_gene_names_filename); + metrics_csv_outfile_.open(metric_output_file); if (!metrics_csv_outfile_) crash("Failed to open for writing " + metric_output_file); @@ -47,9 +58,29 @@ void MetricGatherer::clearCellAndGeneCommon() spliced_reads_ = 0; } +bool MetricGatherer::isMitochondrial(LineFields const& fields) const +{ + switch (tag_order_) + { + case TagOrder::GUB: + case TagOrder::GBU: + return mito_genes_.find(fields.tag_triple.first) != mito_genes_.end(); + + case TagOrder::BGU: + case TagOrder::UGB: + return mito_genes_.find(fields.tag_triple.second) != mito_genes_.end(); + + case TagOrder::UBG: + case TagOrder::BUG: + return mito_genes_.find(fields.tag_triple.third) != mito_genes_.end(); + } + crash("unknown TagOrder value"); + return false; +} + void MetricGatherer::ingestLineCellAndGeneCommon(LineFields const& fields) { - n_reads_++; + n_reads_++; //with/without mt? == uniquely + multimapped // the tags passed to this function define a molecule, this increments the counter, // identifying a new molecule only if a new tag combination is observed @@ -78,18 +109,22 @@ void MetricGatherer::parseAlignedReadFields(LineFields const& fields, std::strin is_strand + "\t" + hyphenated_tags; fragment_histogram_[ref_pos_str_tags] += 1; - if (fields.number_mappings == 1) { - reads_mapped_uniquely_ += 1; - if (fields.alignment_location == 1 || fields.alignment_location == 3) - reads_mapped_exonic_ += 1; - else if (fields.alignment_location == 2 || fields.alignment_location == 4) - reads_mapped_exonic_as_ += 1; - else if (fields.alignment_location == 5) - reads_mapped_intronic_ += 1; - else if (fields.alignment_location == 6) - reads_mapped_intronic_as_ += 1; } - else { - reads_mapped_multiple_ += 1; // without multi-mapping, this number is zero! + if (!isMitochondrial(fields)) + { + if (fields.number_mappings == 1) + { + reads_mapped_uniquely_ += 1; + if (fields.alignment_location == 1 || fields.alignment_location == 3) + reads_mapped_exonic_ += 1; + else if (fields.alignment_location == 2 || fields.alignment_location == 4) + reads_mapped_exonic_as_ += 1; + else if (fields.alignment_location == 5) + reads_mapped_intronic_ += 1; + else if (fields.alignment_location == 6) + reads_mapped_intronic_as_ += 1; + } + else + reads_mapped_multiple_ += 1; // without multi-mapping, this number is zero! } // in futher check if read maps outside window (when we add a gene model) @@ -157,17 +192,11 @@ void MetricGatherer::outputMetricsLineCellAndGeneCommon() //////////////// CellMetricGatherer //////////////////////// CellMetricGatherer::CellMetricGatherer(std::string metric_output_file, + TagOrder tag_order, std::string gtf_file, std::string mitochondrial_gene_names_filename) - : MetricGatherer(metric_output_file) + : MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename) { - if (gtf_file.empty()) - crash("CellMetricGatherer needs a non-empty gtf_file name!"); - // it's ok if mitochondrial_gene_names_filename is empty; - // getInterestingMitochondrialGenes() has logic to handle that case. - mitochondrial_genes_ = getInterestingMitochondrialGenes( - gtf_file, mitochondrial_gene_names_filename); - // write metrics csv header std::string s; for (int i=0; i<25; i++) @@ -207,13 +236,11 @@ void CellMetricGatherer::ingestLine(std::string const& str) cell_barcode_fraction_bases_above_30_.update(fields.cell_barcode_base_above_30); perfect_cell_barcodes_ += fields.cell_barcode_perfect; - if (fields.alignment_location == 7) { - if (fields.number_mappings == 1) - reads_mapped_intergenic_ += 1; - } - else if(fields.alignment_location == 0) { + bool is_mito = isMitochondrial(fields); + if (fields.alignment_location == 7 && fields.number_mappings == 1 && !is_mito) + reads_mapped_intergenic_ += 1; + else if (fields.alignment_location == 0) reads_unmapped_ += 1; - } genes_histogram_[std::string(fields.tag_triple.third)] += 1; // END cell-metric-specific stuff @@ -242,7 +269,7 @@ void CellMetricGatherer::outputMetricsLine() { if (count > 1) genes_detected_multiple_observations++; - if (mitochondrial_genes_.find(gene) != mitochondrial_genes_.end()) + if (mito_genes_.find(gene) != mito_genes_.end()) { n_mitochondrial_genes++; n_mitochondrial_molecules += count; @@ -286,8 +313,11 @@ void CellMetricGatherer::clear() //////////////// GeneMetricGatherer //////////////////////// -GeneMetricGatherer::GeneMetricGatherer(std::string metric_output_file) - : MetricGatherer(metric_output_file) +GeneMetricGatherer::GeneMetricGatherer(std::string metric_output_file, + TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename) + : MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename) { // write metrics csv header std::string s; @@ -351,8 +381,11 @@ void GeneMetricGatherer::clear() //////////////// UmiMetricGatherer //////////////////////// -UmiMetricGatherer::UmiMetricGatherer(std::string metric_output_file, TagOrder tag_order) - : MetricGatherer(metric_output_file) +UmiMetricGatherer::UmiMetricGatherer(std::string metric_output_file, + TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename) + : MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename) { metrics_csv_outfile_ << tagOrderToString(tag_order) << ",count\n"; } diff --git a/tools/TagSort/src/metricgatherer.h b/tools/TagSort/src/metricgatherer.h index 67bf0a4f..3cd1a47c 100644 --- a/tools/TagSort/src/metricgatherer.h +++ b/tools/TagSort/src/metricgatherer.h @@ -62,7 +62,9 @@ class OnlineGaussianSufficientStatistic class MetricGatherer { public: - MetricGatherer(std::string metric_output_file); + MetricGatherer(std::string metric_output_file, TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename); virtual ~MetricGatherer(); virtual void ingestLine(std::string const& str) = 0; @@ -77,6 +79,7 @@ class MetricGatherer void ingestLineCellAndGeneCommon(LineFields const& fields); void outputMetricsLineCellAndGeneCommon(); void clearCellAndGeneCommon(); + bool isMitochondrial(LineFields const& fields) const; const std::string kCommonHeaders[25] = { @@ -109,6 +112,9 @@ class MetricGatherer void parseAlignedReadFields(LineFields const& fields, std::string hyphenated_tags); + std::unordered_set mito_genes_; + const TagOrder tag_order_; + std::unordered_map molecule_histogram_; std::ofstream metrics_csv_outfile_; @@ -150,12 +156,15 @@ class MetricGatherer int spliced_reads_ = 0; const int kAntisenseReads = 0; // TODO is never changed from 0 // int plus_strand_reads_ = 0; // strand balance (currently unused) + // test for mt -- will need to remove + int n_mitochondrial_reads_ = 0; }; class CellMetricGatherer: public MetricGatherer { public: CellMetricGatherer(std::string metric_output_file, + TagOrder tag_order, std::string gtf_file, std::string mitochondrial_gene_names_filename); void ingestLine(std::string const& str) override; @@ -165,8 +174,6 @@ class CellMetricGatherer: public MetricGatherer void clear() override; private: - std::unordered_set mitochondrial_genes_; - int perfect_cell_barcodes_ = 0; // The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) int reads_mapped_intergenic_ = 0; // The number of reads mapped to an intergenic region for this cell @@ -203,7 +210,10 @@ class CellMetricGatherer: public MetricGatherer class GeneMetricGatherer: public MetricGatherer { public: - explicit GeneMetricGatherer(std::string metric_output_file); + GeneMetricGatherer(std::string metric_output_file, + TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename); void ingestLine(std::string const& str) override; @@ -224,7 +234,10 @@ class GeneMetricGatherer: public MetricGatherer class UmiMetricGatherer: public MetricGatherer { public: - UmiMetricGatherer(std::string metric_output_file, TagOrder tag_order); + UmiMetricGatherer(std::string metric_output_file, + TagOrder tag_order, + std::string gtf_file, + std::string mitochondrial_gene_names_filename); void ingestLine(std::string const& str) override; void outputMetricsLine() override; diff --git a/tools/TagSort/src/partial_file_merge.cpp b/tools/TagSort/src/partial_file_merge.cpp index a886ae93..52cc0f8b 100644 --- a/tools/TagSort/src/partial_file_merge.cpp +++ b/tools/TagSort/src/partial_file_merge.cpp @@ -106,15 +106,22 @@ std::unique_ptr maybeMakeMetricGatherer(INPUT_OPTIONS_TAGSORT co if (options.metric_type == MetricType::Cell) { return std::make_unique( - options.metric_output_file, options.gtf_file, + options.metric_output_file, getTagOrder(options), + options.gtf_file, options.mitochondrial_gene_names_filename); + } + if (options.metric_type == MetricType::Gene) + { + return std::make_unique( + options.metric_output_file, getTagOrder(options), options.gtf_file, + options.mitochondrial_gene_names_filename); + } + if (options.metric_type == MetricType::Umi) + { + return std::make_unique( + options.metric_output_file, getTagOrder(options), options.gtf_file, options.mitochondrial_gene_names_filename); } - else if (options.metric_type == MetricType::Gene) - return std::make_unique(options.metric_output_file); - else if (options.metric_type == MetricType::Umi) - return std::make_unique(options.metric_output_file, getTagOrder(options)); - else - crash("new MetricType enum value is not yet handled by MetricGatherer!"); + crash("new MetricType enum value is not yet handled by MetricGatherer!"); return nullptr; }