Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PD-2474: Remove reads from MT genes in Tagsort metrics #100

Merged
merged 45 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
c96929a
Playing around with mitochondrial gene unordered set
aawdeh Jan 11, 2024
e187078
Fixed constructors for GeneMetricGatherer and UmiMetricGatherer
aawdeh Jan 11, 2024
1b0a9f4
modify maybeMakeMetricGatherer in partial_file_merge
aawdeh Jan 11, 2024
06a5e90
Add print statements
aawdeh Jan 12, 2024
7090092
Add print statements
aawdeh Jan 12, 2024
466a716
Add print statements
aawdeh Jan 12, 2024
9b73358
Add condition for MT gene for intergenic
aawdeh Jan 16, 2024
66ca25d
Removed extra {
aawdeh Jan 16, 2024
fd1aa1a
commented out mt part for intergenic
aawdeh Jan 16, 2024
83c80b4
Added mt condition to not include unmapped in intergenic condition
aawdeh Jan 16, 2024
0af207b
Remove print statement
aawdeh Jan 18, 2024
8a7e32d
Add print statements
aawdeh Jan 18, 2024
2a4d7fd
Add print statements
aawdeh Jan 18, 2024
99facc8
Add print statements
aawdeh Jan 18, 2024
366f48c
Add print
aawdeh Jan 19, 2024
f99a019
Add tagorder in constructors
aawdeh Jan 22, 2024
aea51fe
Tagorder constructor again
aawdeh Jan 22, 2024
e320bae
Add tagorder in calls for different classes cell and gene
aawdeh Jan 22, 2024
6ed9477
test
aawdeh Jan 22, 2024
6ea0d09
test
aawdeh Jan 22, 2024
86ac539
test
aawdeh Jan 22, 2024
38b42f8
test
aawdeh Jan 22, 2024
91259ff
test
aawdeh Jan 22, 2024
dfa5b8a
test
aawdeh Jan 22, 2024
aedc43e
Changed G to gene_id
aawdeh Jan 22, 2024
0313bbc
test
aawdeh Jan 22, 2024
0d5dce9
test
aawdeh Jan 22, 2024
34e5839
test
aawdeh Jan 23, 2024
c9a5420
test
aawdeh Jan 23, 2024
17b26fa
added get functions in parent metric class
aawdeh Jan 23, 2024
1857d30
test
aawdeh Jan 23, 2024
0fc00be
test
aawdeh Jan 23, 2024
40ad27c
test
aawdeh Jan 23, 2024
d729d35
Remove/adjust print statements
aawdeh Jan 23, 2024
56e2081
Remove print statements
aawdeh Jan 24, 2024
9a94dd3
Fixed cell specific header
aawdeh Jan 25, 2024
317967a
test n_mitochondrial_reads
aawdeh Jan 31, 2024
9ee05cc
remove print statement
aawdeh Jan 31, 2024
951c7d9
test
aawdeh Feb 1, 2024
16e5c06
changed how tagorder is computed (#104)
khajoue2 Feb 1, 2024
a213288
fix name
aawdeh Feb 2, 2024
2f10191
test
aawdeh Feb 4, 2024
71022f6
set n_mitochondrial_reads_ to 0 when clearing
aawdeh Feb 5, 2024
57b3d4d
Add is not mito do intergenic condition
aawdeh Feb 6, 2024
32a2095
Remove n_mitochondrial_reads
aawdeh Feb 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 119 additions & 38 deletions tools/TagSort/src/metricgatherer.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#include "metricgatherer.h"

#include "mitochondrial_gene_selector.h"

#include <map>
#include <iostream>
#include <sstream>
#include <string>

constexpr int kMetricsFloatPrintPrecision = 10;

std::string to_nan(float x)
Expand All @@ -11,8 +15,23 @@ std::string to_nan(float x)
return x==-1 ? "nan" : s.str();
}

MetricGatherer::MetricGatherer(std::string metric_output_file)
MetricGatherer::MetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename)
{

//Get and set gene_id position in tag_order. gene_id position varies depending on user input.
setGeneIdPosition(tag_order);

// get list of mitochondrial genes
if (gtf_file.empty())
crash("MetricGatherer needs a non-empty gtf_file name!");
// it's ok if mitochondrial_gene_names_filename is empty;
// getInterestingMitochondrialGenes() has logic to handle that case.
mitochondrial_genes_ = getInterestingMitochondrialGenes(
gtf_file, mitochondrial_gene_names_filename);

metrics_csv_outfile_.open(metric_output_file);
if (!metrics_csv_outfile_)
crash("Failed to open for writing " + metric_output_file);
Expand Down Expand Up @@ -49,8 +68,21 @@ void MetricGatherer::clearCellAndGeneCommon()

void MetricGatherer::ingestLineCellAndGeneCommon(LineFields const& fields)
{
n_reads_++;

n_reads_++; //with/without mt? == uniquely + multimapped

//-----------------------------------------------------------------------
// Will remove this
// std::cout << "TEST -- to increment number of n_mitochondrial_reads -- \n";
std::map<size_t, std::string> indexToField_TagOrder = {
{0, fields.tag_triple.first},
{1, fields.tag_triple.second},
{2, fields.tag_triple.third}};
std::string gene_id = indexToField_TagOrder[geneid_position];
if (mitochondrial_genes_.find(gene_id) != mitochondrial_genes_.end()){
n_mitochondrial_reads+=1;
}
//-----------------------------------------------------------------------

// the tags passed to this function define a molecule, this increments the counter,
// identifying a new molecule only if a new tag combination is observed
std::string hyphenated_tags = fields.tag_triple.first + "-" +
Expand Down Expand Up @@ -78,20 +110,29 @@ void MetricGatherer::parseAlignedReadFields(LineFields const& fields, std::strin
is_strand + "\t" + hyphenated_tags;
fragment_histogram_[ref_pos_str_tags] += 1;

if (fields.number_mappings == 1) {
reads_mapped_uniquely_ += 1;
if (fields.alignment_location == 1 || fields.alignment_location == 3)
reads_mapped_exonic_ += 1;
else if (fields.alignment_location == 2 || fields.alignment_location == 4)
reads_mapped_exonic_as_ += 1;
else if (fields.alignment_location == 5)
reads_mapped_intronic_ += 1;
else if (fields.alignment_location == 6)
reads_mapped_intronic_as_ += 1; }
else {
reads_mapped_multiple_ += 1; // without multi-mapping, this number is zero!
std::map<size_t, std::string> indexToField_TagOrder = {
{0, fields.tag_triple.first},
{1, fields.tag_triple.second},
{2, fields.tag_triple.third}};
std::string gene_id = indexToField_TagOrder[geneid_position];

// Check if not a mitochondrial gene
if (!(mitochondrial_genes_.find(gene_id) != mitochondrial_genes_.end())) {
if (fields.number_mappings == 1) {
reads_mapped_uniquely_ += 1;
if (fields.alignment_location == 1 || fields.alignment_location == 3)
reads_mapped_exonic_ += 1;
else if (fields.alignment_location == 2 || fields.alignment_location == 4)
reads_mapped_exonic_as_ += 1;
else if (fields.alignment_location == 5)
reads_mapped_intronic_ += 1;
else if (fields.alignment_location == 6)
reads_mapped_intronic_as_ += 1; }
else {
reads_mapped_multiple_ += 1; // without multi-mapping, this number is zero!
}
}

// in futher check if read maps outside window (when we add a gene model)
// and create distances from terminate side (needs gene model) uniqueness
duplicate_reads_ += fields.read_is_duplicate;
Expand Down Expand Up @@ -148,26 +189,46 @@ void MetricGatherer::outputMetricsLineCellAndGeneCommon()
<< reads_per_fragment << ","
<< fragments_per_molecule << ","
<< fragments_with_single_read_evidence << ","
<< molecules_with_single_read_evidence;
<< molecules_with_single_read_evidence << ","
<< n_mitochondrial_reads;
}

void MetricGatherer::setGeneIdPosition(TagOrder tag_order) {
// This function gets the gene_id position from the tag_order object

//tagOrderToString return gene_id,barcode,umi where order depends on user input
std::istringstream tag_order_i(tagOrderToString(tag_order));
std::string token;
int geneid_position_i = -1;

// Tokenize tag_order_str and check positions
for (int i = 0; std::getline(tag_order_i, token, ','); ++i) {
if (token == "gene_id" && (i == 0 || i == 1 || i == 2)) {
geneid_position_i = i;
std::cout << "'gene_id' is at position: " << geneid_position_i << std::endl;
break;
}
}
std::cout << "'gene_id' is at position outside of for loop: " << geneid_position_i << std::endl;
geneid_position = geneid_position_i;
}

int MetricGatherer::getGeneIdPosition() {
return geneid_position;
}

std::unordered_set<std::string> MetricGatherer::getMTgenes() {
return mitochondrial_genes_;
}

//////////////// CellMetricGatherer ////////////////////////

CellMetricGatherer::CellMetricGatherer(std::string metric_output_file,
std::string gtf_file,
std::string mitochondrial_gene_names_filename)
: MetricGatherer(metric_output_file)
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename)
: MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename)
{
if (gtf_file.empty())
crash("CellMetricGatherer needs a non-empty gtf_file name!");
// it's ok if mitochondrial_gene_names_filename is empty;
// getInterestingMitochondrialGenes() has logic to handle that case.
mitochondrial_genes_ = getInterestingMitochondrialGenes(
gtf_file, mitochondrial_gene_names_filename);

// write metrics csv header
std::string s;
for (int i=0; i<25; i++)
Expand Down Expand Up @@ -207,13 +268,26 @@ void CellMetricGatherer::ingestLine(std::string const& str)
cell_barcode_fraction_bases_above_30_.update(fields.cell_barcode_base_above_30);
perfect_cell_barcodes_ += fields.cell_barcode_perfect;

// need to change this
// tag_order_str is a combination of BGU so find order of where gene_id is in gene_id,barcode,umi
mitochondrial_genes_ = getMTgenes();
geneid_position = getGeneIdPosition();

std::map<size_t, std::string> indexToField_TagOrder = {
{0, fields.tag_triple.first},
{1, fields.tag_triple.second},
{2, fields.tag_triple.third}};

std::string gene_id = indexToField_TagOrder[getGeneIdPosition()];

if (fields.alignment_location == 7) {
if (fields.number_mappings == 1)
reads_mapped_intergenic_ += 1;
}
else if(fields.alignment_location == 0) {
reads_unmapped_ += 1;
}
if (fields.number_mappings == 1)
if (!(mitochondrial_genes_.find(gene_id) != mitochondrial_genes_.end()))
reads_mapped_intergenic_ += 1;
}
else if(fields.alignment_location == 0) {
reads_unmapped_ += 1;
}

genes_histogram_[std::string(fields.tag_triple.third)] += 1;
// END cell-metric-specific stuff
Expand Down Expand Up @@ -286,9 +360,13 @@ void CellMetricGatherer::clear()


//////////////// GeneMetricGatherer ////////////////////////
GeneMetricGatherer::GeneMetricGatherer(std::string metric_output_file)
: MetricGatherer(metric_output_file)
GeneMetricGatherer::GeneMetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename)
: MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename)
{

// write metrics csv header
std::string s;
for (int i=0; i<25; i++)
Expand Down Expand Up @@ -351,8 +429,11 @@ void GeneMetricGatherer::clear()


//////////////// UmiMetricGatherer ////////////////////////
UmiMetricGatherer::UmiMetricGatherer(std::string metric_output_file, TagOrder tag_order)
: MetricGatherer(metric_output_file)
UmiMetricGatherer::UmiMetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename)
: MetricGatherer(metric_output_file, tag_order, gtf_file, mitochondrial_gene_names_filename)
{
metrics_csv_outfile_ << tagOrderToString(tag_order) << ",count\n";
}
Expand Down
43 changes: 34 additions & 9 deletions tools/TagSort/src/metricgatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,18 @@ class OnlineGaussianSufficientStatistic
class MetricGatherer
{
public:
MetricGatherer(std::string metric_output_file);
MetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename);
virtual ~MetricGatherer();

virtual void ingestLine(std::string const& str) = 0;
virtual void outputMetricsLine() = 0;

void setGeneIdPosition(TagOrder tag_order);
int getGeneIdPosition();
std::unordered_set<std::string> getMTgenes();

protected:
// Each line of metric output is built from all alignments with a given tag.
Expand All @@ -78,7 +85,7 @@ class MetricGatherer
void outputMetricsLineCellAndGeneCommon();
void clearCellAndGeneCommon();

const std::string kCommonHeaders[25] =
const std::string kCommonHeaders[26] =
{
"n_reads",
"noise_reads",
Expand All @@ -104,7 +111,8 @@ class MetricGatherer
"reads_per_fragment",
"fragments_per_molecule",
"fragments_with_single_read_evidence",
"molecules_with_single_read_evidence"
"molecules_with_single_read_evidence",
"n_mitochondrial_reads"
};

void parseAlignedReadFields(LineFields const& fields, std::string hyphenated_tags);
Expand All @@ -114,6 +122,11 @@ class MetricGatherer

std::string prev_tag_;

// Unordered set of mitochondrial genes
std::unordered_set<std::string> mitochondrial_genes_;
// Integer to tell us where geneid_position
int geneid_position;

private:
// count information
int n_reads_ = 0;
Expand Down Expand Up @@ -150,22 +163,28 @@ class MetricGatherer
int spliced_reads_ = 0;
const int kAntisenseReads = 0; // TODO is never changed from 0
// int plus_strand_reads_ = 0; // strand balance (currently unused)
// test for mt -- will need to remove
int n_mitochondrial_reads = 0;
};

class CellMetricGatherer: public MetricGatherer
{
public:
CellMetricGatherer(std::string metric_output_file,
std::string gtf_file,
std::string mitochondrial_gene_names_filename);
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename);
void ingestLine(std::string const& str) override;
void outputMetricsLine() override;

protected:
void clear() override;

private:
// Unordered set of mitochondrial genes
std::unordered_set<std::string> mitochondrial_genes_;
// Integer to tell us where geneid_position
int geneid_position;

int perfect_cell_barcodes_ = 0; // The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``)
int reads_mapped_intergenic_ = 0; // The number of reads mapped to an intergenic region for this cell
Expand Down Expand Up @@ -195,15 +214,18 @@ class CellMetricGatherer: public MetricGatherer
"genes_detected_multiple_observations",
"n_mitochondrial_genes",
"n_mitochondrial_molecules",
"pct_mitochondrial_molecules"
};
"pct_mitochondrial_molecules"
};
};


class GeneMetricGatherer: public MetricGatherer
{
public:
explicit GeneMetricGatherer(std::string metric_output_file);
explicit GeneMetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename);

void ingestLine(std::string const& str) override;

Expand All @@ -224,7 +246,10 @@ class GeneMetricGatherer: public MetricGatherer
class UmiMetricGatherer: public MetricGatherer
{
public:
UmiMetricGatherer(std::string metric_output_file, TagOrder tag_order);
UmiMetricGatherer(std::string metric_output_file,
TagOrder tag_order,
std::string gtf_file,
std::string mitochondrial_gene_names_filename);
void ingestLine(std::string const& str) override;
void outputMetricsLine() override;

Expand Down
10 changes: 6 additions & 4 deletions tools/TagSort/src/partial_file_merge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,15 @@ std::unique_ptr<MetricGatherer> maybeMakeMetricGatherer(INPUT_OPTIONS_TAGSORT co
if (options.metric_type == MetricType::Cell)
{
return std::make_unique<CellMetricGatherer>(
options.metric_output_file, options.gtf_file,
options.mitochondrial_gene_names_filename);
options.metric_output_file, getTagOrder(options),
options.gtf_file, options.mitochondrial_gene_names_filename);
}
else if (options.metric_type == MetricType::Gene)
return std::make_unique<GeneMetricGatherer>(options.metric_output_file);
return std::make_unique<GeneMetricGatherer>(options.metric_output_file, getTagOrder(options),
options.gtf_file, options.mitochondrial_gene_names_filename);
else if (options.metric_type == MetricType::Umi)
return std::make_unique<UmiMetricGatherer>(options.metric_output_file, getTagOrder(options));
return std::make_unique<UmiMetricGatherer>(options.metric_output_file, getTagOrder(options),
options.gtf_file, options.mitochondrial_gene_names_filename);
else
crash("new MetricType enum value is not yet handled by MetricGatherer!");
return nullptr;
Expand Down
Loading