From 3da9754241ff0d607097ffe63deb21df2d805f31 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Tue, 31 Oct 2023 16:33:51 -0700 Subject: [PATCH 1/3] In move_node, search for an existing child of the destination (new parent) that already has the same mutation(s) as the node to be moved. If a child with the same mutations is found then merge the moved node with the existing child instead of adding it as a new separate child. Handle all combinations of leaf or internal node type for existing child and moved node. Tree::collapse_tree used a breadth-first order for nodes which caused trouble when child nodes were removed due to merging during moves, and later visited in original BFS order. I changed it to recursively descend the tree and process the deepest nodes first. --- src/mutation_annotated_tree.cpp | 199 +++++++++++++++++++++++++------- src/mutation_annotated_tree.hpp | 8 ++ 2 files changed, 165 insertions(+), 42 deletions(-) diff --git a/src/mutation_annotated_tree.cpp b/src/mutation_annotated_tree.cpp index c10b598b..09a3e005 100644 --- a/src/mutation_annotated_tree.cpp +++ b/src/mutation_annotated_tree.cpp @@ -759,6 +759,27 @@ void Mutation_Annotated_Tree::Node::clear_annotations() { clade_annotations.clear(); } +Mutation_Annotated_Tree::Node* Mutation_Annotated_Tree::Node::find_child_with_muts(std::vector &muts) { + // If this node has a child with the same mutations as passed-in muts, then return that child; + // otherwise return NULL. This sorts muts and some children's mutations if not already sorted. + if (! std::is_sorted(muts.begin(), muts.end())) { + std::sort(muts.begin(), muts.end()); + } + size_t muts_size = muts.size(); + for (Mutation_Annotated_Tree::Node* child: children) { + if (child->mutations.size() == muts_size) { + if (muts_size > 1 && + ! std::is_sorted(child->mutations.begin(), child->mutations.end())) { + std::sort(child->mutations.begin(), child->mutations.end()); + } + if (child->mutations == muts) { + return child; + } + } + } + return NULL; +} + /* === Tree === */ size_t Mutation_Annotated_Tree::Tree::get_max_level () const { size_t max_level = 0; @@ -1088,32 +1109,115 @@ void Mutation_Annotated_Tree::Tree::remove_single_child_nodes() { } } +static void link_parent_child(Mutation_Annotated_Tree::Node *parent, + Mutation_Annotated_Tree::Node *child) { + // Establish parent-child links between the given nodes and invalidate child's branch length. + child->parent = parent; + child->branch_length = -1.0; + parent->children.push_back(child); +} + +static void remove_child(Mutation_Annotated_Tree::Tree *T, Mutation_Annotated_Tree::Node *parent, + Mutation_Annotated_Tree::Node *child, bool move_level) { + // Remove child from parent; if parent has no remaining children, remove parent from tree. + auto iter = std::find(parent->children.begin(), parent->children.end(), child); + if (iter == parent->children.end()) { + fprintf(stderr, "ERROR: child %s not found in parent %s's children\n", + child->identifier.c_str(), parent->identifier.c_str()); + exit(1); + } + parent->children.erase(iter); + if (parent->children.size() == 0) { + T->remove_node(parent->identifier, move_level); + } +} + void Mutation_Annotated_Tree::Tree::move_node (std::string source_id, std::string dest_id, bool move_level) { + // Move source to become a child of destination, update all affected nodes' parent pointers and + // child lists, and recalculate levels if necessary. Node* source = all_nodes[source_id]; Node* destination = all_nodes[dest_id]; Node* curr_parent = source->parent; + if (curr_parent == destination) { + fprintf(stderr, "ERROR: move_node: dest_id=%s but that is already parent of source_id=%s\n", + dest_id.c_str(), source_id.c_str()); + exit(1); + } + // Node(s) whose level will need to be recalculated after move + std::queue need_level_update; - source->parent = destination; - source->branch_length = -1.0; // Invalidate source branch length - - destination->children.push_back(source); - - // Remove source from curr_parent - auto iter = std::find(curr_parent->children.begin(), curr_parent->children.end(), source); - curr_parent->children.erase(iter); - if (curr_parent->children.size() == 0) { - remove_node(curr_parent->identifier, move_level); + // Check for existing child of destination with same mutations as source. + Node* dest_existing = destination->find_child_with_muts(source->mutations); + if (dest_existing == curr_parent || source->mutations.size() == 0) { + dest_existing = NULL; + } + if (dest_existing == NULL) { + // No match with current children of destination; source simply becomes child of destination. + link_parent_child(destination, source); + remove_child(this, curr_parent, source, move_level); + need_level_update.push(source); + } else { + // destination already has a child with the same non-empty set of mutations as source. + // If we simply move source to become a new child of destination then there will be + // duplicate children with the same mutations, so don't do that. The right thing to + // do depends on whether source and the existing child are leaf or internal nodes. + if (dest_existing->is_leaf()) { + if (source->is_leaf()) { + // Both source and existing child are leaves; make a new internal node child of + // destination with source->mutations and move the leaves to become its children + // with no additional mutations. + Node *new_internal = create_node(new_internal_node_id(), destination, -1.0); + for (auto mut: source->mutations) { + new_internal->add_mutation(mut); + } + source->mutations.clear(); + dest_existing->mutations.clear(); + link_parent_child(new_internal, source); + link_parent_child(new_internal, dest_existing); + remove_child(this, destination, dest_existing, move_level); + remove_child(this, curr_parent, source, move_level); + need_level_update.push(new_internal); + } else { + // source is an internal node and existing child is a leaf; move existing child + // into source with no additional mutations and move source into destination. + dest_existing->mutations.clear(); + link_parent_child(source, dest_existing); + link_parent_child(destination, source); + remove_child(this, destination, dest_existing, move_level); + remove_child(this, curr_parent, source, move_level); + need_level_update.push(source); + } + } else { + if (source->is_leaf()) { + // Existing child is an internal node and source is a leaf; move source into + // existing child with no additional mutations. + source->mutations.clear(); + link_parent_child(dest_existing, source); + remove_child(this, curr_parent, source, move_level); + need_level_update.push(source); + } else { + // Both source and existing child are internal nodes; move all of source's children + // to become children of destination's existing child. + // Make copy of source->children vector because the loop removes elements from it + auto source_children = source->children; + for (auto source_child: source_children) { + // It's not sufficient to link/remove/update here because some of these children + // might have the same mutations as existing children of dest_existing; recurse. + move_node(source_child->identifier, dest_existing->identifier, move_level); + } + // Removing all children causes source to be removed from tree, so no need to + // remove source from curr_parent. + } + } } - // Update levels of source descendants - std::queue remaining_nodes; - remaining_nodes.push(source); - while (remaining_nodes.size() > 0) { - Node* curr_node = remaining_nodes.front(); - remaining_nodes.pop(); + // Update levels of moved node(s) and descendants + while (need_level_update.size() > 0) { + Node* curr_node = need_level_update.front(); + need_level_update.pop(); curr_node->level = curr_node->parent->level + 1; for (auto c: curr_node->children) { - remaining_nodes.push(c); + need_level_update.push(c); } } } @@ -1277,37 +1381,48 @@ void Mutation_Annotated_Tree::Tree::uncondense_leaves() { condensed_leaves.clear(); } -void Mutation_Annotated_Tree::Tree::collapse_tree() { - auto bfs = breadth_first_expansion(); - - for (size_t idx = 1; idx < bfs.size(); idx++) { - auto node = bfs[idx]; - auto mutations = node->mutations; - if (mutations.size() == 0) { - auto parent = node->parent; - auto children = node->children; - for (auto child: children) { - move_node(child->identifier, parent->identifier, false); - } - } - //If internal node has one child, the child can be moved up one level - else if (node->children.size() == 1) { - auto child = node->children.front(); - // Preserve chronological order expected by add_mutation by adding child's mutations - // to node instead of vice versa, then move node's updated mutations to child. - for (auto m: child->mutations) { - node->add_mutation(m.copy()); +static void collapse_tree_r(Mutation_Annotated_Tree::Tree *T, Mutation_Annotated_Tree::Node *node) { + // Recursively find nodes with no mutations and move their children up to their parents. + // Moved nodes may be removed (if their mutations are redundant with an existing child of + // grandparent) so make changes starting from leafmost nodes back to root to avoid referencing + // removed nodes. + if (node->children.size() > 0) { + // Collapse each child of node before deciding what to do with node. + // Make a copy of node->children because it can be modified in the loop: + auto node_children = node->children; + for (auto child: node_children) { + collapse_tree_r(T, child); + } + auto parent = node->parent; + if (parent != NULL) { + if (node->mutations.size() == 0) { + auto node_children = node->children; + for (auto child: node_children) { + T->move_node(child->identifier, parent->identifier, false); + } } - child->mutations.clear(); - for (auto m: node->mutations) { - child->mutations.emplace_back(m.copy()); + //If internal node has one child, the child can be moved up one level + else if (node->children.size() == 1) { + auto child = node->children.front(); + // Preserve chronological order expected by add_mutation by adding child's mutations + // to node instead of vice versa, then move node's updated mutations to child. + for (auto m: child->mutations) { + node->add_mutation(m.copy()); + } + child->mutations.clear(); + for (auto m: node->mutations) { + child->mutations.emplace_back(m.copy()); + } + T->move_node(child->identifier, parent->identifier, false); } - auto parent = node->parent; - move_node(child->identifier, parent->identifier, false); } } } +void Mutation_Annotated_Tree::Tree::collapse_tree() { + collapse_tree_r(this, this->root); +} + void Mutation_Annotated_Tree::Tree::rotate_for_display(bool reverse) { auto dfs = depth_first_expansion(); diff --git a/src/mutation_annotated_tree.hpp b/src/mutation_annotated_tree.hpp index 4fcfaa6c..bfc2d4ef 100644 --- a/src/mutation_annotated_tree.hpp +++ b/src/mutation_annotated_tree.hpp @@ -51,6 +51,13 @@ struct Mutation { inline bool operator< (const Mutation& m) const { return ((*this).position < m.position); } + inline bool operator== (const Mutation& m) const { + return ((*this).position == m.position && + (*this).is_missing == m.is_missing && + (*this).chrom == m.chrom && + (*this).par_nuc == m.par_nuc && + (*this).mut_nuc == m.mut_nuc); + } inline Mutation copy() const { Mutation m; m.chrom = chrom; @@ -99,6 +106,7 @@ class Node { void add_mutation(Mutation mut); void clear_mutations(); void clear_annotations(); + Mutation_Annotated_Tree::Node* find_child_with_muts(std::vector &muts); }; class Tree { From 817202e4c503960b437c20b86a202b627c1b6adf Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Tue, 31 Oct 2023 16:36:29 -0700 Subject: [PATCH 2/3] 1. Performance tweak in restrictMutationsLocally (--mask-mutations): for each masked site, instead of copying all mutations that aren't masked, which is almost all mutations in the branch, make a usually-empty list of only the mutations that must be masked and erase those as needed. 2. In moveNodes (--move-nodes), instead of erroring out when the new parent's mutations are not exactly the same as he old parent's, also support the case of a new parent having a strict subset of the old parent's mutations. In that case, add the mutations that the new parent doesn't have to the node and move the node to become a child of the new parent. --- src/matUtils/mask.cpp | 155 +++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 41 deletions(-) diff --git a/src/matUtils/mask.cpp b/src/matUtils/mask.cpp index 7d365a12..371309f1 100644 --- a/src/matUtils/mask.cpp +++ b/src/matUtils/mask.cpp @@ -71,10 +71,16 @@ void mask_main(po::parsed_options parsed) { exit(1); } // Load input MAT and uncondense tree + fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str()); + timer.Start(); MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); //T here is the actual object. if (T.condensed_nodes.size() > 0) { + fprintf(stderr, "Uncondensing condensed nodes.\n"); + timer.Start(); T.uncondense_leaves(); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } // If a restricted samples file was provided, perform masking procedure @@ -103,12 +109,20 @@ void mask_main(po::parsed_options parsed) { // Store final MAT to output file if (output_mat_filename != "") { - fprintf(stderr, "Saving Final Tree\n"); if (recondense) { + timer.Start(); + fprintf(stderr, "Collapsing tree...\n"); T.collapse_tree(); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); + timer.Start(); + fprintf(stderr, "Condensing leaves...\n"); T.condense_leaves(); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } + fprintf(stderr, "Saving Final Tree to %s\n", output_mat_filename.c_str()); + timer.Start(); MAT::save_mutation_annotated_tree(T, output_mat_filename); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } } @@ -211,6 +225,8 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo delim = ','; } std::string rootid = T->root->identifier; + size_t total_masked = 0; + timer.Start(); while (std::getline(infile, line)) { std::vector words; if (line[line.size()-1] == '\r') { @@ -236,19 +252,27 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo } // fprintf(stderr, "Masking mutation %s below node %s\n", ml.first.c_str(), ml.second.c_str()); for (auto n: T->depth_first_expansion(rn)) { - std::vector nmuts; + // The expected common case is to not match any mutations and have nothing to remove. + std::vector muts_to_remove; for (auto& mut: n->mutations) { if (match_mutations(mutobj, &mut)) { instances_masked++; - } else { - nmuts.push_back(mut); + muts_to_remove.push_back(mut); } } - n->mutations = nmuts; + for (auto mut: muts_to_remove) { + auto iter = std::find(n->mutations.begin(), n->mutations.end(), mut); + n->mutations.erase(iter); + } } + total_masked += instances_masked; } + fprintf(stderr, "Completed in %ld msec \n", timer.Stop()); infile.close(); + fprintf(stderr, "Masked a total of %lu mutations. Collapsing tree...\n", total_masked); + timer.Start(); T->collapse_tree(); + fprintf(stderr, "Completed in %ld msec \n", timer.Stop()); } void restrictSamples (std::string samples_filename, MAT::Tree& T) { @@ -356,6 +380,43 @@ void restrictSamples (std::string samples_filename, MAT::Tree& T) { } } +std::unordered_set mutation_set_from_node(MAT::Tree* T, MAT::Node* node, bool include_node, bool include_ancestors) { + std::unordered_set mutations; + if (include_ancestors) { + for (auto an: T->rsearch(node->identifier, include_node)) { + for (auto mut: an->mutations) { + if (mut.is_masked()) { + continue; + } + MAT::Mutation mut_opposite = mut.copy(); + //we check for whether this mutation is going to negate with something in the set + //by identifying its opposite and checking whether the opposite is already present on the traversal. + mut_opposite.par_nuc = mut.mut_nuc; + mut_opposite.mut_nuc = mut.par_nuc; + auto cml = mutations.find(mut_opposite.get_string()); + if (cml != mutations.end()) { + mutations.erase(cml); + } else { + mutations.insert(mut.get_string()); + } + } + } + } else if (include_node) { + for (auto mut: node->mutations) { + if (mut.is_masked()) { + continue; + } + mutations.insert(mut.get_string()); + } + } else { + fprintf(stderr, "ERROR: mutation_set_from_node: at least one of include_node and include_ancestors should be true.\n"); + exit(1); + } + return mutations; +} + + + void moveNodes (std::string node_filename, MAT::Tree* T) { // Function to move nodes between two identical placement paths. That is, move the target node so that is a child of the indicated new parent node, // but the current placement and new placement must involve exactly the same set of mutations for the move to be allowed. @@ -394,49 +455,61 @@ void moveNodes (std::string node_filename, MAT::Tree* T) { } //accumulate the set of mutations belonging to the current and the new placement //not counting mutations belonging to the target, but counting ones to the putative new parent. - std::unordered_set curr_mutations; - std::unordered_set new_mutations; - for (auto an: T->rsearch(mn->identifier, false)) { - for (auto mut: an->mutations) { - if (mut.is_masked()) { - continue; - } - MAT::Mutation mut_opposite = mut.copy(); - //we check for whether this mutation is going to negate with something in the set - //by identifying its opposite and checking whether the opposite is already present on the traversal. - mut_opposite.par_nuc = mut.mut_nuc; - mut_opposite.mut_nuc = mut.par_nuc; - auto cml = curr_mutations.find(mut_opposite.get_string()); - if (cml != curr_mutations.end()) { - curr_mutations.erase(cml); + std::unordered_set curr_mutations = mutation_set_from_node(T, mn, false, true); + std::unordered_set new_mutations = mutation_set_from_node(T, np, true, true); + if (curr_mutations == new_mutations) { + //we can now proceed with the move. + T->move_node(mn->identifier, np->identifier); + fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str()); + } else { + // Not quite the same; figure out whether we need to add a node under new parent. + fprintf(stderr, "The current (%s) and new (%s) node paths do not involve the same set of mutations.\n", + mn->identifier.c_str(), np->identifier.c_str()); + + std::unordered_set extra_mutations; + size_t curr_in_new_count = 0; + for (auto mut: curr_mutations) { + if (new_mutations.find(mut) != new_mutations.end()) { + curr_in_new_count++; } else { - curr_mutations.insert(mut.get_string()); + extra_mutations.insert(mut); } } - } - for (auto an: T->rsearch(np->identifier, true)) { - for (auto mut: an->mutations) { - if (mut.is_masked()) { - continue; + if (extra_mutations.size() == 0 || curr_in_new_count != new_mutations.size()) { + fprintf(stderr, "ERROR: the new parent (%s) has mutations not found in the current node (%s); %ld in common, %ld in new\n", + np->identifier.c_str(), mn->identifier.c_str(), curr_in_new_count, new_mutations.size()); + exit(1); + } + // Look for a child of np that already has extra_mutations. If there is such a child + // then move mn to that child. Otherwise add those mutations to mn and move it to np. + MAT::Node *child_with_muts = NULL; + for (auto child: np->children) { + std::unordered_set mut_set = mutation_set_from_node(T, child, true, false); + if (mut_set == extra_mutations) { + child_with_muts = child; + fprintf(stderr, "Found child with extra_mutations: %s\n", child->identifier.c_str()); + break; } - MAT::Mutation mut_opposite = mut.copy(); - mut_opposite.par_nuc = mut.mut_nuc; - mut_opposite.mut_nuc = mut.par_nuc; - auto cml = new_mutations.find(mut_opposite.get_string()); - if (cml != new_mutations.end()) { - new_mutations.erase(cml); - } else { - new_mutations.insert(mut.get_string()); + } + if (child_with_muts != NULL) { + T->move_node(mn->identifier, child_with_muts->identifier); + } else { + // Preserve chronological order expected by add_mutation by adding mn's mutations + // after extra_mutations instead of vice versa. + std::vectormn_mutations; + for (auto mut: mn->mutations) { + mn_mutations.push_back(mut); + } + mn->mutations.clear(); + for (auto mut: extra_mutations) { + mn->add_mutation(*(MAT::mutation_from_string(mut))); } + for (auto mut: mn_mutations) { + mn->add_mutation(mut); + } + T->move_node(mn->identifier, np->identifier); } } - if (curr_mutations != new_mutations) { - fprintf(stderr, "ERROR: The current and new placement paths do not involve the same set of mutations. Exiting\n"); - exit(1); - } - //we can now proceed with the move. - T->move_node(mn->identifier, np->identifier); - fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str()); } fprintf(stderr, "All requested moves complete.\n"); } From d532e07fd2ca16284376078296585beb88379157 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Tue, 31 Oct 2023 16:32:00 -0700 Subject: [PATCH 3/3] New matUtils subcommand fix: finds nodes whose only mutation is a reversion of their grandparent's only mutation, a pattern that causes trouble for several Pango lineages, and moves those nodes to become children of their great-grandparents, having only the parent's mutation. --- src/matUtils/fix.cpp | 123 ++++++++++++++++++++++++++++++++++++++++++ src/matUtils/fix.hpp | 3 ++ src/matUtils/main.cpp | 8 ++- 3 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 src/matUtils/fix.cpp create mode 100644 src/matUtils/fix.hpp diff --git a/src/matUtils/fix.cpp b/src/matUtils/fix.cpp new file mode 100644 index 00000000..bc001f6b --- /dev/null +++ b/src/matUtils/fix.cpp @@ -0,0 +1,123 @@ +#include "fix.hpp" + +// Default: move node if it has at least 1 descendent +int default_min_descendent_count = 1; +int default_iterations = 1; + +po::variables_map parse_fix_command(po::parsed_options parsed) { + + po::variables_map vm; + po::options_description filt_desc("fix options"); + filt_desc.add_options() + ("input-mat,i", po::value()->required(), + "Input mutation-annotated tree file [REQUIRED]") + ("output-mat,o", po::value()->required(), + "Path to output fixed mutation-annotated tree file [REQUIRED]") + ("iterations,n", po::value()->default_value(default_iterations), + "Number of iterations to run") + ("min-descendent-count,c", po::value()->default_value(default_min_descendent_count), + "Minimum number of descendents required to move a node") + ("help,h", "Print help messages"); + // Collect all the unrecognized options from the first pass. This will include the + // (positional) command name, so we need to erase that. + std::vector opts = po::collect_unrecognized(parsed.options, po::include_positional); + opts.erase(opts.begin()); + + // Run the parser, with try/catch for help + try { + po::store(po::command_line_parser(opts) + .options(filt_desc) + .run(), vm); + po::notify(vm); + } catch(std::exception &e) { + std::cerr << filt_desc << std::endl; + // Return with error code 1 unless the user specifies help + if (vm.count("help")) + exit(0); + else + exit(1); + } + return vm; +} + +static int fix_grandparent_reversions_r(MAT::Tree *T, MAT::Node *node, MAT::Node *ggp_node, + MAT::Node *gp_node, MAT::Node *p_node, + int min_descendent_count) { + // Recursively scan the tree for cases of grandparent-reversion, i.e. N > A > B > revA; + // when a case like that is found, move the revA node to be a child of N having mutation B. + int descendent_count = 0; + // First recursively descend to children, looking each one up by identifier in case it has + // been removed by the time we get to it: + std::vector child_ids; + for (auto child: node->children) { + child_ids.push_back(child->identifier); + } + for (auto child_id: child_ids) { + MAT::Node *child = T->get_node(child_id); + if (child != NULL) { + descendent_count += fix_grandparent_reversions_r(T, child, gp_node, p_node, node, + min_descendent_count); + } + } + // Now determine whether this node has only one mutation that reverts its grandparent's only + // mutation; if so (and if parent has only one mut, othw parsimony score would increase), + // then move this node to be a child of its great-grandparent, with only its parent's mut. + if (ggp_node != NULL && node->mutations.size() == 1 && gp_node->mutations.size() == 1 && + p_node->mutations.size() == 1) { + MAT::Mutation node_mut = node->mutations.front(); + MAT::Mutation gp_mut = gp_node->mutations.front(); + if (node_mut.position == gp_mut.position && + node_mut.chrom == gp_mut.chrom && + node_mut.mut_nuc == gp_mut.par_nuc && + node_mut.par_nuc == gp_mut.mut_nuc && + descendent_count >= min_descendent_count) { + fprintf(stderr, "Node %s mutation %s reverts grandparent %s's %s%s, moving to %s with %s (%d descendents)\n", + node->identifier.c_str(), node_mut.get_string().c_str(), + gp_node->identifier.c_str(), + (gp_mut.mut_nuc == gp_mut.ref_nuc ? "reversion " : ""), + gp_mut.get_string().c_str(), + ggp_node->identifier.c_str(), + p_node->mutations.front().get_string().c_str(), descendent_count); + node->mutations.clear(); + node->mutations = std::vector(p_node->mutations); + T->move_node(node->identifier, ggp_node->identifier); + } + } + return descendent_count + 1; +} + +static void fix_grandparent_reversions(MAT::Tree *T, int iterations, int min_descendent_count) { + // Find and fix nodes that reverse their grandparent's mutation. + // For each case of N > A > B > revA, move revA to N > B (if N > B already exists then move all + // children of revA to N > B). + int i; + for (i = 0; i < iterations; i++) { + fix_grandparent_reversions_r(T, T->root, NULL, NULL, NULL, min_descendent_count); + } +} + +void fix_main(po::parsed_options parsed) { + po::variables_map vm = parse_fix_command(parsed); + std::string input_mat_filename = vm["input-mat"].as(); + std::string output_mat_filename = vm["output-mat"].as(); + int iterations = vm["iterations"].as(); + int min_descendent_count = vm["min-descendent-count"].as(); + + // Load the input MAT + timer.Start(); + fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str()); + MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); + + // No need to uncondense the tree + + timer.Start(); + fprintf(stderr, "Fixing grandparent-reversions\n"); + fix_grandparent_reversions(&T, iterations, min_descendent_count); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); + + timer.Start(); + fprintf(stderr, "Saving Final Tree\n"); + MAT::save_mutation_annotated_tree(T, output_mat_filename); + fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); +} diff --git a/src/matUtils/fix.hpp b/src/matUtils/fix.hpp new file mode 100644 index 00000000..d8a3368a --- /dev/null +++ b/src/matUtils/fix.hpp @@ -0,0 +1,3 @@ +#include "common.hpp" + +void fix_main(po::parsed_options parsed); diff --git a/src/matUtils/main.cpp b/src/matUtils/main.cpp index 0b0c60b3..9861a260 100644 --- a/src/matUtils/main.cpp +++ b/src/matUtils/main.cpp @@ -4,6 +4,7 @@ #include "extract.hpp" #include "merge.hpp" #include "introduce.hpp" +#include "fix.hpp" #include "version.hpp" #include @@ -12,7 +13,7 @@ Timer timer; int main (int argc, char** argv) { po::options_description global("Command options"); global.add_options() - ("command", po::value(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, and summary.") + ("command", po::value(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, introduce, fix, merge, version, and summary.") ("subargs", po::value >(), "Command-specific arguments."); po::positional_options_description pos; pos.add("command",1 ).add("subargs", -1); @@ -20,7 +21,7 @@ int main (int argc, char** argv) { po::variables_map vm; po::parsed_options parsed = po::command_line_parser(argc, argv).options(global).positional(pos).allow_unregistered().run(); //this help string shows up over and over, lets just define it once - std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "version"}; + std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "fix", "version"}; std::string chelp[] = { "DESCRIPTION\n\n", "calculates basic statistics and counts samples, mutations, and clades in the input MAT\n\n", @@ -30,6 +31,7 @@ int main (int argc, char** argv) { "given sample region information, heuristically identifies points of geographic introduction along the phylogeny\n\n", "merge all samples of two input MAT files into a single output MAT \n\n", "masks the input samples\n\n", + "fixes grandparent-reversion structures\n\n", "display version number\n\n" }; try { @@ -57,6 +59,8 @@ int main (int argc, char** argv) { introduce_main(parsed); } else if (cmd == "mask") { mask_main(parsed); + } else if (cmd == "fix") { + fix_main(parsed); } else if (cmd == "version") { std::cerr << "matUtils (v" << PROJECT_VERSION << ")" << std::endl; } else if (cmd == "help") {