From 2fd261790afb692d7bd61407d11de36210d21239 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Fri, 27 Sep 2024 17:29:11 -0700 Subject: [PATCH] matUtils extract --reroot: retain rerooted leaf, rename leaf from former internal node This addresses two corner cases that we hadn't thought through yet for --reroot: * First, when rerooting to a leaf node, that node was converted into an internal node (the new root), losing its leaf status, so when the result was written out, the tree no longer had that leaf. To fix this, when rerooting to a leaf, we add a new leaf with the same name on the new root node. * Second, when the old root had no leaf children and only one internal node child, after rerooting it became a new leaf node, keeping its original ID of 'node_1'. However, the saved protobuf could not be read back in by matUtils because when parsing the Newick string, 'node_1' is assigned to the first node encountered, and it is an error for any subsequent node to have the name 'node_1' (even a leaf) because node_1 is already in the tree. To prevent that error, when the old root becomes a leaf node, we change the old root's name to former_root -- unless there is already a former_root in the tree, in which case we make a new former_root_<N> (N being an integer suffix) that does not already exist in the tree. --- src/matUtils/filter.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/matUtils/filter.cpp b/src/matUtils/filter.cpp index e6f06428..bef68cc5 100644 --- a/src/matUtils/filter.cpp +++ b/src/matUtils/filter.cpp @@ -170,6 +170,9 @@ void reroot_tree(MAT::Tree* T, std::string rnid) { fprintf(stderr, "ERROR: New root selection not found in tree. Exiting\n"); exit(1); } + // The new root will not be a leaf when we are done, but keep track of whether it started + // as a leaf. 
+ bool new_root_was_leaf = (norder[0]->children.size() == 0); if (T->root->mutations.size() != 0) { // The way mutations are handled below assumes that the root and reference are synonymous, // and as we change the rooting of the tree, we change the mutations to reflect that the @@ -221,6 +224,30 @@ void reroot_tree(MAT::Tree* T, std::string rnid) { mut.ref_nuc = mut.par_nuc; } } + // If the new root, norder.back(), was a leaf, then its identity would be lost by changing it + // to an internal node because internal node IDs are not written when saving. Also, the tree + // now has one fewer leaf. Add a new leaf on root to keep the original leaf identity. + MAT::Node* new_root = norder.back(); + if (new_root_was_leaf) { + fprintf(stderr, "New root was a leaf node; retaining it as leaf node on new root internal node.\n"); + T->rename_node(rnid, "new_root_" + rnid); + T->create_node(rnid, new_root, 0.0); + } + // If the original root, norder[0], has no remaining children, then it has changed from an + // internal node (with name node_1) to a leaf node. But if a leaf node has the name node_1, + // there will be a fatal error from MAT::create_node (node_1 already in the tree) during + // parsing of the Newick string. To prevent that error, assign a new name to the new leaf node. + MAT::Node* old_root = norder[0]; + if (old_root->children.size() == 0) { + std::string new_name = "former_root"; + int uid = 1; + while (T->get_node(new_name) != NULL) { + new_name = "former_root_" + std::to_string(uid++); + } + fprintf(stderr, "Former root has become a leaf node; assigning new name '%s'.\n", + new_name.c_str()); + T->rename_node(old_root->identifier, new_name); + } assert (T->get_node(rnid)->is_root()); apply_ref_changes(T->root, ref_changes); }