Skip to content

Commit

Permalink
Only use a random subset of genes for generating core aln
Browse files Browse the repository at this point in the history
Added a `--core_subset` argument to every command with which a user
can choose to generate a core genome alignment.

When using large datasets the core genome alignment takes
a substantial amount of time, but perhaps a reasonable sample
would produce the same phylogenetic signal in a fraction of the
time
  • Loading branch information
mgalardini committed Dec 7, 2023
1 parent 18d7c9d commit ee655b5
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 6 deletions.
7 changes: 6 additions & 1 deletion panaroo/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,11 @@ def get_options(args):
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -512,7 +517,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(args.input_files),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down
14 changes: 11 additions & 3 deletions panaroo/generate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Bio.SeqRecord import SeqRecord
import itertools as iter
from tqdm import tqdm
import random

from .generate_alignments import *

Expand Down Expand Up @@ -321,12 +322,18 @@ def generate_pan_genome_alignment(G, temp_dir, output_dir, threads, aligner,
return


def get_core_gene_nodes(G, threshold, num_isolates):
def get_core_gene_nodes(G, threshold, num_isolates, subset=None):
# Get the core genes based on percent threshold
core_nodes = []
for node in G.nodes():
if float(G.nodes[node]["size"]) / float(num_isolates) >= threshold:
core_nodes.append(node)
if subset is not None:
if subset > len(core_nodes):
raise RuntimeError(f"Cannot subset core genes to {subset}, "
f"only {len(core_nodes)} are available!")
random.shuffle(core_nodes)
core_nodes = core_nodes[:subset]
return core_nodes

def update_col_counts(col_counts, s):
Expand Down Expand Up @@ -431,15 +438,16 @@ def concatenate_core_genome_alignments(core_names, output_dir, hc_threshold):


def generate_core_genome_alignment(
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold,
subset=None
):
# Make a folder for the output alignments TODO: decide whether or not to keep these
try:
os.mkdir(output_dir + "aligned_gene_sequences")
except FileExistsError:
None
# Get core nodes
core_genes = get_core_gene_nodes(G, threshold, num_isolates)
core_genes = get_core_gene_nodes(G, threshold, num_isolates, subset)
if len(core_genes) < 1:
print("No gene clusters were present above the core frequency"
" threshold! Try adjusting the '--core_threshold' parameter")
Expand Down
8 changes: 7 additions & 1 deletion panaroo/merge_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def merge_graphs(directories,
alr,
core,
hc_threshold,
subset=None,
merge_single=False,
depths=[1,2,3],
n_cpu=1,
Expand Down Expand Up @@ -421,7 +422,7 @@ def merge_graphs(directories,
if not quiet: print("generating core genome MSAs...")
generate_core_genome_alignment(G, temp_dir, output_dir, n_cpu, alr,
isolate_names, core, len(isolate_names),
hc_threshold)
hc_thresholdi, subset)
return


Expand Down Expand Up @@ -523,6 +524,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down
7 changes: 6 additions & 1 deletion panaroo/post_run_alignment_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -114,7 +119,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(isolate_names),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down

0 comments on commit ee655b5

Please sign in to comment.