diff --git a/panaroo/__main__.py b/panaroo/__main__.py index 5ffb6a2..78bc8f1 100755 --- a/panaroo/__main__.py +++ b/panaroo/__main__.py @@ -235,6 +235,11 @@ def get_options(args): help="Core-genome sample threshold (default=0.95)", type=float, default=0.95) + core.add_argument("--core_subset", + dest="subset", + help="Subset the core genome to these many random genes (default=all)", + type=int, + default=None) core.add_argument("--core_entropy_filter", dest="hc_threshold", help=("Manually set the Block Mapping and Gathering with " + @@ -512,7 +517,7 @@ def main(): generate_core_genome_alignment(G, temp_dir, args.output_dir, args.n_cpu, args.alr, isolate_names, args.core, args.codons, len(args.input_files), - args.hc_threshold) + args.hc_threshold, args.subset) # remove temporary directory shutil.rmtree(temp_dir) diff --git a/panaroo/generate_output.py b/panaroo/generate_output.py index c65cbb1..928538b 100755 --- a/panaroo/generate_output.py +++ b/panaroo/generate_output.py @@ -9,6 +9,7 @@ from Bio.SeqRecord import SeqRecord import itertools as iter from tqdm import tqdm +import random from .generate_alignments import * @@ -321,12 +322,18 @@ def generate_pan_genome_alignment(G, temp_dir, output_dir, threads, aligner, return -def get_core_gene_nodes(G, threshold, num_isolates): +def get_core_gene_nodes(G, threshold, num_isolates, subset=None): # Get the core genes based on percent threshold core_nodes = [] for node in G.nodes(): if float(G.nodes[node]["size"]) / float(num_isolates) >= threshold: core_nodes.append(node) + if subset is not None: + if subset > len(core_nodes): + raise RuntimeError(f"Cannot subset core genes to {subset}, " + f"only {len(core_nodes)} are available!") + random.shuffle(core_nodes) + core_nodes = core_nodes[:subset] return core_nodes def update_col_counts(col_counts, s): @@ -431,7 +438,8 @@ def concatenate_core_genome_alignments(core_names, output_dir, hc_threshold): def generate_core_genome_alignment( - G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold + G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold, + subset=None ): # Make a folder for the output alignments TODO: decide whether or not to keep these try: @@ -439,10 +447,11 @@ def generate_core_genome_alignment( except FileExistsError: None # Get core nodes - core_genes = get_core_gene_nodes(G, threshold, num_isolates) + core_genes = get_core_gene_nodes(G, threshold, num_isolates, subset) if len(core_genes) < 1: print("No gene clusters were present above the core frequency" " threshold! Try adjusting the '--core_threshold' parameter") + return core_gene_names = [G.nodes[x]["name"] for x in core_genes] diff --git a/panaroo/merge_graphs.py b/panaroo/merge_graphs.py index e54e9b4..70fc7ab 100644 --- a/panaroo/merge_graphs.py +++ b/panaroo/merge_graphs.py @@ -264,6 +264,7 @@ def merge_graphs(directories, alr, core, hc_threshold, + subset=None, merge_single=False, depths=[1,2,3], n_cpu=1, @@ -431,7 +432,7 @@ def merge_graphs(directories, if not quiet: print("generating core genome MSAs...") generate_core_genome_alignment(G, temp_dir, output_dir, n_cpu, alr, isolate_names, core, len(isolate_names), - hc_threshold) + hc_thresholdi, subset) return @@ -533,6 +534,11 @@ def get_options(): help="Core-genome sample threshold (default=0.95)", type=float, default=0.95) + core.add_argument("--core_subset", + dest="subset", + help="Subset the core genome to these many random genes (default=all)", + type=int, + default=None) core.add_argument("--core_entropy_filter", dest="hc_threshold", help=("Manually set the Block Mapping and Gathering with " + diff --git a/panaroo/post_run_alignment_gen.py b/panaroo/post_run_alignment_gen.py index 1b42f4b..c8450cc 100755 --- a/panaroo/post_run_alignment_gen.py +++ b/panaroo/post_run_alignment_gen.py @@ -55,6 +55,11 @@ def get_options(): help="Core-genome sample threshold (default=0.95)", type=float, default=0.95) + core.add_argument("--core_subset", + dest="subset", + help="Subset the core genome to these many random genes (default=all)", + type=int, + default=None) core.add_argument("--core_entropy_filter", dest="hc_threshold", help=("Manually set the Block Mapping and Gathering with " + @@ -114,7 +119,7 @@ def main(): generate_core_genome_alignment(G, temp_dir, args.output_dir, args.n_cpu, args.alr, isolate_names, args.core, args.codons, len(isolate_names), - args.hc_threshold) + args.hc_threshold, args.subset) # remove temporary directory shutil.rmtree(temp_dir)