Skip to content

Commit

Permalink
Merge pull request #260 from microbial-pangenomes-lab/master
Browse files Browse the repository at this point in the history
Only use a random subset of genes for generating core aln
  • Loading branch information
gtonkinhill authored Dec 20, 2023
2 parents 2a65d2b + ee655b5 commit 647443a
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 6 deletions.
7 changes: 6 additions & 1 deletion panaroo/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,11 @@ def get_options(args):
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -512,7 +517,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(args.input_files),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down
15 changes: 12 additions & 3 deletions panaroo/generate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Bio.SeqRecord import SeqRecord
import itertools as iter
from tqdm import tqdm
import random

from .generate_alignments import *

Expand Down Expand Up @@ -321,12 +322,18 @@ def generate_pan_genome_alignment(G, temp_dir, output_dir, threads, aligner,
return


def get_core_gene_nodes(G, threshold, num_isolates):
def get_core_gene_nodes(G, threshold, num_isolates, subset=None):
# Get the core genes based on percent threshold
core_nodes = []
for node in G.nodes():
if float(G.nodes[node]["size"]) / float(num_isolates) >= threshold:
core_nodes.append(node)
if subset is not None:
if subset > len(core_nodes):
raise RuntimeError(f"Cannot subset core genes to {subset}, "
f"only {len(core_nodes)} are available!")
random.shuffle(core_nodes)
core_nodes = core_nodes[:subset]
return core_nodes

def update_col_counts(col_counts, s):
Expand Down Expand Up @@ -431,18 +438,20 @@ def concatenate_core_genome_alignments(core_names, output_dir, hc_threshold):


def generate_core_genome_alignment(
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold,
subset=None
):
# Make a folder for the output alignments TODO: decide whether or not to keep these
try:
os.mkdir(output_dir + "aligned_gene_sequences")
except FileExistsError:
None
# Get core nodes
core_genes = get_core_gene_nodes(G, threshold, num_isolates)
core_genes = get_core_gene_nodes(G, threshold, num_isolates, subset)
if len(core_genes) < 1:
print("No gene clusters were present above the core frequency"
" threshold! Try adjusting the '--core_threshold' parameter")
return

core_gene_names = [G.nodes[x]["name"] for x in core_genes]

Expand Down
8 changes: 7 additions & 1 deletion panaroo/merge_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def merge_graphs(directories,
alr,
core,
hc_threshold,
subset=None,
merge_single=False,
depths=[1,2,3],
n_cpu=1,
Expand Down Expand Up @@ -431,7 +432,7 @@ def merge_graphs(directories,
if not quiet: print("generating core genome MSAs...")
generate_core_genome_alignment(G, temp_dir, output_dir, n_cpu, alr,
isolate_names, core, len(isolate_names),
hc_threshold)
hc_thresholdi, subset)
return


Expand Down Expand Up @@ -533,6 +534,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down
7 changes: 6 additions & 1 deletion panaroo/post_run_alignment_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -114,7 +119,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(isolate_names),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down

0 comments on commit 647443a

Please sign in to comment.