Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Only use a random subset of genes for generating core aln #260

Merged
merged 3 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion panaroo/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,11 @@ def get_options(args):
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -512,7 +517,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(args.input_files),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down
19 changes: 16 additions & 3 deletions panaroo/generate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Bio.SeqRecord import SeqRecord
import itertools as iter
from tqdm import tqdm
import random

from .generate_alignments import *

Expand Down Expand Up @@ -321,12 +322,18 @@ def generate_pan_genome_alignment(G, temp_dir, output_dir, threads, aligner,
return


def get_core_gene_nodes(G, threshold, num_isolates):
def get_core_gene_nodes(G, threshold, num_isolates, subset=None):
# Get the core genes based on percent threshold
core_nodes = []
for node in G.nodes():
if float(G.nodes[node]["size"]) / float(num_isolates) >= threshold:
core_nodes.append(node)
if subset is not None:
if subset > len(core_nodes):
raise RuntimeError(f"Cannot subset core genes to {subset}, "
f"only {len(core_nodes)} are available!")
random.shuffle(core_nodes)
core_nodes = core_nodes[:subset]
return core_nodes

def update_col_counts(col_counts, s):
Expand Down Expand Up @@ -431,15 +438,21 @@ def concatenate_core_genome_alignments(core_names, output_dir, hc_threshold):


def generate_core_genome_alignment(
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold
G, temp_dir, output_dir, threads, aligner, isolates, threshold, codons, num_isolates, hc_threshold,
subset=None
):
# Make a folder for the output alignments TODO: decide whether or not to keep these
try:
os.mkdir(output_dir + "aligned_gene_sequences")
except FileExistsError:
None
# Get core nodes
core_genes = get_core_gene_nodes(G, threshold, num_isolates)
core_genes = get_core_gene_nodes(G, threshold, num_isolates, subset)
if len(core_genes) < 1:
print("No gene clusters were present above the core frequency"
" threshold! Try adjusting the '--core_threshold' parameter")
return

core_gene_names = [G.nodes[x]["name"] for x in core_genes]

if codons == True:
Expand Down
8 changes: 7 additions & 1 deletion panaroo/merge_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def merge_graphs(directories,
alr,
core,
hc_threshold,
subset=None,
merge_single=False,
depths=[1,2,3],
n_cpu=1,
Expand Down Expand Up @@ -421,7 +422,7 @@ def merge_graphs(directories,
if not quiet: print("generating core genome MSAs...")
generate_core_genome_alignment(G, temp_dir, output_dir, n_cpu, alr,
isolate_names, core, len(isolate_names),
hc_threshold)
hc_thresholdi, subset)
return


Expand Down Expand Up @@ -523,6 +524,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down
7 changes: 6 additions & 1 deletion panaroo/post_run_alignment_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def get_options():
help="Core-genome sample threshold (default=0.95)",
type=float,
default=0.95)
core.add_argument("--core_subset",
dest="subset",
help="Subset the core genome to these many random genes (default=all)",
type=int,
default=None)
core.add_argument("--core_entropy_filter",
dest="hc_threshold",
help=("Manually set the Block Mapping and Gathering with " +
Expand Down Expand Up @@ -114,7 +119,7 @@ def main():
generate_core_genome_alignment(G, temp_dir, args.output_dir,
args.n_cpu, args.alr, isolate_names,
args.core, args.codons, len(isolate_names),
args.hc_threshold)
args.hc_threshold, args.subset)

# remove temporary directory
shutil.rmtree(temp_dir)
Expand Down
Loading