diff --git a/README.md b/README.md index d478be9..134a9fd 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,7 @@ Options: -o, --outpath PATH write processed seqs to this filename [required] -np, --numprocs INTEGER number of processes [default: 1] -F, --force_overwrite Overwrite existing file if it exists - -m, --moltype [dna|rna] Molecular type of sequences, defaults to DNA - [default: dna] + -m, --moltype [dna|rna] Molecular type of sequences [default: dna] -L, --limit INTEGER number of sequences to process -hp, --hide_progress hide progress bars --help Show this message and exit. @@ -75,7 +74,7 @@ Usage: dvs nmost [OPTIONS] Identify n seqs that maximise average delta JSD Options: - -s, --seqfile PATH path to .dvtgseqs file [required] + -s, --seqfile PATH path to .dvseqs file [required] -o, --outpath PATH the input string will be cast to Path instance -n, --number INTEGER number of seqs in divergent set [required] -k INTEGER k-mer size [default: 6] @@ -150,11 +149,11 @@ named sequences are added to the final result. Input type ---------- -SequenceCollection, ArrayAlignment, Alignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- -SequenceCollection, ArrayAlignment, Alignment +ArrayAlignment, SequenceCollection, Alignment ``` @@ -188,7 +187,7 @@ Usage: dvs max [OPTIONS] Identify the seqs that maximise average delta JSD Options: - -s, --seqfile PATH path to .dvtgseqs file [required] + -s, --seqfile PATH path to .dvseqs file [required] -o, --outpath PATH the input string will be cast to Path instance -z, --min_size INTEGER minimum size of divergent set [default: 7] -zp, --max_size INTEGER maximum size of divergent set @@ -273,12 +272,222 @@ named sequences are added to the final result. 
Input type ---------- -SequenceCollection, ArrayAlignment, Alignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- -SequenceCollection, ArrayAlignment, Alignment +ArrayAlignment, SequenceCollection, Alignment ``` - \ No newline at end of file + + +### `dvs ctree`: build a phylogeny using k-mers + +The result of the `ctree` command is a Newick-formatted tree string without distances. + +> **Note** +> A fuller explanation is coming soon! + +
+ Options for command line dvs ctree + + +``` +Usage: dvs ctree [OPTIONS] + + Quickly compute a cluster tree based on kmers for a collection of sequences. + +Options: + -s, --seqfile PATH path to .dvseqs file [required] + -o, --outpath PATH the input string will be cast to Path instance + -m, --moltype [dna|rna] Molecular type of sequences [default: dna] + -k INTEGER k-mer size [default: 6] + --sketch-size INTEGER sketch size for mash distance + -d, --distance [mash|euclidean] + distance measure for tree construction + [default: mash] + -c, --canonical-kmers consider kmers identical to their reverse + complement + -L, --limit INTEGER number of sequences to process + -np, --numprocs INTEGER number of processes [default: 1] + -hp, --hide_progress hide progress bars + --help Show this message and exit. + +``` + + +
+ +
+ Options for cogent3 app dvs_ctree + +The `dvs ctree` command is also available as the [cogent3 app](https://cogent3.org/doc/app/index.html) `dvs_ctree` or `dvs_par_ctree`. The latter is not composable, but can run the analysis for a single collection in parallel. + + +``` +Overview +-------- +Create a cluster tree from kmer distances. + +Options for making the app +-------------------------- +dvs_ctree_app = get_app( + 'dvs_ctree', + k=12, + sketch_size=3000, + moltype='dna', + distance_mode='mash', + mash_canonical_kmers=None, + show_progress=False, +) + +Initialise parameters for generating a kmer cluster tree. + +Parameters +---------- +k + kmer size +sketch_size + size of sketches, only applies to mash distance +moltype + seq collection molecular type +distance_mode + mash distance or euclidean distance between kmer freqs +mash_canonical_kmers + whether to use mash canonical kmers for mash distance +show_progress + whether to show progress bars + +Notes +----- +This app is composable. + +If mash_canonical_kmers is enabled when using the mash distance, +kmers are considered identical to their reverse complement. + +References +---------- +.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B., + Bergman, N. H., Koren, S., & Phillippy, A. M. (2016). + Mash: fast genome and metagenome distance estimation using MinHash. + Genome biology, 17, 1-14. + +Input type +---------- +ArrayAlignment, SequenceCollection, Alignment + +Output type +----------- +PhyloNode + +``` + + + + +``` +Overview +-------- +Create a cluster tree from kmer distances in parallel. + +Options for making the app +-------------------------- +dvs_par_ctree_app = get_app( + 'dvs_par_ctree', + k=12, + sketch_size=3000, + moltype='dna', + distance_mode='mash', + mash_canonical_kmers=None, + show_progress=False, + max_workers=None, + parallel=True, +) + +Initialise parameters for generating a kmer cluster tree. 
+ +Parameters +---------- +k + kmer size +sketch_size + size of sketches, only applies to mash distance +moltype + seq collection molecular type +distance_mode + mash distance or euclidean distance between kmer freqs +mash_canonical_kmers + whether to use mash canonical kmers for mash distance +show_progress + whether to show progress bars +max_workers + number of workers, defaults to running serial + +Notes +----- +This app is not composable but can run in parallel. It is +best suited to a single large sequence collection. + +If mash_canonical_kmers is enabled when using the mash distance, +kmers are considered identical to their reverse complement. + +References +---------- +.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B., + Bergman, N. H., Koren, S., & Phillippy, A. M. (2016). + Mash: fast genome and metagenome distance estimation using MinHash. + Genome biology, 17, 1-14. + +Input type +---------- +ArrayAlignment, SequenceCollection, Alignment + +Output type +----------- +PhyloNode + +``` + + +
diff --git a/src/diverse_seq/cli.py b/src/diverse_seq/cli.py index a596eb5..95ad757 100644 --- a/src/diverse_seq/cli.py +++ b/src/diverse_seq/cli.py @@ -457,7 +457,6 @@ def ctree( hide_progress: bool, ): """Quickly compute a cluster tree based on kmers for a collection of sequences.""" - if seqfile.suffix != ".dvseqs": dvs_util.print_colour( "Sequence data needs to be preprocessed, use 'dvs prep'", @@ -479,7 +478,7 @@ def ctree( ) sys.exit(1) - seqids = dvs_data_store.get_seqids_from_store(seqfile)[:limit] + seqids = dvs_data_store.get_seqids_from_store(seqfile) if limit is not None: seqids = seqids[:limit] @@ -496,7 +495,11 @@ def ctree( show_progress=not hide_progress, ) tree = app(seqids) # pylint: disable=not-callable - tree.write(outpath) + if not tree: + dvs_util.print_colour(tree, "red") + sys.exit(1) + + tree.write(outpath) if __name__ == "__main__": diff --git a/src/diverse_seq/cluster.py b/src/diverse_seq/cluster.py index 8111b04..8140e0d 100644 --- a/src/diverse_seq/cluster.py +++ b/src/diverse_seq/cluster.py @@ -132,6 +132,8 @@ def __init__( Notes ----- + This app is composable. + If mash_canonical_kmers is enabled when using the mash distance, kmers are considered identical to their reverse complement. @@ -233,14 +235,14 @@ def make_cluster_tree( tree_dict.pop(right_index), ) node_index += 1 - - tree = make_tree(str(tree_dict[node_index - 1])) - + # use string representation and then remove quotes + treestring = str(tree_dict[node_index - 1]).replace("'", "") + tree = make_tree(treestring=treestring, underscore_unmunge=True) progress.update(tree_task, completed=1, total=1) - return tree + class DvsParCtreeMixin: def _mash_dist(self, seq_arrays: Sequence[SeqArray]) -> numpy.ndarray: """Calculates pairwise mash distances between sequences in parallel. 
diff --git a/src/diverse_seq/distance.py b/src/diverse_seq/distance.py index 42fb451..7d8c5d7 100644 --- a/src/diverse_seq/distance.py +++ b/src/diverse_seq/distance.py @@ -159,7 +159,6 @@ def mash_distances( numpy.ndarray Pairwise mash distances between sequences. """ - if progress is None: progress = Progress(disable=True) diff --git a/tests/test_cli.py b/tests/test_cli.py index 66b1987..cdf04c5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -253,9 +253,7 @@ def test_ctree( ): outpath = tmp_dir / "out.tre" - args = ( - f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers}" - ) + args = f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers} -hp" if sketch_size is not None: args += f" --sketch-size {sketch_size}" args = args.split()