diff --git a/README.md b/README.md
index d478be9..134a9fd 100644
--- a/README.md
+++ b/README.md
@@ -37,8 +37,7 @@ Options:
-o, --outpath PATH write processed seqs to this filename [required]
-np, --numprocs INTEGER number of processes [default: 1]
-F, --force_overwrite Overwrite existing file if it exists
- -m, --moltype [dna|rna] Molecular type of sequences, defaults to DNA
- [default: dna]
+ -m, --moltype [dna|rna] Molecular type of sequences [default: dna]
-L, --limit INTEGER number of sequences to process
-hp, --hide_progress hide progress bars
--help Show this message and exit.
@@ -75,7 +74,7 @@ Usage: dvs nmost [OPTIONS]
Identify n seqs that maximise average delta JSD
Options:
- -s, --seqfile PATH path to .dvtgseqs file [required]
+ -s, --seqfile PATH path to .dvseqs file [required]
-o, --outpath PATH the input string will be cast to Path instance
-n, --number INTEGER number of seqs in divergent set [required]
-k INTEGER k-mer size [default: 6]
@@ -150,11 +149,11 @@ named sequences are added to the final result.
Input type
----------
-SequenceCollection, ArrayAlignment, Alignment
+ArrayAlignment, SequenceCollection, Alignment
Output type
-----------
-SequenceCollection, ArrayAlignment, Alignment
+ArrayAlignment, SequenceCollection, Alignment
```
@@ -188,7 +187,7 @@ Usage: dvs max [OPTIONS]
Identify the seqs that maximise average delta JSD
Options:
- -s, --seqfile PATH path to .dvtgseqs file [required]
+ -s, --seqfile PATH path to .dvseqs file [required]
-o, --outpath PATH the input string will be cast to Path instance
-z, --min_size INTEGER minimum size of divergent set [default: 7]
-zp, --max_size INTEGER maximum size of divergent set
@@ -273,12 +272,222 @@ named sequences are added to the final result.
Input type
----------
-SequenceCollection, ArrayAlignment, Alignment
+ArrayAlignment, SequenceCollection, Alignment
Output type
-----------
-SequenceCollection, ArrayAlignment, Alignment
+ArrayAlignment, SequenceCollection, Alignment
```
-
\ No newline at end of file
+
+
+### `dvs ctree`: build a phylogeny using k-mers
+
+The result of the `ctree` command is a Newick-formatted tree string without distances.
+
+> **Note**
+> A fuller explanation is coming soon!
+
+
+ Options for command line dvs ctree
+
+
+```
+Usage: dvs ctree [OPTIONS]
+
+ Quickly compute a cluster tree based on kmers for a collection of sequences.
+
+Options:
+ -s, --seqfile PATH path to .dvseqs file [required]
+ -o, --outpath PATH the input string will be cast to Path instance
+ -m, --moltype [dna|rna] Molecular type of sequences [default: dna]
+ -k INTEGER k-mer size [default: 6]
+ --sketch-size INTEGER sketch size for mash distance
+ -d, --distance [mash|euclidean]
+ distance measure for tree construction
+ [default: mash]
+ -c, --canonical-kmers consider kmers identical to their reverse
+ complement
+ -L, --limit INTEGER number of sequences to process
+ -np, --numprocs INTEGER number of processes [default: 1]
+ -hp, --hide_progress hide progress bars
+ --help Show this message and exit.
+
+```
+
+
+
+
+
+ Options for cogent3 app dvs_ctree
+
+The `dvs ctree` command is also available as the [cogent3 apps](https://cogent3.org/doc/app/index.html) `dvs_ctree` and `dvs_par_ctree`. The latter is not composable, but can run the analysis for a single collection in parallel.
+
+
+```
+Overview
+--------
+Create a cluster tree from kmer distances.
+
+Options for making the app
+--------------------------
+dvs_ctree_app = get_app(
+ 'dvs_ctree',
+ k=12,
+ sketch_size=3000,
+ moltype='dna',
+ distance_mode='mash',
+ mash_canonical_kmers=None,
+ show_progress=False,
+)
+
+Initialise parameters for generating a kmer cluster tree.
+
+Parameters
+----------
+k
+ kmer size
+sketch_size
+ size of sketches, only applies to mash distance
+moltype
+ seq collection molecular type
+distance_mode
+ mash distance or euclidean distance between kmer freqs
+mash_canonical_kmers
+ whether to use mash canonical kmers for mash distance
+show_progress
+ whether to show progress bars
+
+Notes
+-----
+This app is composable.
+
+If mash_canonical_kmers is enabled when using the mash distance,
+kmers are considered identical to their reverse complement.
+
+References
+----------
+.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B.,
+ Bergman, N. H., Koren, S., & Phillippy, A. M. (2016).
+ Mash: fast genome and metagenome distance estimation using MinHash.
+ Genome biology, 17, 1-14.
+
+Input type
+----------
+ArrayAlignment, SequenceCollection, Alignment
+
+Output type
+-----------
+PhyloNode
+
+```
+
+
+
+
+```
+Overview
+--------
+Create a cluster tree from kmer distances in parallel.
+
+Options for making the app
+--------------------------
+dvs_par_ctree_app = get_app(
+ 'dvs_par_ctree',
+ k=12,
+ sketch_size=3000,
+ moltype='dna',
+ distance_mode='mash',
+ mash_canonical_kmers=None,
+ show_progress=False,
+ max_workers=None,
+ parallel=True,
+)
+
+Initialise parameters for generating a kmer cluster tree.
+
+Parameters
+----------
+k
+ kmer size
+sketch_size
+ size of sketches, only applies to mash distance
+moltype
+ seq collection molecular type
+distance_mode
+ mash distance or euclidean distance between kmer freqs
+mash_canonical_kmers
+ whether to use mash canonical kmers for mash distance
+show_progress
+ whether to show progress bars
+max_workers
+ number of workers, defaults to running serial
+
+Notes
+-----
+This app is not composable but can run in parallel. It is
+best suited to a single large sequence collection.
+
+If mash_canonical_kmers is enabled when using the mash distance,
+kmers are considered identical to their reverse complement.
+
+References
+----------
+.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B.,
+ Bergman, N. H., Koren, S., & Phillippy, A. M. (2016).
+ Mash: fast genome and metagenome distance estimation using MinHash.
+ Genome biology, 17, 1-14.
+
+Input type
+----------
+ArrayAlignment, SequenceCollection, Alignment
+
+Output type
+-----------
+PhyloNode
+
+```
+
+
+
diff --git a/src/diverse_seq/cli.py b/src/diverse_seq/cli.py
index a596eb5..95ad757 100644
--- a/src/diverse_seq/cli.py
+++ b/src/diverse_seq/cli.py
@@ -457,7 +457,6 @@ def ctree(
hide_progress: bool,
):
"""Quickly compute a cluster tree based on kmers for a collection of sequences."""
-
if seqfile.suffix != ".dvseqs":
dvs_util.print_colour(
"Sequence data needs to be preprocessed, use 'dvs prep'",
@@ -479,7 +478,7 @@ def ctree(
)
sys.exit(1)
- seqids = dvs_data_store.get_seqids_from_store(seqfile)[:limit]
+ seqids = dvs_data_store.get_seqids_from_store(seqfile)
if limit is not None:
seqids = seqids[:limit]
@@ -496,7 +495,11 @@ def ctree(
show_progress=not hide_progress,
)
tree = app(seqids) # pylint: disable=not-callable
- tree.write(outpath)
+ if not tree:
+ dvs_util.print_colour(tree, "red")
+ sys.exit(1)
+
+ tree.write(outpath)
if __name__ == "__main__":
diff --git a/src/diverse_seq/cluster.py b/src/diverse_seq/cluster.py
index 8111b04..8140e0d 100644
--- a/src/diverse_seq/cluster.py
+++ b/src/diverse_seq/cluster.py
@@ -132,6 +132,8 @@ def __init__(
Notes
-----
+ This app is composable.
+
If mash_canonical_kmers is enabled when using the mash distance,
kmers are considered identical to their reverse complement.
@@ -233,14 +235,14 @@ def make_cluster_tree(
tree_dict.pop(right_index),
)
node_index += 1
-
- tree = make_tree(str(tree_dict[node_index - 1]))
-
+ # use string representation and then remove quotes
+ treestring = str(tree_dict[node_index - 1]).replace("'", "")
+ tree = make_tree(treestring=treestring, underscore_unmunge=True)
progress.update(tree_task, completed=1, total=1)
-
return tree
+
class DvsParCtreeMixin:
def _mash_dist(self, seq_arrays: Sequence[SeqArray]) -> numpy.ndarray:
"""Calculates pairwise mash distances between sequences in parallel.
diff --git a/src/diverse_seq/distance.py b/src/diverse_seq/distance.py
index 42fb451..7d8c5d7 100644
--- a/src/diverse_seq/distance.py
+++ b/src/diverse_seq/distance.py
@@ -159,7 +159,6 @@ def mash_distances(
numpy.ndarray
Pairwise mash distances between sequences.
"""
-
if progress is None:
progress = Progress(disable=True)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 66b1987..cdf04c5 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -253,9 +253,7 @@ def test_ctree(
):
outpath = tmp_dir / "out.tre"
- args = (
- f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers}"
- )
+ args = f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers} -hp"
if sketch_size is not None:
args += f" --sketch-size {sketch_size}"
args = args.split()