From b1c4187fceca2b87fd308944bbd969ebe046adb2 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 7 Nov 2024 12:53:44 +1100 Subject: [PATCH 1/5] ENH: catch case when ctree app returns NotCompleted --- src/diverse_seq/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diverse_seq/cli.py b/src/diverse_seq/cli.py index fe4773f..9d041e7 100644 --- a/src/diverse_seq/cli.py +++ b/src/diverse_seq/cli.py @@ -457,7 +457,6 @@ def ctree( hide_progress: bool, ): """Quickly compute a cluster tree based on kmers for a collection of sequences.""" - if seqfile.suffix != ".dvseqs": dvs_util.print_colour( "Sequence data needs to be preprocessed, use 'dvs prep'", @@ -492,7 +491,6 @@ def ctree( seqs[name] = arr2str_app(record.read()) # pylint: disable=not-callable seqs = make_unaligned_seqs(seqs, moltype=moltype) - app = dvs_cluster.dvs_par_ctree( k=k, sketch_size=sketch_size, @@ -504,6 +502,9 @@ def ctree( show_progress=not hide_progress, ) tree = app(seqs) # pylint: disable=not-callable + if not tree: + dvs_util.print_colour(tree, "red") + sys.exit(1) tree.write(outpath) From ce60f0a96dfa7a745c7d5aa872e6fffff2da0c0c Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 7 Nov 2024 12:54:15 +1100 Subject: [PATCH 2/5] MAINT: turn off ctree progress display in test --- tests/test_cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 66b1987..cdf04c5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -253,9 +253,7 @@ def test_ctree( ): outpath = tmp_dir / "out.tre" - args = ( - f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers}" - ) + args = f"-s {processed_seq_path} -o {outpath} -d {distance} -k {k} -np {max_workers} -hp" if sketch_size is not None: args += f" --sketch-size {sketch_size}" args = args.split() From 1c9671ef57399b94211ed02e482dbfef48de8810 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 7 Nov 2024 12:54:42 +1100 Subject: [PATCH 
3/5] ENH: remove quotes from ctree produced newick --- src/diverse_seq/cluster.py | 8 +++----- src/diverse_seq/distance.py | 1 - 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/diverse_seq/cluster.py b/src/diverse_seq/cluster.py index e8db8c2..6b0f627 100644 --- a/src/diverse_seq/cluster.py +++ b/src/diverse_seq/cluster.py @@ -231,11 +231,10 @@ def make_cluster_tree( tree_dict.pop(right_index), ) node_index += 1 - - tree = make_tree(str(tree_dict[node_index - 1])) - + # use string representation and then remove quotes + treestring = str(tree_dict[node_index - 1]).replace("'", "") + tree = make_tree(treestring=treestring, underscore_unmunge=True) progress.update(tree_task, completed=1, total=1) - return tree @@ -297,7 +296,6 @@ def __init__( mash_canonical_kmers=mash_canonical_kmers, show_progress=show_progress, ) - if parallel: if max_workers is None: max_workers = multiprocessing.cpu_count() diff --git a/src/diverse_seq/distance.py b/src/diverse_seq/distance.py index c08411b..7a0a40b 100644 --- a/src/diverse_seq/distance.py +++ b/src/diverse_seq/distance.py @@ -159,7 +159,6 @@ def mash_distances( numpy.ndarray Pairwise mash distances between sequences. 
""" - if progress is None: progress = Progress(disable=True) From ae1871ff250dfd08e33eeb868e50d14856f3858d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 7 Nov 2024 13:12:38 +1100 Subject: [PATCH 4/5] DOC: updated the README to include ctree --- README.md | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 216 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d478be9..5cc74e7 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,7 @@ Options: -o, --outpath PATH write processed seqs to this filename [required] -np, --numprocs INTEGER number of processes [default: 1] -F, --force_overwrite Overwrite existing file if it exists - -m, --moltype [dna|rna] Molecular type of sequences, defaults to DNA - [default: dna] + -m, --moltype [dna|rna] Molecular type of sequences [default: dna] -L, --limit INTEGER number of sequences to process -hp, --hide_progress hide progress bars --help Show this message and exit. @@ -75,7 +74,7 @@ Usage: dvs nmost [OPTIONS] Identify n seqs that maximise average delta JSD Options: - -s, --seqfile PATH path to .dvtgseqs file [required] + -s, --seqfile PATH path to .dvseqs file [required] -o, --outpath PATH the input string will be cast to Path instance -n, --number INTEGER number of seqs in divergent set [required] -k INTEGER k-mer size [default: 6] @@ -150,11 +149,11 @@ named sequences are added to the final result. 
Input type ---------- -SequenceCollection, ArrayAlignment, Alignment +Alignment, SequenceCollection, ArrayAlignment Output type ----------- -SequenceCollection, ArrayAlignment, Alignment +Alignment, SequenceCollection, ArrayAlignment ``` @@ -188,7 +187,7 @@ Usage: dvs max [OPTIONS] Identify the seqs that maximise average delta JSD Options: - -s, --seqfile PATH path to .dvtgseqs file [required] + -s, --seqfile PATH path to .dvseqs file [required] -o, --outpath PATH the input string will be cast to Path instance -z, --min_size INTEGER minimum size of divergent set [default: 7] -zp, --max_size INTEGER maximum size of divergent set @@ -273,12 +272,220 @@ named sequences are added to the final result. Input type ---------- -SequenceCollection, ArrayAlignment, Alignment +Alignment, SequenceCollection, ArrayAlignment Output type ----------- -SequenceCollection, ArrayAlignment, Alignment +Alignment, SequenceCollection, ArrayAlignment ``` - \ No newline at end of file + + +### `dvs ctree`: build a phylogeny using k-mers + +The result of the `ctree` command is a newick formatted tree string without distances. + +> **Note** +> A fuller explanation is coming soon! + +
+ Options for command line dvs ctree + + +``` +Usage: dvs ctree [OPTIONS] + + Quickly compute a cluster tree based on kmers for a collection of sequences. + +Options: + -s, --seqfile PATH path to .dvseqs file [required] + -o, --outpath PATH the input string will be cast to Path instance + -m, --moltype [dna|rna] Molecular type of sequences [default: dna] + -k INTEGER k-mer size [default: 6] + --sketch-size INTEGER sketch size for mash distance + -d, --distance [mash|euclidean] + distance measure for tree construction + [default: mash] + -c, --canonical-kmers consider kmers identical to their reverse + complement + -L, --limit INTEGER number of sequences to process + -np, --numprocs INTEGER number of processes [default: 1] + -hp, --hide_progress hide progress bars + --help Show this message and exit. + +``` + + +
+ +
+ Options for cogent3 app dvs_ctree + +The `dvs ctree` command is also available as a [cogent3 app](https://cogent3.org/doc/app/index.html), either `dvs_ctree` or `dvs_par_ctree`. The latter is not composable, but can run the analysis for a single collection in parallel. + + +``` +Overview +-------- +Create a cluster tree from kmer distances. + +Options for making the app +-------------------------- +dvs_ctree_app = get_app( + 'dvs_ctree', + k=12, + sketch_size=3000, + moltype='dna', + distance_mode='mash', + mash_canonical_kmers=None, + show_progress=False, +) + +Initialise parameters for generating a kmer cluster tree. + +Parameters +---------- +k + kmer size +sketch_size + size of sketches, only applies to mash distance +moltype + seq collection molecular type +distance_mode + mash distance or euclidean distance between kmer freqs +mash_canonical_kmers + whether to use mash canonical kmers for mash distance +show_progress + whether to show progress bars + +Notes +----- +If mash_canonical_kmers is enabled when using the mash distance, +kmers are considered identical to their reverse complement. + +References +---------- +.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B., + Bergman, N. H., Koren, S., & Phillippy, A. M. (2016). + Mash: fast genome and metagenome distance estimation using MinHash. + Genome biology, 17, 1-14. + +Input type +---------- +Alignment, SequenceCollection, ArrayAlignment + +Output type +----------- +PhyloNode + +``` + + + + +``` +Overview +-------- +Create a cluster tree from kmer distances in parallel. + +Options for making the app +-------------------------- +dvs_par_ctree_app = get_app( + 'dvs_par_ctree', + k=12, + sketch_size=3000, + moltype='dna', + distance_mode='mash', + mash_canonical_kmers=None, + show_progress=False, + max_workers=None, + parallel=True, +) + +Initialise parameters for generating a kmer cluster tree. 
+ +Parameters +---------- +k + kmer size +sketch_size + size of sketches, only applies to mash distance +moltype + seq collection molecular type +distance_mode + mash distance or euclidean distance between kmer freqs +mash_canonical_kmers + whether to use mash canonical kmers for mash distance +show_progress + whether to show progress bars +max_workers + number of workers, defaults to the CPU count when parallel=True + +Notes +----- +This is app is not composable but can run in parallel. It is +best suited to a single large sequence collection. + +If mash_canonical_kmers is enabled when using the mash distance, +kmers are considered identical to their reverse complement. + +References +---------- +.. [1] Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B., + Bergman, N. H., Koren, S., & Phillippy, A. M. (2016). + Mash: fast genome and metagenome distance estimation using MinHash. + Genome biology, 17, 1-14. + +Input type +---------- +Alignment, SequenceCollection, ArrayAlignment + +Output type +----------- +PhyloNode + +``` + + +
From 4b73fa908652adee3cddb04d687f93a5936572b9 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 7 Nov 2024 20:47:09 +1100 Subject: [PATCH 5/5] DOC: more doc updates --- README.md | 16 +++++++++------- src/diverse_seq/cluster.py | 4 +++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5cc74e7..134a9fd 100644 --- a/README.md +++ b/README.md @@ -149,11 +149,11 @@ named sequences are added to the final result. Input type ---------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment ``` @@ -272,11 +272,11 @@ named sequences are added to the final result. Input type ---------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment ``` @@ -386,6 +386,8 @@ show_progress Notes ----- +This app is composable. + If mash_canonical_kmers is enabled when using the mash distance, kmers are considered identical to their reverse complement. @@ -398,7 +400,7 @@ References Input type ---------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- @@ -464,7 +466,7 @@ numprocs Notes ----- -This is app is not composable but can run in parallel. It is +This app is not composable but can run in parallel. It is best suited to a single large sequence collection. 
If mash_canonical_kmers is enabled when using the mash distance, @@ -479,7 +481,7 @@ References Input type ---------- -Alignment, SequenceCollection, ArrayAlignment +ArrayAlignment, SequenceCollection, Alignment Output type ----------- diff --git a/src/diverse_seq/cluster.py b/src/diverse_seq/cluster.py index 6b0f627..0938155 100644 --- a/src/diverse_seq/cluster.py +++ b/src/diverse_seq/cluster.py @@ -130,6 +130,8 @@ def __init__( Notes ----- + This app is composable. + If mash_canonical_kmers is enabled when using the mash distance, kmers are considered identical to their reverse complement. @@ -275,7 +277,7 @@ def __init__( Notes ----- - This is app is not composable but can run in parallel. It is + This app is not composable but can run in parallel. It is best suited to a single large sequence collection. If mash_canonical_kmers is enabled when using the mash distance,