From 1335266e0662a844fe823bc6a8847c67afef5be6 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 26 Aug 2024 08:38:13 +1000 Subject: [PATCH 1/2] API: changed default k size to 6 --- src/divergent/cli.py | 2 +- src/divergent/records.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/divergent/cli.py b/src/divergent/cli.py index 306c0c9..8868aac 100644 --- a/src/divergent/cli.py +++ b/src/divergent/cli.py @@ -86,7 +86,7 @@ def main(): type=Path, help="path to .dvtgseqs file", ) -_k = click.option("-k", type=int, default=3, help="k-mer size") +_k = click.option("-k", type=int, default=6, help="k-mer size") @main.command(**_click_command_opts) diff --git a/src/divergent/records.py b/src/divergent/records.py index 410ca59..3baf09c 100644 --- a/src/divergent/records.py +++ b/src/divergent/records.py @@ -636,7 +636,7 @@ def __init__( stat: str = "stdev", moltype: str = "dna", include: list[str] | str | None = None, - k: int = 4, + k: int = 6, seed: int | None = None, ) -> None: """ @@ -651,7 +651,7 @@ def __init__( moltype molecular type of the sequences include - sequence names to include + sequence names to include in the final result k k-mer size seed @@ -660,9 +660,8 @@ def __init__( Notes ----- If called with an alignment, the ungapped sequences are used. - Sequence order of input is randomised. If include is not None, the - named sequences are added to the collection before selecting the - divergent set. + The order of the sequences is randomised. If include is not None, the + named sequences are added to the final result. 
""" self._s2k = seq_to_seqarray(moltype=moltype) + seqarray_to_kmerseq( k=k, @@ -700,7 +699,7 @@ def __init__( n: int = 3, moltype: str = "dna", include: list[str] | str | None = None, - k: int = 4, + k: int = 6, seed: int | None = None, ) -> None: """ @@ -713,14 +712,15 @@ def __init__( k k-mer size include - sequence names to include + sequence names to include in the final result seed random number seed Notes ----- If called with an alignment, the ungapped sequences are used. - Sequence order of input is randomised. + The order of the sequences is randomised. If include is not None, the + named sequences are added to the final result. """ self._s2k = seq_to_seqarray(moltype=moltype) + seqarray_to_kmerseq( k=k, From d0c2e0d8636a9af9c09e968c136641aac454f308 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 26 Aug 2024 08:38:46 +1000 Subject: [PATCH 2/2] DOC: expand readme to include result of app_help() --- README.md | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 184 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 15aa34e..b88bb26 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Commands: ### `dvgt prep`: Preparing the sequence data -The sequences need to be processed before running the `max` command. This is done with the `prep` command. +Convert sequence data into a more efficient format for the diversity assessment. This must be done before running either the `nmost` or `max` commands. 
#### Usage: @@ -62,7 +62,7 @@ Options: -s, --seqdir PATH directory containing sequence files [required] -sf, --suffix TEXT sequence file suffix [default: fa] -o, --outpath PATH location to write processed seqs [required] - -p, --parallel run in parallel + -np, --numprocs INTEGER number of processes [default: 1] -F, --force_overwrite Overwrite existing file if it exists -m, --moltype [dna|rna] Molecular type of sequences, defaults to DNA [default: dna] @@ -72,11 +72,120 @@ Options: ``` +### `dvgt nmost`: Select the n-most divergent sequences + +We recommend using `nmost` for large datasets. + +> **Note** +> A fuller explanation is coming soon! + +#### Command line usage: + + +``` +Usage: dvgt nmost [OPTIONS] + + Identify n seqs that maximise average delta JSD + +Options: + -s, --seqfile PATH path to .dvtgseqs file [required] + -o, --outpath PATH the input string will be cast to Path instance + -n, --number INTEGER number of seqs in divergent set [required] + -k INTEGER k-mer size [default: 6] + -i, --include TEXT seqnames to include in divergent set + -np, --numprocs INTEGER number of processes [default: 1] + -L, --limit INTEGER number of sequences to process + -v, --verbose is an integer indicating number of cl occurrences + [default: 0] + --help Show this message and exit. + +``` + + +#### As a cogent3 plugin: + +The `dvgt_select_nmost` is also available as a [cogent3 app](https://cogent3.org/doc/app/index.html). The result of using `cogent3.app_help("dvgt_select_nmost")` is shown below. 
+ + ``` +Overview +-------- +selects the n-most divergent seqs from a sequence collection + +Options for making the app +-------------------------- +dvgt_select_nmost_app = get_app( + 'dvgt_select_nmost', + n=3, + moltype='dna', + include=None, + k=6, + seed=None, +) + +Parameters +---------- +n + the number of divergent sequences +moltype + molecular type of the sequences +k + k-mer size +include + sequence names to include in the final result +seed + random number seed + +Notes +----- +If called with an alignment, the ungapped sequences are used. +The order of the sequences is randomised. If include is not None, the +named sequences are added to the final result. + +Input type +---------- +Alignment, SequenceCollection, ArrayAlignment + +Output type +----------- +Alignment, SequenceCollection, ArrayAlignment + +``` + + ### `dvgt max`: Maximise average delta JSD -Once the sequence data has been prepared using `dvgt prep`, the `max` command can be used to identify the sequences that maximise the Jensen-Shannon divergence. The kmer frequencies of the sequences are used to determine the Jensen-Shannon divergence +The result of the `max` command is typically a set that is modestly more divergent than that from `nmost`. -#### Usage: +> **Note** +> A fuller explanation is coming soon! + +#### Command line usage: -## Running the tests +#### As a cogent3 plugin: + +The `dvgt_select_max` is also available as a [cogent3 app](https://cogent3.org/doc/app/index.html). The result of using `cogent3.app_help("dvgt_select_max")` is shown below. 
+ + ``` -$ pytest -n auto -``` +Overview +-------- +selects the maximally divergent seqs from a sequence collection + +Options for making the app +-------------------------- +dvgt_select_max_app = get_app( + 'dvgt_select_max', + min_size=3, + max_size=10, + stat='stdev', + moltype='dna', + include=None, + k=6, + seed=None, +) + +Parameters +---------- +min_size + minimum size of the divergent set +max_size + the maximum size of the divergent set +stat + statistic for maximising the set, either mean_delta_jsd, mean_jsd, total_jsd +moltype + molecular type of the sequences +include + sequence names to include in the final result +k + k-mer size +seed + random number seed -This runs in parallel, greatly speeding things up. +Notes +----- +If called with an alignment, the ungapped sequences are used. +The order of the sequences is randomised. If include is not None, the +named sequences are added to the final result. +Input type +---------- +Alignment, SequenceCollection, ArrayAlignment + +Output type +----------- +Alignment, SequenceCollection, ArrayAlignment + +``` +