From 1335266e0662a844fe823bc6a8847c67afef5be6 Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Mon, 26 Aug 2024 08:38:13 +1000
Subject: [PATCH 1/2] API: changed default k size to 6

---
 src/divergent/cli.py     |  2 +-
 src/divergent/records.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/divergent/cli.py b/src/divergent/cli.py
index 306c0c9..8868aac 100644
--- a/src/divergent/cli.py
+++ b/src/divergent/cli.py
@@ -86,7 +86,7 @@ def main():
     type=Path,
     help="path to .dvtgseqs file",
 )
-_k = click.option("-k", type=int, default=3, help="k-mer size")
+_k = click.option("-k", type=int, default=6, help="k-mer size")
 
 
 @main.command(**_click_command_opts)
diff --git a/src/divergent/records.py b/src/divergent/records.py
index 410ca59..3baf09c 100644
--- a/src/divergent/records.py
+++ b/src/divergent/records.py
@@ -636,7 +636,7 @@ def __init__(
         stat: str = "stdev",
         moltype: str = "dna",
         include: list[str] | str | None = None,
-        k: int = 4,
+        k: int = 6,
         seed: int | None = None,
     ) -> None:
         """
@@ -651,7 +651,7 @@ def __init__(
         moltype
             molecular type of the sequences
         include
-            sequence names to include
+            sequence names to include in the final result
         k
             k-mer size
         seed
@@ -660,9 +660,8 @@ def __init__(
         Notes
         -----
         If called with an alignment, the ungapped sequences are used.
-        Sequence order of input is randomised. If include is not None, the
-        named sequences are added to the collection before selecting the
-        divergent set.
+        The order of the sequences is randomised. If include is not None, the
+        named sequences are added to the final result.
         """
         self._s2k = seq_to_seqarray(moltype=moltype) + seqarray_to_kmerseq(
             k=k,
@@ -700,7 +699,7 @@ def __init__(
         n: int = 3,
         moltype: str = "dna",
         include: list[str] | str | None = None,
-        k: int = 4,
+        k: int = 6,
         seed: int | None = None,
     ) -> None:
         """
@@ -713,14 +712,15 @@ def __init__(
         k
             k-mer size
         include
-            sequence names to include
+            sequence names to include in the final result
         seed
             random number seed
 
         Notes
         -----
         If called with an alignment, the ungapped sequences are used.
-        Sequence order of input is randomised.
+        The order of the sequences is randomised. If include is not None, the
+        named sequences are added to the final result.
         """
         self._s2k = seq_to_seqarray(moltype=moltype) + seqarray_to_kmerseq(
             k=k,

From d0c2e0d8636a9af9c09e968c136641aac454f308 Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Mon, 26 Aug 2024 08:38:46 +1000
Subject: [PATCH 2/2] DOC: expand readme to include result of app_help()

---
 README.md | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 184 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 15aa34e..b88bb26 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ Commands:
 
 ### `dvgt prep`: Preparing the sequence data
 
-The sequences need to be processed before running the `max` command. This is done with the `prep` command. 
+Convert sequence data into a more efficient format for the diversity assessment. This must be done before running either the `nmost` or `max` commands.
 
 #### Usage:
 
@@ -62,7 +62,7 @@ Options:
   -s, --seqdir PATH        directory containing sequence files  [required]
   -sf, --suffix TEXT       sequence file suffix  [default: fa]
   -o, --outpath PATH       location to write processed seqs  [required]
-  -p, --parallel           run in parallel
+  -np, --numprocs INTEGER  number of processes  [default: 1]
   -F, --force_overwrite    Overwrite existing file if it exists
   -m, --moltype [dna|rna]  Molecular type of sequences, defaults to DNA
                            [default: dna]
@@ -72,11 +72,120 @@ Options:
 ```
 <!-- [[[end]]] -->
 
+### `dvgt nmost`: Select the n-most divergent sequences
+
+We recommend using `nmost` for large datasets.
+
+> **Note**
+> A fuller explanation is coming soon!
+
+#### Command line usage:
+
+<!-- [[[cog
+import cog
+from divergent.cli import main
+from click.testing import CliRunner
+runner = CliRunner()
+result = runner.invoke(main, ["nmost", "--help"])
+help = result.output.replace("Usage: main", "Usage: dvgt")
+cog.out(
+    "```\n{}\n```".format(help)
+)
+]]] -->
+```
+Usage: dvgt nmost [OPTIONS]
+
+  Identify n seqs that maximise average delta JSD
+
+Options:
+  -s, --seqfile PATH       path to .dvtgseqs file  [required]
+  -o, --outpath PATH       the input string will be cast to Path instance
+  -n, --number INTEGER     number of seqs in divergent set  [required]
+  -k INTEGER               k-mer size  [default: 6]
+  -i, --include TEXT       seqnames to include in divergent set
+  -np, --numprocs INTEGER  number of processes  [default: 1]
+  -L, --limit INTEGER      number of sequences to process
+  -v, --verbose            is an integer indicating number of cl occurrences
+                           [default: 0]
+  --help                   Show this message and exit.
+
+```
+<!-- [[[end]]] -->
+
+#### As a cogent3 plugin:
+
+`dvgt_select_nmost` is also available as a [cogent3 app](https://cogent3.org/doc/app/index.html). The result of using `cogent3.app_help("dvgt_select_nmost")` is shown below.
+
+<!-- [[[cog
+import cog
+import contextlib
+import io
+
+
+from cogent3 import app_help
+
+buffer = io.StringIO()
+
+with contextlib.redirect_stdout(buffer):
+  app_help("dvgt_select_nmost")
+cog.out(
+    "```\n{}\n```".format(buffer.getvalue())
+)
+]]] -->
+```
+Overview
+--------
+selects the n-most divergent seqs from a sequence collection
+
+Options for making the app
+--------------------------
+dvgt_select_nmost_app = get_app(
+    'dvgt_select_nmost',
+    n=3,
+    moltype='dna',
+    include=None,
+    k=6,
+    seed=None,
+)
+
+Parameters
+----------
+n
+    the number of divergent sequences
+moltype
+    molecular type of the sequences
+k
+    k-mer size
+include
+    sequence names to include in the final result
+seed
+    random number seed
+
+Notes
+-----
+If called with an alignment, the ungapped sequences are used.
+The order of the sequences is randomised. If include is not None, the
+named sequences are added to the final result.
+
+Input type
+----------
+Alignment, SequenceCollection, ArrayAlignment
+
+Output type
+-----------
+Alignment, SequenceCollection, ArrayAlignment
+
+```
+<!-- [[[end]]] -->
+
 ### `dvgt max`: Maximise average delta JSD
 
-Once the sequence data has been prepared using `dvgt prep`, the `max` command can be used to identify the sequences that maximise the Jensen-Shannon divergence. The kmer frequencies of the sequences are used to determine the Jensen-Shannon divergence
+The result of the `max` command is typically a set that is modestly more divergent than that from `nmost`.
 
-#### Usage:
+> **Note**
+> A fuller explanation is coming soon!
+
+#### Command line usage:
 
 <!-- [[[cog
 import cog
@@ -99,7 +208,9 @@ Options:
   -o, --outpath PATH       the input string will be cast to Path instance
   -z, --min_size INTEGER   minimum size of divergent set  [default: 7]
   -zp, --max_size INTEGER  maximum size of divergent set
-  -k INTEGER               k-mer size  [default: 3]
+  -k INTEGER               k-mer size  [default: 6]
+  -st, --stat [stdev|cov]  statistic to maximise  [default: stdev]
+  -i, --include TEXT       seqnames to include in divergent set
   -np, --numprocs INTEGER  number of processes  [default: 1]
   -L, --limit INTEGER      number of sequences to process
   -T, --test_run           reduce number of paths and size of query seqs
@@ -110,11 +221,75 @@ Options:
 ```
 <!-- [[[end]]] -->
 
-## Running the tests
 
+#### As a cogent3 plugin:
+
+`dvgt_select_max` is also available as a [cogent3 app](https://cogent3.org/doc/app/index.html). The result of using `cogent3.app_help("dvgt_select_max")` is shown below.
+
+<!-- [[[cog
+import cog
+import contextlib
+import io
+
+
+from cogent3 import app_help
+
+buffer = io.StringIO()
+
+with contextlib.redirect_stdout(buffer):
+  app_help("dvgt_select_max")
+cog.out(
+    "```\n{}\n```".format(buffer.getvalue())
+)
+]]] -->
 ```
-$ pytest -n auto
-```
+Overview
+--------
+selects the maximally divergent seqs from a sequence collection
+
+Options for making the app
+--------------------------
+dvgt_select_max_app = get_app(
+    'dvgt_select_max',
+    min_size=3,
+    max_size=10,
+    stat='stdev',
+    moltype='dna',
+    include=None,
+    k=6,
+    seed=None,
+)
+
+Parameters
+----------
+min_size
+    minimum size of the divergent set
+max_size
+    the maximum size if the divergent set
+stat
+    statistic for maximising the set, either mean_delta_jsd, mean_jsd, total_jsd
+moltype
+    molecular type of the sequences
+include
+    sequence names to include in the final result
+k
+    k-mer size
+seed
+    random number seed
 
-This runs in parallel, greatly speeding things up.
+Notes
+-----
+If called with an alignment, the ungapped sequences are used.
+The order of the sequences is randomised. If include is not None, the
+named sequences are added to the final result.
 
+Input type
+----------
+Alignment, SequenceCollection, ArrayAlignment
+
+Output type
+-----------
+Alignment, SequenceCollection, ArrayAlignment
+
+```
+<!-- [[[end]]] -->