Merge pull request #95 from GavinHuttley/JOSS

Fix missing DOI's
HuttleyLab · Jan 12, 2025 · bcc24b4 · bcc24b4
2 parents 6f615e6 + 3bdfeb1
commit bcc24b4
Show file tree

Hide file tree

Showing 44 changed files with 14,624 additions and 2 deletions.
diff --git a/.hgignore b/.hgignore
@@ -35,6 +35,4 @@ doc/draw*
 dist/*
 working/*
 .ruff_cache/*
-manuscript/*
-paper/*
 jats/*
diff --git a/paper/figs/compute_time-ctree.pdf b/paper/figs/compute_time-ctree.pdf
diff --git a/paper/figs/compute_time.pdf b/paper/figs/compute_time.pdf
diff --git a/paper/figs/jsd_v_dist.pdf b/paper/figs/jsd_v_dist.pdf
diff --git a/paper/figs/likelihood_vs_k_for_ss.pdf b/paper/figs/likelihood_vs_k_for_ss.pdf
diff --git a/paper/figs/likelihood_vs_ss_for_k.pdf b/paper/figs/likelihood_vs_ss_for_k.pdf
diff --git a/paper/figs/max.pdf b/paper/figs/max.pdf
diff --git a/paper/figs/max.tex b/paper/figs/max.tex
@@ -0,0 +1,47 @@
+\documentclass{article} % standalone document wrapping a single Python listing
+\usepackage{graphicx} % NOTE(review): no \includegraphics appears in this file -- possibly unused
+\usepackage{caption} % NOTE(review): no \caption appears in this file -- possibly unused
+\usepackage{minted} % provides the minted environment used for the listing below
+\usepackage[a4paper]{geometry} % NOTE(review): paper size is set explicitly below, presumably overriding a4paper -- confirm
+
+\geometry{
+    paperwidth=12cm, % custom page size, presumably chosen to fit the listing
+    paperheight=9.5cm,
+    left=0.1cm,   % Adjust margins as needed
+    right=0.1cm,
+    top=0.1cm,
+    bottom=0.1cm
+}
+\pagestyle{empty} % no page numbers
+
+\begin{document}
+
+\begin{figure}[h] % float holding only the listing; no caption is set
+    \centering
+        \begin{minted}{python}
+# A list of sequences converted into k-mer counts.
+records: list[KmerSeq]
+shuffle(records)
+
+# The minimum size of the divergent set.
+min_size: int
+# The maximum size of the divergent set.
+max_size: int
+
+sr = SummedRecords.from_records(records[:min_size])
+for r in records:
+    if sr.increases_jsd(r):
+        # Adding r to the N-1 set increased JSD over sr.jsd.
+        # We define a new SummedRecords instance of {N} & {r}.
+        nsr = sr + r
+        # Has adding r increased the standard deviation?
+        sr = nsr if nsr.std > sr.std else sr.replaced_lowest(r)
+        if sr.size > max_size:
+            # We stay within the user specified set size
+            # by dropping the record with lowest delta-JSD.
+            sr = sr.dropped_lowest()
+            \end{minted}
+
+\end{figure}
+
+\end{document}
diff --git a/paper/figs/nmost.pdf b/paper/figs/nmost.pdf
diff --git a/paper/figs/nmost.tex b/paper/figs/nmost.tex
@@ -0,0 +1,42 @@
+\documentclass{article} % standalone document wrapping a single Python listing
+\usepackage{graphicx} % NOTE(review): no \includegraphics appears in this file -- possibly unused
+\usepackage{caption} % NOTE(review): no \caption appears in this file -- possibly unused
+\usepackage{minted} % provides the minted environment used for the listing below
+\usepackage[a4paper]{geometry} % NOTE(review): paper size is set explicitly below, presumably overriding a4paper -- confirm
+
+\geometry{
+    paperwidth=12cm, % custom page size, presumably chosen to fit the listing
+    paperheight=7cm,
+    left=0.1cm,   % Adjust margins as needed
+    right=0.1cm,
+    top=0.1cm,
+    bottom=0.1cm
+}
+\pagestyle{empty} % no page numbers
+
+\begin{document}
+
+\begin{figure}[h] % float holding only the listing; no caption is set
+    \centering
+        \begin{minted}{python}
+# A list of sequences converted into k-mer counts.
+records: list[KmerSeq]
+# Randomise the order of the records
+shuffle(records)
+
+# The size of the divergent set.
+n: int
+
+# SummedRecords sorts records by their delta-JSD. The record
+# with the lowest delta-JSD is excluded from the N-1 set.
+sr = SummedRecords.from_records(records[:n])
+for r in records:
+    # Is JSD({N-1} & {r}) > JSD({N})?
+    if sr.increases_jsd(r):
+        # Create a new SummedRecords instance from {N-1} & {r}.
+        sr = sr.replaced_lowest(r)
+        \end{minted}
+
+\end{figure}
+
+\end{document}
diff --git a/paper/figs/selected_edges.pdf b/paper/figs/selected_edges.pdf
diff --git a/paper/figs/synthetic_known_bar.pdf b/paper/figs/synthetic_known_bar.pdf
diff --git a/paper/nbks/benchmark.ipynb b/paper/nbks/benchmark.ipynb
@@ -0,0 +1,242 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summarising compute speed\n",
+    "\n",
+    "## The select algorithms: max and nmost\n",
+    "\n",
+    "`benchmark.py` was run against the REFSOIL collection -- 960 whole bacterial genomes. We separately recorded times to:\n",
+    "- load the sequences from compressed files and convert them into `SeqRecord` objects\n",
+    "- identify the divergent set\n",
+    "\n",
+    "Each condition was run 5 times with the sequences randomly drawn without replacement.\n",
+    "\n",
+    "### Synopsis\n",
+    "\n",
+    "Performance is approximately linear with the number of sequences for both the divergent `prep` and `max` steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import plotly.express as px\n",
+    "import project_path\n",
+    "from cogent3 import load_table, make_table\n",
+    "\n",
+    "write_pdf = project_path.pdf_writer()\n",
+    "\n",
+    "\n",
+    "def _cast_element(row):\n",
+    "    \"\"\"Return `row` as a tuple, falling back to `int(row)` when it is not iterable.\"\"\"\n",
+    "    try:\n",
+    "        cast = tuple(row)\n",
+    "    except TypeError:\n",
+    "        # non-iterable input, e.g. a single numeric cell\n",
+    "        cast = int(row)\n",
+    "    return cast\n",
+    "\n",
+    "\n",
+    "def group_by(table, columns, quant_col, sort_cols):\n",
+    "    \"\"\"Summarise `quant_col` within each distinct combination of `columns`.\n",
+    "\n",
+    "    Builds a new table holding the grouping columns followed by the mean,\n",
+    "    standard deviation (ddof=1) and number of `quant_col` values per group,\n",
+    "    sorted by `sort_cols`.\n",
+    "    \"\"\"\n",
+    "    columns = tuple(columns)\n",
+    "    single_column = len(columns) == 1\n",
+    "    summary_rows = []\n",
+    "    for key in table.distinct_values(columns):\n",
+    "        members = table.filtered(lambda x: _cast_element(x) == key, columns=columns)\n",
+    "        values = members.columns[quant_col]\n",
+    "        mean, std = values.mean(), values.std(ddof=1)\n",
+    "        # a single grouping column yields a bare value rather than a tuple\n",
+    "        key_cells = [key] if single_column else list(key)\n",
+    "        summary_rows.append(key_cells + [mean, std, values.shape[0]])\n",
+    "\n",
+    "    header = list(columns) + [f\"mean_{quant_col}\", f\"std_{quant_col}\", \"n\"]\n",
+    "    summary = make_table(header=header, rows=summary_rows)\n",
+    "    return summary.sorted(columns=sort_cols)\n",
+    "\n",
+    "\n",
+    "table = load_table(project_path.RESULT_DIR / \"benchmark-max.tsv\")\n",
+    "# timings for the `prep` command only, summarised per number of sequences\n",
+    "prep_rows = table.filtered(lambda x: x == \"prep\", columns=\"command\")\n",
+    "st = prep_rows.get_columns((\"numseqs\", \"time(s)\"))\n",
+    "prep_time = group_by(st, (\"numseqs\",), \"time(s)\", \"numseqs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mean prep time against number of sequences, with an OLS trendline\n",
+    "prep_labels = {\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"}\n",
+    "px.scatter(\n",
+    "    prep_time,\n",
+    "    x=\"numseqs\",\n",
+    "    y=\"mean_time(s)\",\n",
+    "    error_y=\"std_time(s)\",\n",
+    "    labels=prep_labels,\n",
+    "    trendline=\"ols\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `command` holds strings, so compare against the label \"max\" (not the builtin max())\n",
+    "st = table.filtered(lambda x: x == \"max\", columns=\"command\").get_columns(\n",
+    "    (\"numseqs\", \"k\", \"time(s)\"),\n",
+    ")\n",
+    "max_time = group_by(st, (\"numseqs\", \"k\"), \"time(s)\", (\"numseqs\", \"k\"))\n",
+    "# exact membership: a substring test against \"28\" would also accept k == 28\n",
+    "st = max_time.filtered(lambda x: str(x) in (\"2\", \"8\"), columns=\"k\")\n",
+    "# string-valued k, presumably so plotly colours the traces as discrete categories\n",
+    "st.columns[\"k\"] = st.columns[\"k\"].astype(str)\n",
+    "\n",
+    "tickfont = dict(size=16)\n",
+    "titlefont = dict(size=20)\n",
+    "legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n",
+    "\n",
+    "fig = px.scatter(\n",
+    "    st,\n",
+    "    x=\"numseqs\",\n",
+    "    y=\"mean_time(s)\",\n",
+    "    error_y=\"std_time(s)\",\n",
+    "    color=\"k\",\n",
+    "    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n",
+    "    trendline=\"ols\",\n",
+    ")\n",
+    "# Axis title fonts are set through the nested `title.font` property; the\n",
+    "# flat `titlefont` axis attribute is deprecated in plotly.\n",
+    "fig.update_layout(\n",
+    "    height=600,\n",
+    "    width=1400,\n",
+    "    xaxis=dict(\n",
+    "        title=dict(text=\"Number of Sequences\", font=titlefont),\n",
+    "        tickfont=tickfont,\n",
+    "    ),\n",
+    "    yaxis=dict(\n",
+    "        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n",
+    "        tickfont=tickfont,\n",
+    "    ),\n",
+    "    legend=legend,\n",
+    ")\n",
+    "fig.update_traces(marker=dict(size=12))\n",
+    "fig.show()\n",
+    "outpath = project_path.FIG_DIR / \"compute_time.pdf\"\n",
+    "# write_pdf(fig, outpath)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# timing as a function of k, restricted to the runs with 200 sequences\n",
+    "st = max_time.filtered(lambda x: str(x) == \"200\", columns=\"numseqs\")\n",
+    "st.columns[\"numseqs\"] = st.columns[\"numseqs\"].astype(str)\n",
+    "scatter_kwargs = dict(\n",
+    "    x=\"k\",\n",
+    "    y=\"mean_time(s)\",\n",
+    "    error_y=\"std_time(s)\",\n",
+    "    color=\"numseqs\",\n",
+    "    labels={\"mean_time(s)\": \"Seconds\", \"k\": \"<i>k</i>\"},\n",
+    "    trendline=\"ols\",\n",
+    ")\n",
+    "px.scatter(st, **scatter_kwargs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## The cluster algorithm: ctree\n",
+    "\n",
+    "`benchmark_ctree.py` was run against the REFSOIL collection. We recorded the time to resolve the tree.\n",
+    "\n",
+    "Each condition was run 3 times.\n",
+    "\n",
+    "### Synopsis\n",
+    "\n",
+    "Performance is approximately linear with the number of sequences, as the algorithm scales in proportion to the total sequence length."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# summarise the ctree timings for every (numseqs, k) combination\n",
+    "ctree_result_path = project_path.RESULT_DIR / \"benchmark-ctree.tsv\"\n",
+    "table = load_table(ctree_result_path)\n",
+    "ctree_groups = (\"numseqs\", \"k\")\n",
+    "ctree_time = group_by(table, ctree_groups, \"time(s)\", ctree_groups)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# string-valued k, presumably so plotly colours the traces as discrete categories\n",
+    "ctree_time.columns[\"k\"] = ctree_time.columns[\"k\"].astype(str)\n",
+    "\n",
+    "tickfont = dict(size=16)\n",
+    "titlefont = dict(size=20)\n",
+    "legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n",
+    "\n",
+    "fig = px.scatter(\n",
+    "    ctree_time,\n",
+    "    x=\"numseqs\",\n",
+    "    y=\"mean_time(s)\",\n",
+    "    error_y=\"std_time(s)\",\n",
+    "    color=\"k\",\n",
+    "    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n",
+    "    trendline=\"ols\",\n",
+    ")\n",
+    "# Axis title fonts are set through the nested `title.font` property; the\n",
+    "# flat `titlefont` axis attribute is deprecated in plotly.\n",
+    "fig.update_layout(\n",
+    "    height=600,\n",
+    "    width=1400,\n",
+    "    xaxis=dict(\n",
+    "        title=dict(text=\"Number of Sequences\", font=titlefont),\n",
+    "        tickfont=tickfont,\n",
+    "    ),\n",
+    "    yaxis=dict(\n",
+    "        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n",
+    "        tickfont=tickfont,\n",
+    "    ),\n",
+    "    legend=legend,\n",
+    ")\n",
+    "fig.update_traces(marker=dict(size=12))\n",
+    "fig.show()\n",
+    "outpath = project_path.FIG_DIR / \"compute_time-ctree.pdf\"\n",
+    "# write_pdf(fig, outpath)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dvgt",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}