Skip to content

Commit

Permalink
Merge pull request #95 from GavinHuttley/JOSS
Browse files Browse the repository at this point in the history
Fix missing DOI's
  • Loading branch information
GavinHuttley authored Jan 12, 2025
2 parents 6f615e6 + 3bdfeb1 commit bcc24b4
Show file tree
Hide file tree
Showing 44 changed files with 14,624 additions and 2 deletions.
2 changes: 0 additions & 2 deletions .hgignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,4 @@ doc/draw*
dist/*
working/*
.ruff_cache/*
manuscript/*
paper/*
jats/*
Binary file added paper/figs/compute_time-ctree.pdf
Binary file not shown.
Binary file added paper/figs/compute_time.pdf
Binary file not shown.
Binary file added paper/figs/jsd_v_dist.pdf
Binary file not shown.
Binary file added paper/figs/likelihood_vs_k_for_ss.pdf
Binary file not shown.
Binary file added paper/figs/likelihood_vs_ss_for_k.pdf
Binary file not shown.
Binary file added paper/figs/max.pdf
Binary file not shown.
47 changes: 47 additions & 0 deletions paper/figs/max.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
% Preamble for a small, page-number-free page holding a single figure.
\documentclass{article}
% NOTE(review): no graphicx or caption command is visible in this file --
% confirm whether these two packages are still needed.
\usepackage{graphicx}
\usepackage{caption}
% minted supplies the syntax-highlighted listing environment.
\usepackage{minted}
\usepackage[a4paper]{geometry}

% NOTE(review): the explicit paperwidth/paperheight below presumably supersede
% the a4paper option passed when loading geometry -- confirm on the built PDF.
\geometry{
paperwidth=12cm,
paperheight=9.5cm,
left=0.1cm, % Adjust margins as needed
right=0.1cm,
top=0.1cm,
bottom=0.1cm
}
\pagestyle{empty} % no page numbers

\begin{document}

\begin{figure}[h]
\centering
\begin{minted}{python}
# A list of sequences converted into k-mer counts.
records: list[KmerSeq]
shuffle(records)

# The minimum size of the divergent set.
min_size: int
# The maximum size of the divergent set.
max_size: int

sr = SummedRecords.from_records(records[:min_size])
for r in records:
if sr.increases_jsd(r):
# Adding r to the N-1 set increased JSD over sr.jsd.
# We define a new SummedRecords instance of {N} & {r}.
nsr = sr + r
# Has adding r increased the standard deviation?
sr = nsr if nsr.std > sr.std else sr.replaced_lowest(r)
if sr.size > max_size:
# We stay within the user specified set size
# by dropping the record with lowest delta-JSD.
sr = sr.dropped_lowest()
\end{minted}

\end{figure}

\end{document}
Binary file added paper/figs/nmost.pdf
Binary file not shown.
42 changes: 42 additions & 0 deletions paper/figs/nmost.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
% Preamble for a small, page-number-free page holding a single figure.
\documentclass{article}
% NOTE(review): no graphicx or caption command is visible in this file --
% confirm whether these two packages are still needed.
\usepackage{graphicx}
\usepackage{caption}
% minted supplies the syntax-highlighted listing environment.
\usepackage{minted}
\usepackage[a4paper]{geometry}

% NOTE(review): the explicit paperwidth/paperheight below presumably supersede
% the a4paper option passed when loading geometry -- confirm on the built PDF.
\geometry{
paperwidth=12cm,
paperheight=7cm,
left=0.1cm, % Adjust margins as needed
right=0.1cm,
top=0.1cm,
bottom=0.1cm
}
\pagestyle{empty} % no page numbers

\begin{document}

\begin{figure}[h]
\centering
\begin{minted}{python}
# A list of sequences converted into k-mer counts.
records: list[KmerSeq]
# Randomise the order of the records
shuffle(records)

# The size of the divergent set.
n: int

# SummedRecords sorts records by their delta-JSD. The record
# with the lowest delta-JSD is excluded from the N-1 set.
sr = SummedRecords.from_records(records[:n])
for r in records:
# Is JSD({N-1} & {r}) > JSD({N})?
if sr.increases_jsd(r):
# Create a new SummedRecords instance from {N-1} & {r}.
sr = sr.replaced_lowest(r)
\end{minted}

\end{figure}

\end{document}
Binary file added paper/figs/selected_edges.pdf
Binary file not shown.
Binary file added paper/figs/synthetic_known_bar.pdf
Binary file not shown.
242 changes: 242 additions & 0 deletions paper/nbks/benchmark.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Summarising compute speed\n",
"\n",
"## The select algorithms: max and nmost\n",
"\n",
"`benchmark.py` was run against the REFSOIL collection -- 960 whole bacterial genomes. We separately recorded the time taken to:\n",
"- load the sequences from compressed files and convert them into `SeqRecord` objects\n",
"- identify the divergent set\n",
"\n",
"Each condition was run 5 times, with the sequences randomly drawn without replacement.\n",
"\n",
"### Synopsis\n",
"\n",
"Run time is approximately linear in the number of sequences for both the divergent `prep` and `max` steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"import project_path\n",
"from cogent3 import load_table, make_table\n",
"\n",
"write_pdf = project_path.pdf_writer()\n",
"\n",
"\n",
"def _cast_element(row):\n",
"    \"\"\"Return ``row`` as a tuple, or as an int when it cannot be iterated.\"\"\"\n",
"    try:\n",
"        key = tuple(row)\n",
"    except TypeError:\n",
"        # not iterable, so treat it as a single value\n",
"        key = int(row)\n",
"    return key\n",
"\n",
"\n",
"def group_by(table, columns, quant_col, sort_cols):\n",
"    \"\"\"Summarise ``quant_col`` for each distinct combination of ``columns``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    table\n",
"        table providing ``distinct_values``, ``filtered`` and ``columns``\n",
"    columns\n",
"        names of the columns whose distinct values define the groups\n",
"    quant_col\n",
"        name of the column to summarise\n",
"    sort_cols\n",
"        column name(s) used to sort the returned table\n",
"\n",
"    Returns\n",
"    -------\n",
"    A table with one row per group holding the group values followed by\n",
"    the mean, the standard deviation (ddof=1) and the number of values\n",
"    of ``quant_col`` in that group.\n",
"    \"\"\"\n",
"    columns = tuple(columns)\n",
"    keys = table.distinct_values(columns)\n",
"    single = len(columns) == 1\n",
"    rows = []\n",
"    for key in keys:\n",
"        members = table.filtered(lambda x: _cast_element(x) == key, columns=columns)\n",
"        values = members.columns[quant_col]\n",
"        mean = values.mean()\n",
"        std = values.std(ddof=1)\n",
"        # a single grouping column yields a bare value rather than a sequence\n",
"        prefix = [key] if single else list(key)\n",
"        rows.append(prefix + [mean, std, values.shape[0]])\n",
"\n",
"    header = list(columns) + [f\"mean_{quant_col}\", f\"std_{quant_col}\", \"n\"]\n",
"    summary = make_table(header=header, rows=rows)\n",
"    return summary.sorted(columns=sort_cols)\n",
"\n",
"\n",
"table = load_table(project_path.RESULT_DIR / \"benchmark-max.tsv\")\n",
"st = table.filtered(lambda x: x == \"prep\", columns=\"command\").get_columns(\n",
"    (\"numseqs\", \"time(s)\"),\n",
")\n",
"prep_time = group_by(st, (\"numseqs\",), \"time(s)\", \"numseqs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"px.scatter(\n",
" prep_time,\n",
" x=\"numseqs\",\n",
" y=\"mean_time(s)\",\n",
" error_y=\"std_time(s)\",\n",
" labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n",
" trendline=\"ols\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Timings of the `max` command. The command column is matched against the\n",
"# string \"max\"; comparing with the built-in function `max` never matches,\n",
"# which would leave nothing to plot.\n",
"st = table.filtered(lambda x: x == \"max\", columns=\"command\").get_columns(\n",
"    (\"numseqs\", \"k\", \"time(s)\"),\n",
")\n",
"max_time = group_by(st, (\"numseqs\", \"k\"), \"time(s)\", (\"numseqs\", \"k\"))\n",
"# Keep k=2 and k=8 only. Explicit membership avoids the substring test\n",
"# `str(x) in \"28\"`, which would also accept k=28.\n",
"st = max_time.filtered(lambda x: str(x) in (\"2\", \"8\"), columns=\"k\")\n",
"# k as text so it is plotted as a discrete colour category\n",
"st.columns[\"k\"] = st.columns[\"k\"].astype(str)\n",
"\n",
"tickfont = dict(size=16)\n",
"titlefont = dict(size=20)\n",
"legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n",
"\n",
"fig = px.scatter(\n",
"    st,\n",
"    x=\"numseqs\",\n",
"    y=\"mean_time(s)\",\n",
"    error_y=\"std_time(s)\",\n",
"    color=\"k\",\n",
"    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n",
"    trendline=\"ols\",\n",
")\n",
"# The axis title font is set through `title.font`; the deprecated `titlefont`\n",
"# axis attribute is not accepted by recent plotly releases.\n",
"fig.update_layout(\n",
"    height=600,\n",
"    width=1400,\n",
"    xaxis=dict(\n",
"        title=dict(text=\"Number of Sequences\", font=titlefont),\n",
"        tickfont=tickfont,\n",
"    ),\n",
"    yaxis=dict(\n",
"        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n",
"        tickfont=tickfont,\n",
"    ),\n",
"    legend=legend,\n",
")\n",
"fig.update_traces(marker=dict(size=12))\n",
"fig.show()\n",
"outpath = project_path.FIG_DIR / \"compute_time.pdf\"\n",
"# write_pdf(fig, outpath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"st = max_time.filtered(lambda x: str(x) == \"200\", columns=\"numseqs\")\n",
"st.columns[\"numseqs\"] = st.columns[\"numseqs\"].astype(str)\n",
"px.scatter(\n",
" st,\n",
" x=\"k\",\n",
" y=\"mean_time(s)\",\n",
" error_y=\"std_time(s)\",\n",
" color=\"numseqs\",\n",
" labels={\"mean_time(s)\": \"Seconds\", \"k\": \"<i>k</i>\"},\n",
" trendline=\"ols\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The cluster algorithm: ctree\n",
"\n",
"`benchmark_ctree.py` was run against the REFSOIL collection. We recorded the time taken to resolve the tree.\n",
"\n",
"Each condition was run 3 times.\n",
"\n",
"### Synopsis\n",
"\n",
"Run time is approximately linear in the number of sequences, as the algorithm scales in proportion to the total sequence length."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table = load_table(project_path.RESULT_DIR / \"benchmark-ctree.tsv\")\n",
"ctree_time = group_by(table, (\"numseqs\", \"k\"), \"time(s)\", (\"numseqs\", \"k\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# k as text so it is plotted as a discrete colour category\n",
"ctree_time.columns[\"k\"] = ctree_time.columns[\"k\"].astype(str)\n",
"\n",
"tickfont = dict(size=16)\n",
"titlefont = dict(size=20)\n",
"legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n",
"\n",
"fig = px.scatter(\n",
"    ctree_time,\n",
"    x=\"numseqs\",\n",
"    y=\"mean_time(s)\",\n",
"    error_y=\"std_time(s)\",\n",
"    color=\"k\",\n",
"    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n",
"    trendline=\"ols\",\n",
")\n",
"# The axis title font is set through `title.font`; the deprecated `titlefont`\n",
"# axis attribute is not accepted by recent plotly releases.\n",
"fig.update_layout(\n",
"    height=600,\n",
"    width=1400,\n",
"    xaxis=dict(\n",
"        title=dict(text=\"Number of Sequences\", font=titlefont),\n",
"        tickfont=tickfont,\n",
"    ),\n",
"    yaxis=dict(\n",
"        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n",
"        tickfont=tickfont,\n",
"    ),\n",
"    legend=legend,\n",
")\n",
"fig.update_traces(marker=dict(size=12))\n",
"fig.show()\n",
"outpath = project_path.FIG_DIR / \"compute_time-ctree.pdf\"\n",
"# write_pdf(fig, outpath)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dvgt",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit bcc24b4

Please sign in to comment.