-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #95 from GavinHuttley/JOSS
Fix missing DOI's
- Loading branch information
Showing
44 changed files
with
14,624 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,4 @@ doc/draw* | |
dist/* | ||
working/* | ||
.ruff_cache/* | ||
manuscript/* | ||
paper/* | ||
jats/* |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
\documentclass{article} | ||
\usepackage{graphicx} | ||
\usepackage{caption} | ||
\usepackage{minted} | ||
\usepackage[a4paper]{geometry} | ||
|
||
\geometry{ | ||
paperwidth=12cm, | ||
paperheight=9.5cm, | ||
left=0.1cm, % Adjust margins as needed | ||
right=0.1cm, | ||
top=0.1cm, | ||
bottom=0.1cm | ||
} | ||
\pagestyle{empty} % no page numbers | ||
|
||
\begin{document} | ||
|
||
\begin{figure}[h] | ||
\centering | ||
\begin{minted}{python} | ||
# A list of sequences converted into k-mer counts. | ||
records: list[KmerSeq] | ||
shuffle(records) | ||
|
||
# The minimum size of the divergent set. | ||
min_size: int | ||
# The maximum size of the divergent set. | ||
max_size: int | ||
|
||
sr = SummedRecords.from_records(records[:min_size]) | ||
for r in records: | ||
if sr.increases_jsd(r): | ||
# Adding r to the N-1 set increased JSD over sr.jsd. | ||
# We define a new SummedRecords instance of {N} & {r}. | ||
nsr = sr + r | ||
# Has adding r increased the standard deviation? | ||
sr = nsr if nsr.std > sr.std else sr.replaced_lowest(r) | ||
if sr.size > max_size: | ||
# We stay within the user-specified set size | ||
# by dropping the record with lowest delta-JSD. | ||
sr = sr.dropped_lowest() | ||
\end{minted} | ||
|
||
\end{figure} | ||
|
||
\end{document} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
\documentclass{article} | ||
\usepackage{graphicx} | ||
\usepackage{caption} | ||
\usepackage{minted} | ||
\usepackage[a4paper]{geometry} | ||
|
||
\geometry{ | ||
paperwidth=12cm, | ||
paperheight=7cm, | ||
left=0.1cm, % Adjust margins as needed | ||
right=0.1cm, | ||
top=0.1cm, | ||
bottom=0.1cm | ||
} | ||
\pagestyle{empty} % no page numbers | ||
|
||
\begin{document} | ||
|
||
\begin{figure}[h] | ||
\centering | ||
\begin{minted}{python} | ||
# A list of sequences converted into k-mer counts. | ||
records: list[KmerSeq] | ||
# Randomise the order of the records | ||
shuffle(records) | ||
|
||
# The size of the divergent set. | ||
n: int | ||
|
||
# SummedRecords sorts records by their delta-JSD. The record | ||
# with the lowest delta-JSD is excluded from the N-1 set. | ||
sr = SummedRecords.from_records(records[:n]) | ||
for r in records: | ||
# Is JSD({N-1} & {r}) > JSD({N})? | ||
if sr.increases_jsd(r): | ||
# Create a new SummedRecords instance from {N-1} & {r}. | ||
sr = sr.replaced_lowest(r) | ||
\end{minted} | ||
|
||
\end{figure} | ||
|
||
\end{document} |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Summarising compute speed\n", | ||
"\n", | ||
"## The select algorithms: max and nmost\n", | ||
"\n", | ||
"`benchmark.py` was run against the REFSOIL collection -- 960 whole bacterial genomes. We separately recorded times to:\n", | ||
"- load the sequences from compressed files and convert them into `SeqRecord` objects\n", | ||
"- identify the divergent set\n", | ||
"\n", | ||
"Each condition was run 5 times with the sequences randomly drawn without replacement.\n", | ||
"\n", | ||
"### Synopsis\n", | ||
"\n", | ||
"Performance is approximately linear with the number of sequences for both the divergent `prep` and `max` steps." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import plotly.express as px\n", | ||
"import project_path\n", | ||
"from cogent3 import load_table, make_table\n", | ||
"\n", | ||
"write_pdf = project_path.pdf_writer()\n", | ||
"\n", | ||
"\n", | ||
"def _cast_element(row):\n", | ||
"    \"\"\"Return ``row`` as a tuple, falling back to ``int(row)``.\n", | ||
"\n", | ||
"    ``tuple(row)`` raises ``TypeError`` when ``row`` is not iterable, in\n", | ||
"    which case the value is converted with ``int`` instead. Used by\n", | ||
"    ``group_by`` so a filtered value can be compared with a group key.\n", | ||
"    \"\"\"\n", | ||
"    try:\n", | ||
"        return tuple(row)\n", | ||
"    except TypeError:\n", | ||
"        return int(row)\n", | ||
"\n", | ||
"\n", | ||
"def group_by(table, columns, quant_col, sort_cols):\n", | ||
"    \"\"\"Summarise ``quant_col`` for each distinct combination of ``columns``.\n", | ||
"\n", | ||
"    Parameters\n", | ||
"    ----------\n", | ||
"    table\n", | ||
"        table providing ``distinct_values``, ``filtered`` and ``columns``\n", | ||
"    columns\n", | ||
"        names of the columns that define a group\n", | ||
"    quant_col\n", | ||
"        name of the column to summarise\n", | ||
"    sort_cols\n", | ||
"        column(s) used to sort the returned table\n", | ||
"\n", | ||
"    Returns\n", | ||
"    -------\n", | ||
"    A table with the grouping columns followed by ``mean_<quant_col>``,\n", | ||
"    ``std_<quant_col>`` (computed with ddof=1) and ``n``, the number of\n", | ||
"    values in the group.\n", | ||
"    \"\"\"\n", | ||
"    columns = tuple(columns)\n", | ||
"    distinct = table.distinct_values(columns)\n", | ||
"    one_col = len(columns) == 1\n", | ||
"    results = []\n", | ||
"    for group in distinct:\n", | ||
"        subtable = table.filtered(lambda x: _cast_element(x) == group, columns=columns)\n", | ||
"        quants = subtable.columns[quant_col]\n", | ||
"        mean, std = quants.mean(), quants.std(ddof=1)\n", | ||
"        # a single grouping column yields a scalar key, otherwise a tuple\n", | ||
"        results.append(\n", | ||
"            ([group] if one_col else list(group)) + [mean, std, quants.shape[0]],\n", | ||
"        )\n", | ||
"\n", | ||
"    table = make_table(\n", | ||
"        header=list(columns) + [f\"mean_{quant_col}\", f\"std_{quant_col}\", \"n\"],\n", | ||
"        rows=results,\n", | ||
"    )\n", | ||
"    table = table.sorted(columns=sort_cols)\n", | ||
"    return table\n", | ||
"\n", | ||
"\n", | ||
"table = load_table(project_path.RESULT_DIR / \"benchmark-max.tsv\")\n", | ||
"st = table.filtered(lambda x: x == \"prep\", columns=\"command\").get_columns(\n", | ||
" (\"numseqs\", \"time(s)\"),\n", | ||
")\n", | ||
"prep_time = group_by(st, (\"numseqs\",), \"time(s)\", \"numseqs\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"px.scatter(\n", | ||
" prep_time,\n", | ||
" x=\"numseqs\",\n", | ||
" y=\"mean_time(s)\",\n", | ||
" error_y=\"std_time(s)\",\n", | ||
" labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n", | ||
" trendline=\"ols\",\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# The command column holds strings, so compare against the string \"max\"\n", | ||
"# (comparing against the builtin function `max` never matches).\n", | ||
"st = table.filtered(lambda x: x == \"max\", columns=\"command\").get_columns(\n", | ||
"    (\"numseqs\", \"k\", \"time(s)\"),\n", | ||
")\n", | ||
"max_time = group_by(st, (\"numseqs\", \"k\"), \"time(s)\", (\"numseqs\", \"k\"))\n", | ||
"# Restrict the figure to k=2 and k=8; explicit membership avoids the\n", | ||
"# substring test also accepting other values such as 28.\n", | ||
"st = max_time.filtered(lambda x: str(x) in {\"2\", \"8\"}, columns=\"k\")\n", | ||
"# k is cast to str so plotly treats it as a discrete colour category\n", | ||
"st.columns[\"k\"] = st.columns[\"k\"].astype(str)\n", | ||
"\n", | ||
"tickfont = dict(size=16)\n", | ||
"titlefont = dict(size=20)\n", | ||
"legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n", | ||
"\n", | ||
"fig = px.scatter(\n", | ||
"    st,\n", | ||
"    x=\"numseqs\",\n", | ||
"    y=\"mean_time(s)\",\n", | ||
"    error_y=\"std_time(s)\",\n", | ||
"    color=\"k\",\n", | ||
"    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n", | ||
"    trendline=\"ols\",\n", | ||
")\n", | ||
"# Axis title fonts are set via title.font; the `titlefont` shortcut is\n", | ||
"# deprecated and rejected by newer plotly releases.\n", | ||
"fig.update_layout(\n", | ||
"    height=600,\n", | ||
"    width=1400,\n", | ||
"    xaxis=dict(\n", | ||
"        title=dict(text=\"Number of Sequences\", font=titlefont),\n", | ||
"        tickfont=tickfont,\n", | ||
"    ),\n", | ||
"    yaxis=dict(\n", | ||
"        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n", | ||
"        tickfont=tickfont,\n", | ||
"    ),\n", | ||
"    legend=legend,\n", | ||
")\n", | ||
"fig.update_traces(marker=dict(size=12))\n", | ||
"fig.show()\n", | ||
"outpath = project_path.FIG_DIR / \"compute_time.pdf\"\n", | ||
"# write_pdf(fig, outpath)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"st = max_time.filtered(lambda x: str(x) == \"200\", columns=\"numseqs\")\n", | ||
"st.columns[\"numseqs\"] = st.columns[\"numseqs\"].astype(str)\n", | ||
"px.scatter(\n", | ||
" st,\n", | ||
" x=\"k\",\n", | ||
" y=\"mean_time(s)\",\n", | ||
" error_y=\"std_time(s)\",\n", | ||
" color=\"numseqs\",\n", | ||
" labels={\"mean_time(s)\": \"Seconds\", \"k\": \"<i>k</i>\"},\n", | ||
" trendline=\"ols\",\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## The cluster algorithm: ctree\n", | ||
"\n", | ||
"`benchmark_ctree.py` was run against the REFSOIL collection. We recorded the time to resolve the tree.\n", | ||
"\n", | ||
"Each condition was run 3 times.\n", | ||
"\n", | ||
"### Synopsis\n", | ||
"\n", | ||
"Performance is approximately linear with the number of sequences, as the algorithm scales in proportion to the total sequence length." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"table = load_table(project_path.RESULT_DIR / \"benchmark-ctree.tsv\")\n", | ||
"ctree_time = group_by(table, (\"numseqs\", \"k\"), \"time(s)\", (\"numseqs\", \"k\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# k is cast to str so plotly treats it as a discrete colour category\n", | ||
"ctree_time.columns[\"k\"] = ctree_time.columns[\"k\"].astype(str)\n", | ||
"\n", | ||
"tickfont = dict(size=16)\n", | ||
"titlefont = dict(size=20)\n", | ||
"legend = dict(title=dict(text=\"<i>k</i>\"), font=dict(size=17), tracegroupgap=10)\n", | ||
"\n", | ||
"fig = px.scatter(\n", | ||
"    ctree_time,\n", | ||
"    x=\"numseqs\",\n", | ||
"    y=\"mean_time(s)\",\n", | ||
"    error_y=\"std_time(s)\",\n", | ||
"    color=\"k\",\n", | ||
"    labels={\"mean_time(s)\": \"Seconds\", \"numseqs\": \"Number of sequences\"},\n", | ||
"    trendline=\"ols\",\n", | ||
")\n", | ||
"# Axis title fonts are set via title.font; the `titlefont` shortcut is\n", | ||
"# deprecated and rejected by newer plotly releases.\n", | ||
"fig.update_layout(\n", | ||
"    height=600,\n", | ||
"    width=1400,\n", | ||
"    xaxis=dict(\n", | ||
"        title=dict(text=\"Number of Sequences\", font=titlefont),\n", | ||
"        tickfont=tickfont,\n", | ||
"    ),\n", | ||
"    yaxis=dict(\n", | ||
"        title=dict(text=\"Mean time (seconds)\", font=titlefont),\n", | ||
"        tickfont=tickfont,\n", | ||
"    ),\n", | ||
"    legend=legend,\n", | ||
")\n", | ||
"fig.update_traces(marker=dict(size=12))\n", | ||
"fig.show()\n", | ||
"outpath = project_path.FIG_DIR / \"compute_time-ctree.pdf\"\n", | ||
"# write_pdf(fig, outpath)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "dvgt", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.4" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.