From 480b9296c84e5c71ffa2e560a9c50085836577fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?=
Date: Wed, 3 May 2023 10:50:32 +0200
Subject: [PATCH] fix: error out if a callset does not match the truth at all (#43)

* fix: error out if a callset does not match the truth at all

* fmt

* trigger rerun
---
 workflow/rules/eval.smk                     | 6 ++++++
 workflow/scripts/collect-stratifications.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
index 3a23a23..5e8d746 100644
--- a/workflow/rules/eval.smk
+++ b/workflow/rules/eval.smk
@@ -137,6 +137,9 @@ rule collect_stratifications:
         "logs/collect-stratifications/{callset}/{vartype}.log",
     conda:
         "../envs/stats.yaml"
+    # We want this to be determined before FP/FN collection in order to avoid memory
+    # issues with callsets that do not match the truth at all.
+    priority: 2
     script:
         "../scripts/collect-stratifications.py"
 
@@ -221,6 +224,9 @@ rule collect_fp_fn:
         "logs/collect-fp-fn/{genome}/{cov}/{classification}.log",
     conda:
         "../envs/stats.yaml"
+    # This has to happen after precision/recall has been computed, otherwise we risk
+    # extremely high memory usage if a callset does not match the truth at all.
+    priority: 1
     script:
         "../scripts/collect-fp-fn.py"
 
diff --git a/workflow/scripts/collect-stratifications.py b/workflow/scripts/collect-stratifications.py
index 581ca91..b58c614 100644
--- a/workflow/scripts/collect-stratifications.py
+++ b/workflow/scripts/collect-stratifications.py
@@ -30,6 +30,12 @@ def load_data(f, coverage):
         load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
     )
 
+    if (report["tp_truth"] == 0).all():
+        raise ValueError(
+            f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
+            "This is likely a technical issue in the callset and should be checked before further evaluation."
+        )
+
     report.to_csv(snakemake.output[0], sep="\t", index=False)
 else:
     pd.DataFrame(