From 480b9296c84e5c71ffa2e560a9c50085836577fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?= <johannes.koester@tu-dortmund.de>
Date: Wed, 3 May 2023 10:50:32 +0200
Subject: [PATCH] fix: error out if a callset does not match the truth at all
 (#43)

* fix: error out if a callset does not match the truth at all

* fmt

* trigger rerun
---
 workflow/rules/eval.smk                     | 6 ++++++
 workflow/scripts/collect-stratifications.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
index 3a23a23..5e8d746 100644
--- a/workflow/rules/eval.smk
+++ b/workflow/rules/eval.smk
@@ -137,6 +137,9 @@ rule collect_stratifications:
         "logs/collect-stratifications/{callset}/{vartype}.log",
     conda:
         "../envs/stats.yaml"
+    # Prefer running this rule before FP/FN collection, so that callsets that do not
+    # match the truth at all error out here instead of causing memory issues there.
+    priority: 2
     script:
         "../scripts/collect-stratifications.py"
 
@@ -221,6 +224,9 @@ rule collect_fp_fn:
         "logs/collect-fp-fn/{genome}/{cov}/{classification}.log",
     conda:
         "../envs/stats.yaml"
+    # This has to happen after precision/recall has been computed, otherwise we risk
+    # extremely high memory usage if a callset does not match the truth at all.
+    priority: 1
     script:
         "../scripts/collect-fp-fn.py"
 
diff --git a/workflow/scripts/collect-stratifications.py b/workflow/scripts/collect-stratifications.py
index 581ca91..b58c614 100644
--- a/workflow/scripts/collect-stratifications.py
+++ b/workflow/scripts/collect-stratifications.py
@@ -30,6 +30,12 @@ def load_data(f, coverage):
         load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
     )
 
+    if (report["tp_truth"] == 0).all():
+        raise ValueError(
+            f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
+            "This is likely a technical issue in the callset and should be checked before further evaluation."
+        )
+
     report.to_csv(snakemake.output[0], sep="\t", index=False)
 else:
     pd.DataFrame(