fix: error out if a callset does not match the truth at all (#43)

* fix: error out if a callset does not match the truth at all
* fmt
* trigger rerun
snakemake-workflows · May 3, 2023 · 480b929 · 480b929
1 parent 2f7dddf
commit 480b929
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 0 deletions.
diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
@@ -137,6 +137,9 @@ rule collect_stratifications:
         "logs/collect-stratifications/{callset}/{vartype}.log",
     conda:
         "../envs/stats.yaml"
+    # We want this to be determined before FP/FN collection in order to avoid memory
+    # issues with callsets that do not match the truth at all.
+    priority: 2
     script:
         "../scripts/collect-stratifications.py"
 
@@ -221,6 +224,9 @@ rule collect_fp_fn:
         "logs/collect-fp-fn/{genome}/{cov}/{classification}.log",
     conda:
         "../envs/stats.yaml"
+    # This has to happen after precision/recall has been computed, otherwise we risk
+    # extremely high memory usage if a callset does not match the truth at all.
+    priority: 1
     script:
         "../scripts/collect-fp-fn.py"
 

diff --git a/workflow/scripts/collect-stratifications.py b/workflow/scripts/collect-stratifications.py
@@ -30,6 +30,12 @@ def load_data(f, coverage):
         load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
     )
 
+    if (report["tp_truth"] == 0).all():
+        raise ValueError(
+            f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
+            "This is likely a technical issue in the callset and should be checked before further evaluation."
+        )
+
     report.to_csv(snakemake.output[0], sep="\t", index=False)
 else:
     pd.DataFrame(