From 75113fafc8aaf797cdd05f3652c327ef0b301e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 10 Jan 2023 14:14:16 +0100 Subject: [PATCH] feat: replace hap.py with direct use of vcfeval (#30) * feat: replace happy with vcfeval * fmt * fmt * rename script * fix import path --- workflow/envs/rtg-tools.yaml | 6 +++ workflow/rules/common.smk | 4 +- workflow/rules/eval.smk | 54 +++++++++++-------- workflow/rules/utils.smk | 2 - workflow/scripts/calc-precision-recall.py | 5 +- .../{happy_report.py => classification.py} | 15 +++++- workflow/scripts/extract-fp-fn.py | 2 +- 7 files changed, 58 insertions(+), 30 deletions(-) create mode 100644 workflow/envs/rtg-tools.yaml rename workflow/scripts/common/{happy_report.py => classification.py} (92%) diff --git a/workflow/envs/rtg-tools.yaml b/workflow/envs/rtg-tools.yaml new file mode 100644 index 0000000..98873c8 --- /dev/null +++ b/workflow/envs/rtg-tools.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - rtg-tools =3.12.1 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f86a593..c97332b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -41,7 +41,7 @@ coverages = { common_src = [ workflow.source_path("../scripts/common/__init__.py"), - workflow.source_path("../scripts/common/happy_report.py"), + workflow.source_path("../scripts/common/classification.py"), ] @@ -396,3 +396,5 @@ if "variant-calls" in config: wildcard_constraints: callset="|".join(config["variant-calls"]), classification="|".join(["fp", "fn"]), + comparison="genotype|existence", + vartype="snvs|indels", diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 6120c24..38633ee 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -78,38 +78,46 @@ checkpoint stat_truth: "../scripts/stat-truth.py" +rule generate_sdf: + input: + genome="resources/reference/genome.fasta", + genome_index="resources/reference/genome.fasta.fai", + output: + directory("resources/reference/genome-sdf"), + log: + "logs/rtg-tools/sdf.log", + conda: + "../envs/rtg-tools.yaml" + shell: + "rtg format --output {output} {input.genome} &> {log}" + + rule benchmark_variants: input: truth=get_stratified_truth(), - truth_idx=get_stratified_truth(".csi"), + truth_idx=get_stratified_truth(".tbi"), query="results/stratified-variants/{callset}/{cov}.vcf.gz", - genome="resources/reference/genome.fasta", - genome_index="resources/reference/genome.fasta.fai", + query_index="results/stratified-variants/{callset}/{cov}.vcf.gz.tbi", + genome="resources/reference/genome-sdf", output: - multiext( - "results/happy/{callset}/{cov}/report", - ".runinfo.json", - ".vcf.gz", - ".summary.csv", - ".extended.csv", - ".metrics.json.gz", - ".roc.all.csv.gz", - ".roc.Locations.INDEL.csv.gz", - ".roc.Locations.INDEL.PASS.csv.gz", - ".roc.Locations.SNP.csv.gz", - ), - params: - prefix=get_happy_prefix, - engine="vcfeval", + "results/vcfeval/{callset}/{cov}/output.vcf.gz", log: - "logs/happy/{callset}/{cov}.log", - wrapper: - "v1.7.2/bio/hap.py/hap.py" + "logs/vcfeval/{callset}/{cov}.log", + params: + output=lambda w, output: os.path.dirname(output[0]), + conda: + "../envs/rtg-tools.yaml" + threads: 32 + shell: + "rm -r {params.output}; rtg vcfeval --threads {threads} --ref-overlap --all-records " + "--output-mode ga4gh --baseline {input.truth} --calls {input.query} " + "--output {params.output} --template {input.genome} &> {log}" rule calc_precision_recall: input: - calls="results/happy/{callset}/{cov}/report.vcf.gz", + calls="results/vcfeval/{callset}/{cov}/output.vcf.gz", + idx="results/vcfeval/{callset}/{cov}/output.vcf.gz.tbi", common_src=common_src, output: snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv", @@ -186,7 +194,7 @@ rule report_precision_recall: rule extract_fp_fn: input: - calls="results/happy/{callset}/{cov}/report.vcf.gz", + calls="results/vcfeval/{callset}/{cov}/output.vcf.gz", common_src=common_src, output: "results/fp-fn/callsets/{cov}/{callset}/{classification}.tsv", diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index ec384e4..83d6779 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -14,8 +14,6 @@ rule index_vcf_variants: "{prefix}.vcf.gz", output: "{prefix}.vcf.gz.tbi", - params: - extra="--tbi", log: "logs/index-vcf-variants/{prefix}.log", wrapper: diff --git a/workflow/scripts/calc-precision-recall.py b/workflow/scripts/calc-precision-recall.py index 4f9a8d8..3db4071 100644 --- a/workflow/scripts/calc-precision-recall.py +++ b/workflow/scripts/calc-precision-recall.py @@ -9,7 +9,7 @@ import pandas as pd import pysam -from common.happy_report import CompareExactGenotype, CompareExistence, Class +from common.classification import CompareExactGenotype, CompareExistence, Class class Classifications: @@ -90,6 +90,7 @@ def collect_results(vartype): ] -vartype = "SNP" if snakemake.wildcards.vartype == "snvs" else "INDEL" +assert snakemake.wildcards.vartype in ["snvs", "indels"] +vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL" collect_results(vartype).to_csv(snakemake.output[0], sep="\t", index=False) diff --git a/workflow/scripts/common/happy_report.py b/workflow/scripts/common/classification.py similarity index 92% rename from workflow/scripts/common/happy_report.py rename to workflow/scripts/common/classification.py index 55707cb..540c863 100644 --- a/workflow/scripts/common/happy_report.py +++ b/workflow/scripts/common/classification.py @@ -52,7 +52,20 @@ def get_decision(record, sample): def get_vartypes(record): - return record.samples[0]["BVT"], record.samples[1]["BVT"] + ref_allele = record.alleles[0] + alts_truth = alt_alleles(record, 0) + alts_query = alt_alleles(record, 1) + is_snv = lambda alt: len(alt) == 1 and len(ref_allele) == 1 + + def vartype(alts): + _is_snv = set(map(is_snv, alts)) + if not _is_snv: + return None + # we expect different types + # assert len(_is_snv) == 1, f"{record.pos}, {_is_snv}" + return "SNV" if all(_is_snv) else "INDEL" + + return vartype(alts_truth), vartype(alts_query) def alt_alleles(record, sample): diff --git a/workflow/scripts/extract-fp-fn.py b/workflow/scripts/extract-fp-fn.py index 802149f..db902a9 100644 --- a/workflow/scripts/extract-fp-fn.py +++ b/workflow/scripts/extract-fp-fn.py @@ -7,7 +7,7 @@ import csv import pysam -from common.happy_report import CompareExistence, Class, is_het +from common.classification import CompareExistence, Class, is_het cmp = CompareExistence() varfile = pysam.VariantFile(snakemake.input.calls)