Skip to content

Commit

Permalink
feat: replace hap.py with direct use of vcfeval (#30)
Browse files Browse the repository at this point in the history
* feat: replace happy with vcfeval

* fmt

* fmt

* rename script

* fix import path
  • Loading branch information
johanneskoester authored Jan 10, 2023
1 parent 12dd0cc commit 75113fa
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 30 deletions.
6 changes: 6 additions & 0 deletions workflow/envs/rtg-tools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- rtg-tools =3.12.1
4 changes: 3 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ coverages = {

common_src = [
workflow.source_path("../scripts/common/__init__.py"),
workflow.source_path("../scripts/common/happy_report.py"),
workflow.source_path("../scripts/common/classification.py"),
]


Expand Down Expand Up @@ -396,3 +396,5 @@ if "variant-calls" in config:
wildcard_constraints:
callset="|".join(config["variant-calls"]),
classification="|".join(["fp", "fn"]),
comparison="genotype|existence",
vartype="snvs|indels",
54 changes: 31 additions & 23 deletions workflow/rules/eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -78,38 +78,46 @@ checkpoint stat_truth:
"../scripts/stat-truth.py"


rule generate_sdf:
input:
genome="resources/reference/genome.fasta",
genome_index="resources/reference/genome.fasta.fai",
output:
directory("resources/reference/genome-sdf"),
log:
"logs/rtg-tools/sdf.log",
conda:
"../envs/rtg-tools.yaml"
shell:
"rtg format --output {output} {input.genome} &> {log}"


rule benchmark_variants:
input:
truth=get_stratified_truth(),
truth_idx=get_stratified_truth(".csi"),
truth_idx=get_stratified_truth(".tbi"),
query="results/stratified-variants/{callset}/{cov}.vcf.gz",
genome="resources/reference/genome.fasta",
genome_index="resources/reference/genome.fasta.fai",
query_index="results/stratified-variants/{callset}/{cov}.vcf.gz.tbi",
genome="resources/reference/genome-sdf",
output:
multiext(
"results/happy/{callset}/{cov}/report",
".runinfo.json",
".vcf.gz",
".summary.csv",
".extended.csv",
".metrics.json.gz",
".roc.all.csv.gz",
".roc.Locations.INDEL.csv.gz",
".roc.Locations.INDEL.PASS.csv.gz",
".roc.Locations.SNP.csv.gz",
),
params:
prefix=get_happy_prefix,
engine="vcfeval",
"results/vcfeval/{callset}/{cov}/output.vcf.gz",
log:
"logs/happy/{callset}/{cov}.log",
wrapper:
"v1.7.2/bio/hap.py/hap.py"
"logs/vcfeval/{callset}/{cov}.log",
params:
output=lambda w, output: os.path.dirname(output[0]),
conda:
"../envs/rtg-tools.yaml"
threads: 32
shell:
"rm -r {params.output}; rtg vcfeval --threads {threads} --ref-overlap --all-records "
"--output-mode ga4gh --baseline {input.truth} --calls {input.query} "
"--output {params.output} --template {input.genome} &> {log}"


rule calc_precision_recall:
input:
calls="results/happy/{callset}/{cov}/report.vcf.gz",
calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
idx="results/vcfeval/{callset}/{cov}/output.vcf.gz.tbi",
common_src=common_src,
output:
snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv",
Expand Down Expand Up @@ -186,7 +194,7 @@ rule report_precision_recall:

rule extract_fp_fn:
input:
calls="results/happy/{callset}/{cov}/report.vcf.gz",
calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
common_src=common_src,
output:
"results/fp-fn/callsets/{cov}/{callset}/{classification}.tsv",
Expand Down
2 changes: 0 additions & 2 deletions workflow/rules/utils.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ rule index_vcf_variants:
"{prefix}.vcf.gz",
output:
"{prefix}.vcf.gz.tbi",
params:
extra="--tbi",
log:
"logs/index-vcf-variants/{prefix}.log",
wrapper:
Expand Down
5 changes: 3 additions & 2 deletions workflow/scripts/calc-precision-recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import pysam

from common.happy_report import CompareExactGenotype, CompareExistence, Class
from common.classification import CompareExactGenotype, CompareExistence, Class


class Classifications:
Expand Down Expand Up @@ -90,6 +90,7 @@ def collect_results(vartype):
]


vartype = "SNP" if snakemake.wildcards.vartype == "snvs" else "INDEL"
assert snakemake.wildcards.vartype in ["snvs", "indels"]
vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL"

collect_results(vartype).to_csv(snakemake.output[0], sep="\t", index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,20 @@ def get_decision(record, sample):


def get_vartypes(record):
return record.samples[0]["BVT"], record.samples[1]["BVT"]
ref_allele = record.alleles[0]
alts_truth = alt_alleles(record, 0)
alts_query = alt_alleles(record, 1)
is_snv = lambda alt: len(alt) == 1 and len(ref_allele) == 1

def vartype(alts):
_is_snv = set(map(is_snv, alts))
if not _is_snv:
return None
# we expect different types
# assert len(_is_snv) == 1, f"{record.pos}, {_is_snv}"
return "SNV" if all(_is_snv) else "INDEL"

return vartype(alts_truth), vartype(alts_query)


def alt_alleles(record, sample):
Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/extract-fp-fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import csv
import pysam

from common.happy_report import CompareExistence, Class, is_het
from common.classification import CompareExistence, Class, is_het

cmp = CompareExistence()
varfile = pysam.VariantFile(snakemake.input.calls)
Expand Down

0 comments on commit 75113fa

Please sign in to comment.