Skip to content

Commit

Permalink
Add description from VCF header to arrays
Browse files Browse the repository at this point in the history
Closes #125
  • Loading branch information
jeromekelleher committed Apr 18, 2024
1 parent 9f7492b commit d655bcf
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 2 deletions.
9 changes: 7 additions & 2 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1544,8 +1544,13 @@ def init_array(self, variable):
object_codec=object_codec,
dimension_separator=self.dimension_separator,
)
# Dimension names are part of the spec in Zarr v3
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
a.attrs.update(
{
"description": variable.description,
# Dimension names are part of the spec in Zarr v3
"_ARRAY_DIMENSIONS": variable.dimensions,
}
)

def get_array(self, name):
    """Return the entry of ``self.root`` stored under ``"wip_" + name``."""
    wip_key = "wip_" + name
    return self.root[wip_key]
Expand Down
36 changes: 36 additions & 0 deletions tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,39 @@ def test_check_overlap(regions):
]
with pytest.raises(ValueError, match="Multiple VCFs have the region"):
vcf.check_overlap(partitions)


class TestVcfDescriptions:
    """Check the per-field ``description`` entries exposed by ``schema``."""

    @pytest.mark.parametrize(
        ("field", "description"),
        [
            ("variant_NS", "Number of Samples With Data"),
            ("variant_AN", "Total number of alleles in called genotypes"),
            (
                "variant_AC",
                "Allele count in genotypes, for each ALT allele, "
                "in the same order as listed",
            ),
            ("variant_DP", "Total Depth"),
            ("variant_AF", "Allele Frequency"),
            ("variant_AA", "Ancestral Allele"),
            ("variant_DB", "dbSNP membership, build 129"),
            ("variant_H2", "HapMap2 membership"),
            ("call_GQ", "Genotype Quality"),
            ("call_DP", "Read Depth"),
            ("call_HQ", "Haplotype Quality"),
        ],
    )
    def test_fields(self, schema, field, description):
        """The ``columns`` entry for ``field`` carries ``description``."""
        column = schema["columns"][field]
        assert column["description"] == description

    # Filter descriptions are not in the schema yet, see
    # https://github.com/sgkit-dev/bio2zarr/issues/123
    # Sketch of the test to enable once they are:
    #
    # @pytest.mark.parametrize(
    #     ("filt", "description"),
    #     [
    #         ("s50", "Less than 50% of samples have data"),
    #         ("q10", "Quality below 10"),
    #     ],
    # )
    # def test_filters(self, schema, filt, description):
    #     assert schema["filters"][filt]["description"] == description
23 changes: 23 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,29 @@ def test_vcf_dimensions(self, ds):
assert ds.variant_H2.dims == ("variants",)
assert ds.variant_position.dims == ("variants",)

@pytest.mark.parametrize(
    ("field", "description"),
    [
        ("variant_NS", "Number of Samples With Data"),
        ("variant_AN", "Total number of alleles in called genotypes"),
        (
            "variant_AC",
            "Allele count in genotypes, for each ALT allele, "
            "in the same order as listed",
        ),
        ("variant_DP", "Total Depth"),
        ("variant_AF", "Allele Frequency"),
        ("variant_AA", "Ancestral Allele"),
        ("variant_DB", "dbSNP membership, build 129"),
        ("variant_H2", "HapMap2 membership"),
        ("call_GQ", "Genotype Quality"),
        ("call_DP", "Read Depth"),
        ("call_HQ", "Haplotype Quality"),
    ],
)
def test_vcf_field_description(self, ds, field, description):
    """The ``description`` attribute of ``ds[field]`` equals ``description``."""
    attrs = ds[field].attrs
    assert attrs["description"] == description


class Test1000G2020Example:
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"
Expand Down

0 comments on commit d655bcf

Please sign in to comment.