Skip to content

Commit

Permalink
Merge pull request #129 from jeromekelleher/noshuffle-default
Browse files Browse the repository at this point in the history
Some refactoring of schema generation
  • Loading branch information
jeromekelleher authored Apr 19, 2024
2 parents 98478fe + 2e5189f commit dfb3ed4
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: '3.11'
- uses: pre-commit/action@v3
- uses: pre-commit/action@v3.0.1
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 0.0.6 2024-04-xx

- Only use NOSHUFFLE by default on ``call_genotype`` and bool arrays.

# 0.0.5 2024-04-17

- Fix bug in schema handling (compressor settings ignored)
Expand Down
10 changes: 10 additions & 0 deletions bio2zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@
numcodecs.blosc.use_threads = False


def min_int_dtype(min_value, max_value):
if min_value > max_value:
raise ValueError("min_value must be <= max_value")
for a_dtype in ["i1", "i2", "i4", "i8"]:
info = np.iinfo(a_dtype)
if info.min <= min_value and max_value <= info.max:
return a_dtype
raise OverflowError("Integer cannot be represented")


def chunk_aligned_slices(z, n, max_chunks=None):
"""
Returns at n slices in the specified zarr array, aligned
Expand Down
162 changes: 95 additions & 67 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,6 @@ def full_name(self):
return self.name
return f"{self.category}/{self.name}"

# TODO add method here to choose a good set compressor and
# filters default here for this field.

def smallest_dtype(self):
"""
Returns the smallest dtype suitable for this field based
Expand All @@ -123,13 +120,13 @@ def smallest_dtype(self):
if self.vcf_type == "Float":
ret = "f4"
elif self.vcf_type == "Integer":
dtype = "i4"
for a_dtype in ["i1", "i2"]:
info = np.iinfo(a_dtype)
if info.min <= s.min_value and s.max_value <= info.max:
dtype = a_dtype
break
ret = dtype
if not math.isfinite(s.max_value):
# All missing values; use i1. Note we should have some API to
# check more explicitly for missingness:
# https://github.com/sgkit-dev/bio2zarr/issues/131
ret = "i1"
else:
ret = core.min_int_dtype(s.min_value, s.max_value)
elif self.vcf_type == "Flag":
ret = "bool"
elif self.vcf_type == "Character":
Expand All @@ -152,6 +149,10 @@ class VcfPartition:
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
)

# TODO refactor this to have embedded Contig dataclass, Filters
# and Samples dataclasses to allow for more information to be
# retained and forward compatibility.


@dataclasses.dataclass
class IcfMetadata:
Expand Down Expand Up @@ -183,6 +184,14 @@ def format_fields(self):
fields.append(field)
return fields

@property
def num_contigs(self):
return len(self.contig_names)

@property
def num_filters(self):
return len(self.filters)

@property
def num_records(self):
return sum(self.contig_record_counts.values())
Expand Down Expand Up @@ -1242,6 +1251,50 @@ def new(**kwargs):
spec._choose_compressor_settings()
return spec

@staticmethod
def from_field(
vcf_field,
*,
num_variants,
num_samples,
variants_chunk_size,
samples_chunk_size,
variable_name=None,
):
shape = [num_variants]
prefix = "variant_"
dimensions = ["variants"]
chunks = [variants_chunk_size]
if vcf_field.category == "FORMAT":
prefix = "call_"
shape.append(num_samples)
chunks.append(samples_chunk_size)
dimensions.append("samples")
if variable_name is None:
variable_name = prefix + vcf_field.name
# TODO make an option to add in the empty extra dimension
if vcf_field.summary.max_number > 1:
shape.append(vcf_field.summary.max_number)
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if vcf_field.vcf_number == "R":
dimensions.append("alleles")
elif vcf_field.vcf_number == "A":
dimensions.append("alt_alleles")
elif vcf_field.vcf_number == "G":
dimensions.append("genotypes")
else:
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
return ZarrColumnSpec.new(
vcf_field=vcf_field.full_name,
name=variable_name,
dtype=vcf_field.smallest_dtype(),
shape=shape,
chunks=chunks,
dimensions=dimensions,
description=vcf_field.description,
)

def _choose_compressor_settings(self):
"""
Choose compressor and filter settings based on the size and
Expand All @@ -1250,15 +1303,19 @@ def _choose_compressor_settings(self):
See https://github.com/pystatgen/bio2zarr/discussions/74
"""
dt = np.dtype(self.dtype)
# Default is to not shuffle, because autoshuffle isn't recognised
# by many Zarr implementations, and shuffling can lead to worse
# performance in some cases anyway. Turning on shuffle should be a
# deliberate choice.
shuffle = numcodecs.Blosc.NOSHUFFLE
if dt.itemsize == 1:
# Any 1 byte field gets BITSHUFFLE by default
if self.name == "call_genotype" and self.dtype == "i1":
# call_genotype gets BITSHUFFLE by default as it gets
# significantly better compression (at a cost of slower
# decoding)
shuffle = numcodecs.Blosc.BITSHUFFLE
elif self.dtype == "bool":
shuffle = numcodecs.Blosc.BITSHUFFLE

self.compressor["shuffle"] = shuffle


Expand Down Expand Up @@ -1313,6 +1370,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
)

def spec_from_field(field, variable_name=None):
return ZarrColumnSpec.from_field(
field,
num_samples=n,
num_variants=m,
samples_chunk_size=samples_chunk_size,
variants_chunk_size=variants_chunk_size,
variable_name=variable_name,
)

def fixed_field_spec(
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
):
Expand All @@ -1328,95 +1395,56 @@ def fixed_field_spec(

alt_col = icf.columns["ALT"]
max_alleles = alt_col.vcf_field.summary.max_number + 1
num_filters = len(icf.metadata.filters)

# # FIXME get dtype from lookup table
colspecs = [
fixed_field_spec(
name="variant_contig",
dtype="i2", # FIXME
dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
),
fixed_field_spec(
name="variant_filter",
dtype="bool",
shape=(m, num_filters),
shape=(m, icf.metadata.num_filters),
dimensions=["variants", "filters"],
),
fixed_field_spec(
name="variant_allele",
dtype="str",
shape=[m, max_alleles],
shape=(m, max_alleles),
dimensions=["variants", "alleles"],
),
fixed_field_spec(
vcf_field="POS",
name="variant_position",
dtype="i4",
),
fixed_field_spec(
vcf_field=None,
name="variant_id",
dtype="str",
),
fixed_field_spec(
vcf_field=None,
name="variant_id_mask",
dtype="bool",
),
fixed_field_spec(
vcf_field="QUAL",
name="variant_quality",
dtype="f4",
),
]
name_map = {field.full_name: field for field in icf.metadata.fields}

# Only two of the fixed fields have a direct one-to-one mapping.
colspecs.extend(
[
spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
spec_from_field(name_map["POS"], variable_name="variant_position"),
]
)
colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])

gt_field = None
for field in icf.metadata.fields:
if field.category == "fixed":
continue
for field in icf.metadata.format_fields:
if field.name == "GT":
gt_field = field
continue
shape = [m]
prefix = "variant_"
dimensions = ["variants"]
chunks = [variants_chunk_size]
if field.category == "FORMAT":
prefix = "call_"
shape.append(n)
chunks.append(samples_chunk_size)
dimensions.append("samples")
# TODO make an option to add in the empty extra dimension
if field.summary.max_number > 1:
shape.append(field.summary.max_number)
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if field.vcf_number == "R":
dimensions.append("alleles")
elif field.vcf_number == "A":
dimensions.append("alt_alleles")
elif field.vcf_number == "G":
dimensions.append("genotypes")
else:
dimensions.append(f"{field.category}_{field.name}_dim")
variable_name = prefix + field.name
colspec = ZarrColumnSpec.new(
vcf_field=field.full_name,
name=variable_name,
dtype=field.smallest_dtype(),
shape=shape,
chunks=chunks,
dimensions=dimensions,
description=field.description,
)
colspecs.append(colspec)
colspecs.append(spec_from_field(field))

if gt_field is not None:
ploidy = gt_field.summary.max_number - 1
shape = [m, n]
chunks = [variants_chunk_size, samples_chunk_size]
dimensions = ["variants", "samples"]

colspecs.append(
ZarrColumnSpec.new(
vcf_field=None,
Expand Down
49 changes: 49 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,55 @@
from bio2zarr import core


class TestMinIntDtype:
@pytest.mark.parametrize(
("min_value", "max_value", "dtype"),
[
(0, 1, "i1"),
(0, 0, "i1"),
(0, 127, "i1"),
(127, 128, "i2"),
(-127, 0, "i1"),
(-127, -126, "i1"),
(0, 2**15 - 1, "i2"),
(-(2**15), 2**15 - 1, "i2"),
(0, 2**15, "i4"),
(-(2**15), 2**15, "i4"),
(0, 2**31 - 1, "i4"),
(-(2**31), 2**31 - 1, "i4"),
(2**31 - 1, 2**31 - 1, "i4"),
(0, 2**31, "i8"),
(0, 2**32, "i8"),
],
)
def test_values(self, min_value, max_value, dtype):
assert core.min_int_dtype(min_value, max_value) == dtype

@pytest.mark.parametrize(
("min_value", "max_value"),
[
(0, 2**63),
(-(2**63) - 1, 0),
(0, 2**65),
],
)
def test_overflow(self, min_value, max_value):
with pytest.raises(OverflowError, match="Integer cannot"):
core.min_int_dtype(min_value, max_value)

@pytest.mark.parametrize(
("min_value", "max_value"),
[
(1, 0),
(-1, -2),
(2**31, 2**31 - 1),
],
)
def test_bad_min_max(self, min_value, max_value):
with pytest.raises(ValueError, match="must be <="):
core.min_int_dtype(min_value, max_value)


class TestParallelWorkManager:
@pytest.mark.parametrize("total", [1, 10, 2**63])
@pytest.mark.parametrize("workers", [0, 1])
Expand Down
21 changes: 20 additions & 1 deletion tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def test_filter_id(self, schema):
def test_variant_contig(self, schema):
assert schema["columns"]["variant_contig"] == {
"name": "variant_contig",
"dtype": "i2",
"dtype": "i1",
"shape": [9],
"chunks": [10000],
"dimensions": ["variants"],
Expand Down Expand Up @@ -298,6 +298,25 @@ def test_call_genotype_phased(self, schema):
"filters": [],
}

def test_call_GQ(self, schema):
assert schema["columns"]["call_GQ"] == {
"name": "call_GQ",
"dtype": "i1",
"shape": [9, 3],
"chunks": [10000, 1000],
"dimensions": ["variants", "samples"],
"description": "Genotype Quality",
"vcf_field": "FORMAT/GQ",
"compressor": {
"id": "blosc",
"cname": "zstd",
"clevel": 7,
"shuffle": 0,
"blocksize": 0,
},
"filters": [],
}


@pytest.mark.parametrize(
"regions",
Expand Down

0 comments on commit dfb3ed4

Please sign in to comment.