Skip to content

Commit

Permalink
Add chunk size options to mkschema
Browse files Browse the repository at this point in the history
Closes #294
  • Loading branch information
jeromekelleher committed Dec 4, 2024
1 parent b1d7ef2 commit 1bfed3a
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Breaking changes
- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
and numpy >= 2. Existing ICFs will need to be recreated.

- Add chunk size options to mkschema (issue:294)

# 0.1.1 2024-06-19

Maintenance release:
Expand Down
11 changes: 9 additions & 2 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,12 +338,19 @@ def inspect(path, verbose):

@click.command
@icf_path
def mkschema(icf_path):
@variants_chunk_size
@samples_chunk_size
def mkschema(icf_path, variants_chunk_size, samples_chunk_size):
"""
Generate a schema for zarr encoding
"""
stream = click.get_text_stream("stdout")
vcf2zarr.mkschema(icf_path, stream)
vcf2zarr.mkschema(
icf_path,
stream,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
)


@click.command
Expand Down
8 changes: 6 additions & 2 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,9 +1027,13 @@ def encode_all_partitions(
pwm.submit(self.encode_partition, partition_index)


def mkschema(if_path, out):
def mkschema(if_path, out, *, variants_chunk_size=None, samples_chunk_size=None):
store = icf.IntermediateColumnarFormat(if_path)
spec = VcfZarrSchema.generate(store)
spec = VcfZarrSchema.generate(
store,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
)
out.write(spec.asjson())


Expand Down
23 changes: 22 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,9 @@ def test_inspect(self, mocked, tmp_path):
def test_mkschema(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
result = runner.invoke(
cli.vcf2zarr_main, f"mkschema {tmp_path}", catch_exceptions=False
cli.vcf2zarr_main,
f"mkschema {tmp_path} --variants-chunk-size=3 " "--samples-chunk-size=4",
catch_exceptions=False,
)
assert result.exit_code == 0
assert len(result.stdout) == 0
Expand Down Expand Up @@ -705,6 +707,25 @@ def test_explode(self, tmp_path):
# Arbitrary check
assert "CHROM" in result.stdout

def test_mkschema(self, tmp_path):
icf_path = tmp_path / "icf"
runner = ct.CliRunner(mix_stderr=False)
result = runner.invoke(
cli.vcf2zarr_main,
f"explode {self.vcf_path} {icf_path}",
catch_exceptions=False,
)
assert result.exit_code == 0
result = runner.invoke(
cli.vcf2zarr_main,
f"mkschema {icf_path} --variants-chunk-size=3 " "--samples-chunk-size=2",
catch_exceptions=False,
)
assert result.exit_code == 0
d = json.loads(result.stdout)
assert d["samples_chunk_size"] == 2
assert d["variants_chunk_size"] == 3

def test_encode(self, tmp_path):
icf_path = tmp_path / "icf"
zarr_path = tmp_path / "zarr"
Expand Down
29 changes: 29 additions & 0 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,35 @@ def test_bad_value(self, tmp_path, icf_path, dimension_separator):
)


class TestSchemaChunkSize:
@pytest.mark.parametrize(
("samples_chunk_size", "variants_chunk_size"),
[
(1, 2),
(2, 1),
(3, 5),
],
)
def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size):
icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
schema = vcf2zarr.VcfZarrSchema.generate(
icf,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
)
assert schema.samples_chunk_size == samples_chunk_size
assert schema.variants_chunk_size == variants_chunk_size
found = 0
for field in schema.fields:
assert field.dimensions[0] == "variants"
assert field.chunks[0] == variants_chunk_size
if "samples" in field.dimensions:
dim = field.dimensions.index("samples")
assert field.chunks[dim] == samples_chunk_size
found += 1
assert found > 0


class TestSchemaJsonRoundTrip:
def assert_json_round_trip(self, schema):
schema2 = vcf2zarr.VcfZarrSchema.fromjson(schema.asjson())
Expand Down

0 comments on commit 1bfed3a

Please sign in to comment.