diff --git a/CHANGELOG.md b/CHANGELOG.md index ecabe9b..947e8f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Breaking changes - ICF metadata format version bumped to ensure long-term compatility between numpy 1.26.x and numpy >= 2. Existing ICFs will need to be recreated. +- Add chunksize options to mkschema (issue:294) + # 0.1.1 2024-06-19 Maintenance release: diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 0722521..e37bd76 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -338,12 +338,19 @@ def inspect(path, verbose): @click.command @icf_path -def mkschema(icf_path): +@variants_chunk_size +@samples_chunk_size +def mkschema(icf_path, variants_chunk_size, samples_chunk_size): """ Generate a schema for zarr encoding """ stream = click.get_text_stream("stdout") - vcf2zarr.mkschema(icf_path, stream) + vcf2zarr.mkschema( + icf_path, + stream, + variants_chunk_size=variants_chunk_size, + samples_chunk_size=samples_chunk_size, + ) @click.command diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index b4b42bf..3094b23 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -1027,9 +1027,13 @@ def encode_all_partitions( pwm.submit(self.encode_partition, partition_index) -def mkschema(if_path, out): +def mkschema(if_path, out, *, variants_chunk_size=None, samples_chunk_size=None): store = icf.IntermediateColumnarFormat(if_path) - spec = VcfZarrSchema.generate(store) + spec = VcfZarrSchema.generate( + store, + variants_chunk_size=variants_chunk_size, + samples_chunk_size=samples_chunk_size, + ) out.write(spec.asjson()) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4508637..2ede6ee 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -461,7 +461,9 @@ def test_inspect(self, mocked, tmp_path): def test_mkschema(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( - cli.vcf2zarr_main, f"mkschema {tmp_path}", catch_exceptions=False + cli.vcf2zarr_main, + f"mkschema {tmp_path} --variants-chunk-size=3 " "--samples-chunk-size=4", + catch_exceptions=False, ) assert result.exit_code == 0 assert len(result.stdout) == 0 @@ -705,6 +707,25 @@ def test_explode(self, tmp_path): # Arbitrary check assert "CHROM" in result.stdout + def test_mkschema(self, tmp_path): + icf_path = tmp_path / "icf" + runner = ct.CliRunner(mix_stderr=False) + result = runner.invoke( + cli.vcf2zarr_main, + f"explode {self.vcf_path} {icf_path}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + result = runner.invoke( + cli.vcf2zarr_main, + f"mkschema {icf_path} --variants-chunk-size=3 " "--samples-chunk-size=2", + catch_exceptions=False, + ) + assert result.exit_code == 0 + d = json.loads(result.stdout) + assert d["samples_chunk_size"] == 2 + assert d["variants_chunk_size"] == 3 + def test_encode(self, tmp_path): icf_path = tmp_path / "icf" zarr_path = tmp_path / "zarr" diff --git a/tests/test_vcz.py b/tests/test_vcz.py index f4b1843..6787461 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -136,6 +136,35 @@ def test_bad_value(self, tmp_path, icf_path, dimension_separator): ) +class TestSchemaChunkSize: + @pytest.mark.parametrize( + ("samples_chunk_size", "variants_chunk_size"), + [ + (1, 2), + (2, 1), + (3, 5), + ], + ) + def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size): + icf = vcf2zarr.IntermediateColumnarFormat(icf_path) + schema = vcf2zarr.VcfZarrSchema.generate( + icf, + variants_chunk_size=variants_chunk_size, + samples_chunk_size=samples_chunk_size, + ) + assert schema.samples_chunk_size == samples_chunk_size + assert schema.variants_chunk_size == variants_chunk_size + found = 0 + for field in schema.fields: + assert field.dimensions[0] == "variants" + assert field.chunks[0] == variants_chunk_size + if "samples" in field.dimensions: + dim = field.dimensions.index("samples") + assert field.chunks[dim] == samples_chunk_size + found += 1 + assert found > 0 + + class TestSchemaJsonRoundTrip: def assert_json_round_trip(self, schema): schema2 = vcf2zarr.VcfZarrSchema.fromjson(schema.asjson())