diff --git a/changes/2799.bugfix.rst b/changes/2799.bugfix.rst new file mode 100644 index 0000000000..f22b7074bb --- /dev/null +++ b/changes/2799.bugfix.rst @@ -0,0 +1 @@ +Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types \ No newline at end of file diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 583ca01c5e..a35c5ca210 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] +def fill_value_or_default(chunk_spec: ArraySpec) -> Any: + fill_value = chunk_spec.fill_value + if fill_value is None: + # Zarr V2 allowed `fill_value` to be null in the metadata. + # Zarr V3 requires it to be set. This has already been + # validated when decoding the metadata, but we support reading + # Zarr V2 data and need to support the case where fill_value + # is None. + return _default_fill_value(dtype=chunk_spec.dtype) + else: + return fill_value + + @dataclass(frozen=True) class BatchedCodecPipeline(CodecPipeline): """Default codec pipeline. @@ -247,17 +260,7 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - fill_value = chunk_spec.fill_value - - if fill_value is None: - # Zarr V2 allowed `fill_value` to be null in the metadata. - # Zarr V3 requires it to be set. This has already been - # validated when decoding the metadata, but we support reading - # Zarr V2 data and need to support the case where fill_value - # is None. - fill_value = _default_fill_value(dtype=chunk_spec.dtype) - - out[out_selection] = fill_value + out[out_selection] = fill_value_or_default(chunk_spec) else: chunk_bytes_batch = await concurrent_map( [ @@ -284,10 +287,7 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - fill_value = chunk_spec.fill_value - if fill_value is None: - fill_value = _default_fill_value(dtype=chunk_spec.dtype) - out[out_selection] = fill_value + out[out_selection] = fill_value_or_default(chunk_spec) def _merge_chunk_array( self, @@ -305,7 +305,7 @@ def _merge_chunk_array( shape=chunk_spec.shape, dtype=chunk_spec.dtype, order=chunk_spec.order, - fill_value=chunk_spec.fill_value, + fill_value=fill_value_or_default(chunk_spec), ) else: chunk_array = existing_chunk_array.copy() # make a writable copy @@ -394,7 +394,7 @@ async def _read_key( chunk_array_batch.append(None) # type: ignore[unreachable] else: if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( - chunk_spec.fill_value + fill_value_or_default(chunk_spec) ): chunk_array_batch.append(None) else: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 192db5b203..25697c4545 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return b"" elif dtype.kind in "UO": return "" + elif dtype.kind in "Mm": + return dtype.type("nat") + elif dtype.kind == "V": + if dtype.fields is not None: + default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()]) + return np.array([default], dtype=dtype) + else: + return np.zeros(1, dtype=dtype) else: return dtype.type(0) diff --git a/tests/test_v2.py b/tests/test_v2.py index 4c689c8e64..0a4487cfcc 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -313,3 +313,22 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: za[...] = a za = zarr.open_array(store=array_path) assert (a == za[:]).all() + + +@pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"]) +def test_other_dtype_roundtrip(fill_value, tmp_path) -> None: + a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7") + array_path = tmp_path / "data.zarr" + za = zarr.create( + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all()