Skip to content

Commit

Permalink
fix: implicit fill value initialisation (zarr-developers#2799)
Browse files Browse the repository at this point in the history
* fix: implicit fill value initialisation

- initialise empty chunks to the default fill value during writing
- add default fill value for datetime, timedelta, structured data types

* fmt

* add "other" dtype test

* changelog

---------

Co-authored-by: Davis Bennett <davis.v.bennett@gmail.com>
  • Loading branch information
LDeakin and d-v-b authored Feb 12, 2025
1 parent 2f8b88a commit c66f32b
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 17 deletions.
1 change: 1 addition & 0 deletions changes/2799.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types
34 changes: 17 additions & 17 deletions src/zarr/core/codec_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]


def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
fill_value = chunk_spec.fill_value
if fill_value is None:
# Zarr V2 allowed `fill_value` to be null in the metadata.
# Zarr V3 requires it to be set. This has already been
# validated when decoding the metadata, but we support reading
# Zarr V2 data and need to support the case where fill_value
# is None.
return _default_fill_value(dtype=chunk_spec.dtype)
else:
return fill_value


@dataclass(frozen=True)
class BatchedCodecPipeline(CodecPipeline):
"""Default codec pipeline.
Expand Down Expand Up @@ -247,17 +260,7 @@ async def read_batch(
if chunk_array is not None:
out[out_selection] = chunk_array
else:
fill_value = chunk_spec.fill_value

if fill_value is None:
# Zarr V2 allowed `fill_value` to be null in the metadata.
# Zarr V3 requires it to be set. This has already been
# validated when decoding the metadata, but we support reading
# Zarr V2 data and need to support the case where fill_value
# is None.
fill_value = _default_fill_value(dtype=chunk_spec.dtype)

out[out_selection] = fill_value
out[out_selection] = fill_value_or_default(chunk_spec)
else:
chunk_bytes_batch = await concurrent_map(
[
Expand All @@ -284,10 +287,7 @@ async def read_batch(
tmp = tmp.squeeze(axis=drop_axes)
out[out_selection] = tmp
else:
fill_value = chunk_spec.fill_value
if fill_value is None:
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
out[out_selection] = fill_value
out[out_selection] = fill_value_or_default(chunk_spec)

def _merge_chunk_array(
self,
Expand All @@ -305,7 +305,7 @@ def _merge_chunk_array(
shape=chunk_spec.shape,
dtype=chunk_spec.dtype,
order=chunk_spec.order,
fill_value=chunk_spec.fill_value,
fill_value=fill_value_or_default(chunk_spec),
)
else:
chunk_array = existing_chunk_array.copy() # make a writable copy
Expand Down Expand Up @@ -394,7 +394,7 @@ async def _read_key(
chunk_array_batch.append(None) # type: ignore[unreachable]
else:
if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
chunk_spec.fill_value
fill_value_or_default(chunk_spec)
):
chunk_array_batch.append(None)
else:
Expand Down
8 changes: 8 additions & 0 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return b""
elif dtype.kind in "UO":
return ""
elif dtype.kind in "Mm":
return dtype.type("nat")
elif dtype.kind == "V":
if dtype.fields is not None:
default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()])
return np.array([default], dtype=dtype)
else:
return np.zeros(1, dtype=dtype)
else:
return dtype.type(0)

Expand Down
19 changes: 19 additions & 0 deletions tests/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,22 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
za[...] = a
za = zarr.open_array(store=array_path)
assert (a == za[:]).all()


@pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"])
def test_other_dtype_roundtrip(fill_value, tmp_path) -> None:
a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7")
array_path = tmp_path / "data.zarr"
za = zarr.create(
shape=(3,),
store=array_path,
chunks=(2,),
fill_value=fill_value,
zarr_format=2,
dtype=a.dtype,
)
if fill_value is not None:
assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all()
za[...] = a
za = zarr.open_array(store=array_path)
assert (a == za[:]).all()

0 comments on commit c66f32b

Please sign in to comment.