Skip to content

Commit

Permalink
[v3] h5py compat methods on Group (#2128)
Browse files Browse the repository at this point in the history
* feature(h5compat): add create_dataset, require_dataset, require_group, and require_groups methods to group class

* make mypy happy

* doc fixes

* write initial tests

* more tests

* add deprecation warnings

* add deprecation warnings

* switch up test
  • Loading branch information
jhamman authored Sep 4, 2024
1 parent 60b4f57 commit 3b793c1
Show file tree
Hide file tree
Showing 3 changed files with 337 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/zarr/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# Literal name ".zattrs" (presumably the Zarr v2 attributes document — confirm against callers).
ZATTRS_JSON = ".zattrs"

# Any of the built-in in-memory byte buffer types.
BytesLike = bytes | bytearray | memoryview
# A shape given either as a tuple of ints or as a single int.
ShapeLike = tuple[int, ...] | int
# A tuple of ints of arbitrary length.
ChunkCoords = tuple[int, ...]
# Any iterable of ints.
ChunkCoordsLike = Iterable[int]
# The only accepted format identifiers are the literals 2 and 3.
ZarrFormat = Literal[2, 3]
Expand Down
251 changes: 249 additions & 2 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dataclasses import asdict, dataclass, field, replace
from typing import TYPE_CHECKING, Literal, cast, overload

import numpy as np
import numpy.typing as npt
from typing_extensions import deprecated

Expand All @@ -25,6 +26,7 @@
ZGROUP_JSON,
ChunkCoords,
ZarrFormat,
parse_shapelike,
)
from zarr.core.config import config
from zarr.core.sync import SyncMixin, sync
Expand Down Expand Up @@ -250,7 +252,7 @@ async def getitem(
if zarray is not None:
# TODO: update this once the V2 array support is part of the primary array class
zarr_json = {**zarray, "attributes": zattrs}
return AsyncArray.from_dict(store_path, zarray)
return AsyncArray.from_dict(store_path, zarr_json)
else:
zgroup = (
json.loads(zgroup_bytes.to_bytes())
Expand Down Expand Up @@ -324,6 +326,42 @@ async def create_group(
zarr_format=self.metadata.zarr_format,
)

async def require_group(self, name: str, overwrite: bool = False) -> AsyncGroup:
    """Return the sub-group called ``name``, creating it when it is missing.

    Parameters
    ----------
    name : string
        Group name.
    overwrite : bool, optional
        When true, skip the lookup and call ``create_group`` with
        ``exists_ok=True``.

    Returns
    -------
    g : AsyncGroup

    Raises
    ------
    TypeError
        If ``overwrite`` is false and ``name`` refers to something that is
        not an ``AsyncGroup``.
    """
    if overwrite:
        # TODO: check that exists_ok=True errors if an array exists where the group is being created
        return await self.create_group(name, exists_ok=True)

    try:
        existing = await self.getitem(name)
    except KeyError:
        # Nothing stored under this name yet, so make a fresh group.
        return await self.create_group(name)

    if isinstance(existing, AsyncGroup):
        return existing
    raise TypeError(f"Incompatible object ({existing.__class__.__name__}) already exists")

async def require_groups(self, *names: str) -> tuple[AsyncGroup, ...]:
    """Require several sub-groups at once.

    Each name is passed to ``require_group``; the calls are awaited together
    with ``asyncio.gather`` and the results are returned as a tuple in the
    order the names were given. With no names, an empty tuple is returned.
    """
    if len(names) == 0:
        return ()
    pending = [self.require_group(group_name) for group_name in names]
    groups = await asyncio.gather(*pending)
    return tuple(groups)

async def create_array(
self,
name: str,
Expand Down Expand Up @@ -413,6 +451,117 @@ async def create_array(
data=data,
)

@deprecated("Use AsyncGroup.create_array instead.")
async def create_dataset(self, name: str, **kwargs: Any) -> AsyncArray:
    """Create an array (h5py-style alias of ``create_array``).

    Arrays are known as "datasets" in HDF5 terminology. For compatibility
    with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.require_dataset` method.

    Parameters
    ----------
    name : string
        Array name.
    kwargs : dict
        Additional arguments passed to :func:`zarr.AsyncGroup.create_array`.

    Returns
    -------
    a : AsyncArray

    .. deprecated:: 3.0.0
        The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead.
    """
    new_array = await self.create_array(name, **kwargs)
    return new_array

@deprecated("Use AsyncGroup.require_array instead.")
async def require_dataset(
    self,
    name: str,
    *,
    shape: ChunkCoords,
    dtype: npt.DTypeLike = None,
    exact: bool = False,
    **kwargs: Any,
) -> AsyncArray:
    """Obtain an array, creating if it doesn't exist.

    Arrays are known as "datasets" in HDF5 terminology. For compatibility
    with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.create_dataset` method.

    This is a thin alias: every argument is forwarded unchanged to
    :func:`zarr.AsyncGroup.require_array`.

    Parameters
    ----------
    name : string
        Array name.
    shape : int or tuple of ints
        Array shape.
    dtype : string or dtype, optional
        NumPy dtype.
    exact : bool, optional
        If True, require `dtype` to match exactly. If false, require
        `dtype` can be cast from array dtype.
    kwargs : dict
        Additional arguments passed to :func:`zarr.AsyncGroup.require_array`.

    Returns
    -------
    a : AsyncArray

    .. deprecated:: 3.0.0
        The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.require_array` instead.
    """
    return await self.require_array(name, shape=shape, dtype=dtype, exact=exact, **kwargs)

async def require_array(
    self,
    name: str,
    *,
    shape: ChunkCoords,
    dtype: npt.DTypeLike = None,
    exact: bool = False,
    **kwargs: Any,
) -> AsyncArray:
    """Return the array called ``name``, creating it when it is missing.

    When nothing is stored under ``name``, the array is created through
    ``create_array`` with ``shape``, ``dtype`` and the extra ``kwargs``.
    When an array already exists it is validated against the requested
    ``shape`` and ``dtype`` and returned; ``kwargs`` are not used in that case.

    Parameters
    ----------
    name : string
        Array name.
    shape : int or tuple of ints
        Array shape.
    dtype : string or dtype, optional
        NumPy dtype.
    exact : bool, optional
        If True, require `dtype` to match exactly. If false, require
        `dtype` can be cast from array dtype.

    Returns
    -------
    a : AsyncArray

    Raises
    ------
    TypeError
        If ``name`` refers to a non-array object, or the existing array's
        shape or dtype is incompatible with the request.
    """
    try:
        existing = await self.getitem(name)
    except KeyError:
        # Not present yet: create it with the caller's arguments as given.
        return await self.create_array(name, shape=shape, dtype=dtype, **kwargs)

    if not isinstance(existing, AsyncArray):
        raise TypeError(f"Incompatible object ({existing.__class__.__name__}) already exists")

    wanted_shape = parse_shapelike(shape)
    if wanted_shape != existing.shape:
        raise TypeError(f"Incompatible shape ({existing.shape} vs {wanted_shape})")

    wanted_dtype = np.dtype(dtype)
    if exact:
        mismatch = existing.dtype != wanted_dtype
    else:
        # Non-exact mode only needs the stored dtype to be castable to the request.
        mismatch = not np.can_cast(existing.dtype, wanted_dtype)
    if mismatch:
        raise TypeError(f"Incompatible dtype ({existing.dtype} vs {wanted_dtype})")

    return existing

async def update_attributes(self, new_attributes: dict[str, Any]) -> AsyncGroup:
# metadata.attributes is "frozen" so we simply clear and update the dict
self.metadata.attributes.clear()
Expand Down Expand Up @@ -612,8 +761,9 @@ def create(
def open(
    cls,
    store: StoreLike,
    zarr_format: Literal[2, 3, None] = 3,
) -> Group:
    """Open a group from ``store`` and wrap it in the synchronous class.

    Parameters
    ----------
    store : StoreLike
        Store to open the group from.
    zarr_format : {2, 3, None}, optional
        Format identifier forwarded to ``AsyncGroup.open``. Defaults to 3.

    Returns
    -------
    g : Group
        ``cls`` wrapped around the opened ``AsyncGroup``.
    """
    # Open exactly once and forward zarr_format; a second open without the
    # format would do redundant store work and have its result discarded.
    obj = sync(AsyncGroup.open(store, zarr_format=zarr_format))
    return cls(obj)

def __getitem__(self, path: str) -> Array | Group:
Expand Down Expand Up @@ -717,6 +867,26 @@ def tree(self, expand: bool = False, level: int | None = None) -> Any:
def create_group(self, name: str, **kwargs: Any) -> Group:
    """Create a sub-group called ``name`` and return it as a ``Group``.

    ``kwargs`` are forwarded unchanged to the async group's ``create_group``.
    """
    pending = self._async_group.create_group(name, **kwargs)
    async_group = self._sync(pending)
    return Group(async_group)

def require_group(self, name: str, **kwargs: Any) -> Group:
    """Return the sub-group called ``name``, creating it when it is missing.

    Synchronous wrapper: runs the async group's ``require_group`` and wraps
    the result in a ``Group``.

    Parameters
    ----------
    name : string
        Group name.
    overwrite : bool, optional
        Overwrite any existing group with given `name` if present.

    Returns
    -------
    g : Group
    """
    pending = self._async_group.require_group(name, **kwargs)
    async_group = self._sync(pending)
    return Group(async_group)

def require_groups(self, *names: str) -> tuple[Group, ...]:
    """Require several sub-groups at once.

    Synchronous wrapper: runs the async group's ``require_groups`` and wraps
    every returned async group in a ``Group``, preserving order.
    """
    async_groups = self._sync(self._async_group.require_groups(*names))
    return tuple(Group(async_group) for async_group in async_groups)

def create_array(
self,
name: str,
Expand Down Expand Up @@ -811,6 +981,83 @@ def create_array(
)
)

@deprecated("Use Group.create_array instead.")
def create_dataset(self, name: str, **kwargs: Any) -> Array:
    """Create an array.

    Arrays are known as "datasets" in HDF5 terminology. For compatibility
    with h5py, Zarr groups also implement the :func:`zarr.Group.require_dataset` method.

    Parameters
    ----------
    name : string
        Array name.
    kwargs : dict
        Additional arguments passed to :func:`zarr.Group.create_array`

    Returns
    -------
    a : Array

    .. deprecated:: 3.0.0
        The h5py compatibility methods will be removed in 3.1.0. Use `Group.create_array` instead.
    """
    # Delegate to the non-deprecated async create_array (which the async
    # create_dataset forwards to anyway) so one call emits one deprecation
    # warning that names the Group API. Group.require_dataset delegates to
    # require_array in the same way.
    return Array(self._sync(self._async_group.create_array(name, **kwargs)))

@deprecated("Use Group.require_array instead.")
def require_dataset(self, name: str, **kwargs: Any) -> Array:
    """Return the array called ``name``, creating it when it is missing.

    Arrays are known as "datasets" in HDF5 terminology. For compatibility
    with h5py, Zarr groups also implement the :func:`zarr.Group.create_dataset` method.

    All keyword arguments are forwarded to the async group's ``require_array``.

    Parameters
    ----------
    name : string
        Array name.
    shape : int or tuple of ints
        Array shape.
    dtype : string or dtype, optional
        NumPy dtype.
    exact : bool, optional
        If True, require `dtype` to match exactly. If false, require
        `dtype` can be cast from array dtype.

    Returns
    -------
    a : Array

    .. deprecated:: 3.0.0
        The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead.
    """
    pending = self._async_group.require_array(name, **kwargs)
    async_array = self._sync(pending)
    return Array(async_array)

def require_array(self, name: str, **kwargs: Any) -> Array:
    """Return the array called ``name``, creating it when it is missing.

    Synchronous wrapper: runs the async group's ``require_array`` with the
    given keyword arguments and wraps the result in an ``Array``.

    Parameters
    ----------
    name : string
        Array name.
    shape : int or tuple of ints
        Array shape.
    dtype : string or dtype, optional
        NumPy dtype.
    exact : bool, optional
        If True, require `dtype` to match exactly. If false, require
        `dtype` can be cast from array dtype.

    Returns
    -------
    a : Array
    """
    pending = self._async_group.require_array(name, **kwargs)
    async_array = self._sync(pending)
    return Array(async_array)

def empty(self, **kwargs: Any) -> Array:
    """Run the async group's ``empty`` with ``kwargs`` and wrap the result in an ``Array``."""
    pending = self._async_group.empty(**kwargs)
    async_array = self._sync(pending)
    return Array(async_array)

Expand Down
Loading

0 comments on commit 3b793c1

Please sign in to comment.