Skip to content

Commit

Permalink
Merge branch 'main' into zipstore-from-path
Browse files Browse the repository at this point in the history
  • Loading branch information
aulemahal authored Feb 26, 2025
2 parents 327c5d2 + 64b9a37 commit 955234b
Show file tree
Hide file tree
Showing 21 changed files with 2,016 additions and 132 deletions.
1 change: 1 addition & 0 deletions changes/2665.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Adds functions for concurrently creating multiple arrays and groups.
1 change: 1 addition & 0 deletions changes/2851.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug when setting values in a last chunk that is smaller than the regular chunk shape.
22 changes: 22 additions & 0 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,28 @@ Zarr allows you to create hierarchical groups, similar to directories::

This creates a group with two datasets: ``foo`` and ``bar``.

Batch Hierarchy Creation
~~~~~~~~~~~~~~~~~~~~~~~~

Zarr provides tools for creating a collection of arrays and groups with a single function call.
Suppose we want to copy existing groups and arrays into a new storage backend:

>>> # Create nested groups and add arrays
>>> root = zarr.group("data/example-3.zarr", attributes={'name': 'root'})
>>> foo = root.create_group(name="foo")
>>> bar = root.create_array(
... name="bar", shape=(100, 10), chunks=(10, 10), dtype="f4"
... )
>>> nodes = {'': root.metadata} | {k: v.metadata for k,v in root.members()}
>>> print(nodes)
>>> from zarr.storage import MemoryStore
>>> new_nodes = dict(zarr.create_hierarchy(store=MemoryStore(), nodes=nodes))
>>> new_root = new_nodes['']
>>> assert new_root.attrs == root.attrs

Note that :func:`zarr.create_hierarchy` will only initialize arrays and groups -- copying array data must
be done in a separate step.

Persistent Storage
------------------

Expand Down
25 changes: 25 additions & 0 deletions docs/user-guide/groups.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,31 @@ For more information on groups see the :class:`zarr.Group` API docs.

.. _user-guide-diagnostics:

Batch Group Creation
--------------------

You can also create multiple groups concurrently with a single function call. :func:`zarr.create_hierarchy` takes
a :class:`zarr.abc.store.Store` instance and a dict of ``key : metadata`` pairs, parses that dict, and
writes metadata documents to storage:

>>> from zarr import create_hierarchy
>>> from zarr.core.group import GroupMetadata
>>> from zarr.storage import LocalStore
>>> node_spec = {'a/b/c': GroupMetadata()}
>>> nodes_created = dict(create_hierarchy(store=LocalStore(root='data'), nodes=node_spec))
>>> print(sorted(nodes_created.items(), key=lambda kv: len(kv[0])))
[('', <Group file://data>), ('a', <Group file://data/a>), ('a/b', <Group file://data/a/b>), ('a/b/c', <Group file://data/a/b/c>)]

Note that we only specified a single group named ``a/b/c``, but 4 groups were created. These additional groups
were created to ensure that the desired node ``a/b/c`` is connected to the root group ``''`` by a sequence
of intermediate groups. :func:`zarr.create_hierarchy` normalizes the ``nodes`` keyword argument to
ensure that the resulting hierarchy is complete, i.e. all groups or arrays are connected to the root
of the hierarchy via intermediate groups.

Because :func:`zarr.create_hierarchy` creates metadata documents concurrently, it is more efficient
than repeated calls to :func:`zarr.create_group` or :func:`zarr.create_array`, provided you can statically define
the metadata for the groups and arrays you want to create.

Array and group diagnostics
---------------------------

Expand Down
2 changes: 2 additions & 0 deletions src/zarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
create,
create_array,
create_group,
create_hierarchy,
empty,
empty_like,
full,
Expand Down Expand Up @@ -50,6 +51,7 @@
"create",
"create_array",
"create_group",
"create_hierarchy",
"empty",
"empty_like",
"full",
Expand Down
8 changes: 7 additions & 1 deletion src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
_warn_write_empty_chunks_kwarg,
parse_dtype,
)
from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
from zarr.core.group import (
AsyncGroup,
ConsolidatedMetadata,
GroupMetadata,
create_hierarchy,
)
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import _default_compressor, _default_filters
from zarr.errors import NodeTypeValidationError
Expand All @@ -48,6 +53,7 @@
"copy_store",
"create",
"create_array",
"create_hierarchy",
"empty",
"empty_like",
"full",
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from zarr.core.array import Array, AsyncArray
from zarr.core.group import Group
from zarr.core.sync import sync
from zarr.core.sync_group import create_hierarchy

if TYPE_CHECKING:
from collections.abc import Iterable
Expand Down Expand Up @@ -46,6 +47,7 @@
"copy_store",
"create",
"create_array",
"create_hierarchy",
"empty",
"empty_like",
"full",
Expand Down
25 changes: 14 additions & 11 deletions src/zarr/core/codec_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,17 +296,6 @@ def _merge_chunk_array(
is_complete_chunk: bool,
drop_axes: tuple[int, ...],
) -> NDBuffer:
if is_complete_chunk and value.shape == chunk_spec.shape:
return value
if existing_chunk_array is None:
chunk_array = chunk_spec.prototype.nd_buffer.create(
shape=chunk_spec.shape,
dtype=chunk_spec.dtype,
order=chunk_spec.order,
fill_value=fill_value_or_default(chunk_spec),
)
else:
chunk_array = existing_chunk_array.copy() # make a writable copy
if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype):
chunk_value = value
else:
Expand All @@ -320,6 +309,20 @@ def _merge_chunk_array(
for idx in range(chunk_spec.ndim)
)
chunk_value = chunk_value[item]
if is_complete_chunk and chunk_value.shape == chunk_spec.shape:
# TODO: For the last chunk, is_complete_chunk can be True while the
# value is smaller than chunk_spec.shape, but returning such a value
# throws an error in _decode_single
return chunk_value
if existing_chunk_array is None:
chunk_array = chunk_spec.prototype.nd_buffer.create(
shape=chunk_spec.shape,
dtype=chunk_spec.dtype,
order=chunk_spec.order,
fill_value=fill_value_or_default(chunk_spec),
)
else:
chunk_array = existing_chunk_array.copy() # make a writable copy
chunk_array[chunk_selection] = chunk_value
return chunk_array

Expand Down
Loading

0 comments on commit 955234b

Please sign in to comment.