From ec014f6e7e184f4ce0ebfb712c50b0b592bf9057 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 6 Jan 2025 04:40:36 +0000 Subject: [PATCH 1/3] Improve API reference doc structure (#2635) Co-authored-by: Joe Hamman --- .gitignore | 2 +- docs/Makefile | 2 +- docs/api/index.rst | 7 ------- docs/conf.py | 3 ++- docs/index.rst | 6 +++--- 5 files changed, 7 insertions(+), 13 deletions(-) delete mode 100644 docs/api/index.rst diff --git a/.gitignore b/.gitignore index 153ca39df0..5663f62d04 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,7 @@ coverage.xml # Sphinx documentation docs/_build/ -docs/_autoapi +docs/api docs/data data data.zip diff --git a/docs/Makefile b/docs/Makefile index fc8fa12915..f42ee840e9 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -52,7 +52,7 @@ help: .PHONY: clean clean: rm -rf $(BUILDDIR)/* - rm -rf $(BUILDDIR)/../_autoapi + rm -rf $(BUILDDIR)/../api .PHONY: html html: diff --git a/docs/api/index.rst b/docs/api/index.rst deleted file mode 100644 index 26d7ce0224..0000000000 --- a/docs/api/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -API reference -============= - -.. toctree:: - :maxdepth: 1 - - ../_autoapi/zarr/index diff --git a/docs/conf.py b/docs/conf.py index 3389c16549..2a93e61d3e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,7 +57,7 @@ autoapi_add_toctree_entry = False autoapi_generate_api_docs = True autoapi_member_order = "groupwise" -autoapi_root = "_autoapi" +autoapi_root = "api" autoapi_keep_files = True autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] @@ -108,6 +108,7 @@ def skip_submodules( "release": "developers/release.html", "roadmap": "developers/roadmap.html", "installation": "user-guide/installation.html", + "api": "api/zarr/index" } # The language for content autogenerated by Sphinx. 
Refer to documentation diff --git a/docs/index.rst b/docs/index.rst index 29baf4b94a..4cafc12711 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Zarr-Python quickstart user-guide/index - api/index + API reference developers/index developers/release about @@ -81,12 +81,12 @@ Zarr-Python is a Python library for reading and writing Zarr groups and arrays. +++ - .. button-ref:: api/index + .. button-ref:: api/zarr/index :expand: :color: dark :click-parent: - To the API reference guide + To the API reference .. grid-item-card:: :img-top: _static/index_contribute.svg From 5c6267e69fdd69fd6fb0c5fc74f1de82b9b4b07d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 Jan 2025 06:53:32 +0100 Subject: [PATCH 2/3] Consistent use of 'Zarr format 2 or 3' (#2645) --- docs/index.rst | 2 +- docs/user-guide/extending.rst | 6 +- docs/user-guide/v3_migration.rst | 2 +- pyproject.toml | 2 +- src/zarr/api/asynchronous.py | 22 +++--- src/zarr/api/synchronous.py | 38 +++++------ src/zarr/codecs/vlen_utf8.py | 4 +- src/zarr/core/array.py | 112 +++++++++++++++---------------- src/zarr/core/common.py | 2 +- src/zarr/core/group.py | 106 ++++++++++++++--------------- src/zarr/core/metadata/v2.py | 6 +- src/zarr/core/metadata/v3.py | 6 +- tests/test_array.py | 2 +- tests/test_metadata/test_v3.py | 4 +- 14 files changed, 157 insertions(+), 157 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 4cafc12711..0dcfd7f90f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ Zarr-Python Zarr-Python is a Python library for reading and writing Zarr groups and arrays. Highlights include: -* Specification support for both Zarr v2 and v3. +* Specification support for both Zarr format 2 and 3. * Create and read from N-dimensional arrays using NumPy-like semantics. * Flexible storage enables reading and writing from local, cloud and in-memory stores. * High performance: Enables fast I/O with support for asynchronous I/O and multi-threading. 
diff --git a/docs/user-guide/extending.rst b/docs/user-guide/extending.rst index 405dcb92c0..7647703fbb 100644 --- a/docs/user-guide/extending.rst +++ b/docs/user-guide/extending.rst @@ -10,8 +10,8 @@ Custom codecs ------------- .. note:: - This section explains how custom codecs can be created for Zarr version 3 data. For Zarr - version 2, codecs should subclass the + This section explains how custom codecs can be created for Zarr format 3 arrays. For Zarr + format 2, codecs should subclass the `numcodecs.abc.Codec `_ base class and register through `numcodecs.registry.register_codec `_. @@ -66,7 +66,7 @@ strongly recommended to prefix the codec identifier with a unique name. For exam the codecs from ``numcodecs`` are prefixed with ``numcodecs.``, e.g. ``numcodecs.delta``. .. note:: - Note that the extension mechanism for the Zarr version 3 is still under development. + Note that the extension mechanism for the Zarr format 3 is still under development. Requirements for custom codecs including the choice of codec identifiers might change in the future. diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index 974266aac7..d90b87a897 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -4,7 +4,7 @@ Zarr-Python 3 represents a major refactor of the Zarr-Python codebase. 
Some of the goals motivating this refactor included: -* adding support for the Zarr V3 specification (along with the Zarr V2 specification) +* adding support for the Zarr format 3 specification (along with the Zarr format 2 specification) * cleaning up internal and user facing APIs * improving performance (particularly in high latency storage environments like cloud object stores) diff --git a/pyproject.toml b/pyproject.toml index 0fa0e7b6b4..8bc861d837 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -380,7 +380,7 @@ filterwarnings = [ "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", "ignore:Creating a zarr.buffer.gpu.*:UserWarning", "ignore:Duplicate name:UserWarning", # from ZipFile - "ignore:.*is currently not part in the Zarr version 3 specification.*:UserWarning", + "ignore:.*is currently not part in the Zarr format 3 specification.*:UserWarning", ] markers = [ "gpu: mark a test as requiring CuPy and GPU" diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f42b6d3f51..060618dbd1 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -198,7 +198,7 @@ async def consolidate_metadata( if any(m.zarr_format == 3 for m in members_metadata.values()): warnings.warn( - "Consolidated metadata is currently not part in the Zarr version 3 specification. It " + "Consolidated metadata is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=1, @@ -770,8 +770,8 @@ async def open_group( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). + store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). 
To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. @@ -779,7 +779,7 @@ async def open_group( To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allowed configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. @@ -870,21 +870,21 @@ async def create( Array shape. chunks : int or tuple of ints, optional The shape of the array's chunks. - V2 only. V3 arrays should use `chunk_shape` instead. + Zarr format 2 only. Zarr format 3 arrays should use `chunk_shape` instead. If not specified, default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. chunk_shape : int or tuple of ints, optional The shape of the Array's chunks (default is None). - V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. 
If no codecs are provided, default codecs will be used: @@ -895,7 +895,7 @@ async def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: @@ -925,7 +925,7 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If no ``filters`` are provided, a default set of filters will be used. + Zarr format 2 only. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the @@ -942,7 +942,7 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. Default is ".". write_empty_chunks : bool, optional Deprecated in favor of the ``config`` keyword argument. diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 200db9ec26..7b3d842832 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -502,8 +502,8 @@ def open_group( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). 
+ store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. @@ -511,7 +511,7 @@ def open_group( To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allows configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. @@ -785,16 +785,16 @@ def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -804,32 +804,32 @@ def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). 
- For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. 
+ For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. zarr_format : {2, 3}, optional @@ -838,11 +838,11 @@ def create_array( Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index e5b895ae0c..0ef423793d 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -28,7 +28,7 @@ class VLenUTF8Codec(ArrayBytesCodec): def __init__(self) -> None: warn( - "The codec `vlen-utf8` is currently not part in the Zarr version 3 specification. It " + "The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. 
It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, @@ -83,7 +83,7 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): def __init__(self) -> None: warn( - "The codec `vlen-bytes` is currently not part in the Zarr version 3 specification. It " + "The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e5c4e4538c..87ec4e48bc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -179,7 +179,7 @@ async def get_array_metadata( ) if zarr_json_bytes is not None and zarray_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + msg = f"Both zarr.json (Zarr format 3) and .zarray (Zarr format 2) metadata objects exist at {store_path}. Zarr v3 will be used." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) @@ -451,16 +451,16 @@ async def create( The attributes of the array (default is None). chunk_shape : ChunkCoords, optional The shape of the array's chunks - V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. 
The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: @@ -471,14 +471,14 @@ async def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - V3 only. V2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ShapeLike, optional The shape of the array's chunks. - V2 only. V3 arrays should use ``chunk_shape`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The memory of the array (default is "C"). If ``zarr_format`` is 2, this parameter sets the memory order of the array. @@ -487,12 +487,12 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. 
compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``compressor`` is provided, a default compressor will be used: @@ -592,15 +592,15 @@ async def _create( if zarr_format == 3: if dimension_separator is not None: raise ValueError( - "dimension_separator cannot be used for arrays with version 3. Use chunk_key_encoding instead." + "dimension_separator cannot be used for arrays with zarr_format 3. Use chunk_key_encoding instead." ) if filters is not None: raise ValueError( - "filters cannot be used for arrays with version 3. Use array-to-array codecs instead." + "filters cannot be used for arrays with zarr_format 3. Use array-to-array codecs instead." ) if compressor is not None: raise ValueError( - "compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead." + "compressor cannot be used for arrays with zarr_format 3. Use bytes-to-bytes codecs instead." ) if order is not None: @@ -622,14 +622,14 @@ async def _create( elif zarr_format == 2: if codecs is not None: raise ValueError( - "codecs cannot be used for arrays with version 2. Use filters and compressor instead." + "codecs cannot be used for arrays with zarr_format 2. Use filters and compressor instead." ) if chunk_key_encoding is not None: raise ValueError( - "chunk_key_encoding cannot be used for arrays with version 2. Use dimension_separator instead." + "chunk_key_encoding cannot be used for arrays with zarr_format 2. Use dimension_separator instead." 
) if dimension_names is not None: - raise ValueError("dimension_names cannot be used for arrays with version 2.") + raise ValueError("dimension_names cannot be used for arrays with zarr_format 2.") if order is None: order_parsed = parse_order(zarr_config.get("array.order")) @@ -704,7 +704,7 @@ async def _create_v3( if dtype.kind in "UTS": warn( - f"The dtype `{dtype}` is currently not part in the Zarr version 3 specification. It " + f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, @@ -785,7 +785,7 @@ def from_dict( data: dict[str, JSON], ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: """ - Create a Zarr array from a dictionary, with support for both Zarr v2 and v3 metadata. + Create a Zarr array from a dictionary, with support for both Zarr format 2 and 3 metadata. Parameters ---------- @@ -795,17 +795,17 @@ def from_dict( data : dict A dictionary representing the array data. This dictionary should include necessary metadata for the array, such as shape, dtype, and other attributes. The format of the metadata - will determine whether a Zarr v2 or v3 array is created. + will determine whether a Zarr format 2 or 3 array is created. Returns ------- AsyncArray[ArrayV3Metadata] or AsyncArray[ArrayV2Metadata] - The created Zarr array, either using v2 or v3 metadata based on the provided data. + The created Zarr array, either using Zarr format 2 or 3 metadata based on the provided data. Raises ------ ValueError - If the dictionary data is invalid or incompatible with either Zarr v2 or v3 array creation. + If the dictionary data is invalid or incompatible with either Zarr format 2 or 3 array creation. """ metadata = parse_array_metadata(data) return cls(metadata=metadata, store_path=store_path) @@ -1644,16 +1644,16 @@ def create( The data type of the array. 
chunk_shape : ChunkCoords, optional The shape of the Array's chunks. - V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: @@ -1664,14 +1664,14 @@ def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - V3 only. V2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ChunkCoords, optional The shape of the array's chunks. - V2 only. V3 arrays should use ``chunk_shape`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The memory of the array (default is "C"). If ``zarr_format`` is 2, this parameter sets the memory order of the array. 
@@ -1680,12 +1680,12 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``compressor`` is provided, a default compressor will be used: @@ -2239,7 +2239,7 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use fields Currently the implementation for __getitem__ is provided by @@ -2338,7 +2338,7 @@ def __setitem__(self, selection: Selection, value: npt.ArrayLike) -> None: ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use fields Currently the implementation for __setitem__ is provided by @@ -2470,7 +2470,7 @@ def get_basic_selection( ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use the `fields` parameter. 
This method provides the implementation for accessing data via the @@ -2573,7 +2573,7 @@ def set_basic_selection( Notes ----- - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use the `fields` parameter. This method provides the underlying implementation for modifying data via square @@ -3693,16 +3693,16 @@ async def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -3712,32 +3712,32 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. 
If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. 
This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. zarr_format : {2, 3}, optional @@ -3746,11 +3746,11 @@ async def create_array( Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -3799,20 +3799,20 @@ async def create_array( if zarr_format == 2: if shard_shape_parsed is not None: msg = ( - "Zarr v2 arrays can only be created with `shard_shape` set to `None`. " + "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. " f"Got `shard_shape={shards}` instead." 
) raise ValueError(msg) if serializer != "auto": - raise ValueError("Zarr v2 arrays do not support `serializer`.") + raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=np.dtype(dtype) ) if dimension_names is not None: - raise ValueError("Zarr v2 arrays do not support dimension names.") + raise ValueError("Zarr format 2 arrays do not support dimension names.") if order is None: order_parsed = zarr_config.get("array.order") else: @@ -3895,7 +3895,7 @@ def _parse_chunk_key_encoding( result = ChunkKeyEncoding.from_dict(data) if zarr_format == 2 and result.name != "v2": msg = ( - "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the " + "Invalid chunk key encoding. For Zarr format 2 arrays, the `name` field of the " f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead." ) raise ValueError(msg) @@ -3948,7 +3948,7 @@ def _get_default_chunk_encoding_v2( np_dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ - Get the default chunk encoding for zarr v2 arrays, given a dtype + Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ compressor_dict = _default_compressor(np_dtype) @@ -3972,7 +3972,7 @@ def _parse_chunk_encoding_v2( dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ - Generate chunk encoding classes for v2 arrays with optional defaults. + Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) @@ -3987,7 +3987,7 @@ def _parse_chunk_encoding_v2( _compressor = parse_compressor(compressor[0]) else: if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. 
Got an iterable with type {type(compressor)} instead." + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." raise TypeError(msg) _compressor = parse_compressor(compressor) @@ -4000,7 +4000,7 @@ def _parse_chunk_encoding_v2( for idx, f in enumerate(filters): if not isinstance(f, numcodecs.abc.Codec): msg = ( - "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. " + "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." ) raise TypeError(msg) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index d53f3847a5..7205b8c206 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -192,7 +192,7 @@ def _warn_write_empty_chunks_kwarg() -> None: def _warn_order_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( - "The `order` keyword argument has no effect for zarr v3 arrays. " + "The `order` keyword argument has no effect for Zarr format 3 arrays. " "To control the memory layout of the array, either use the `config` keyword " "argument, as in `config={'order: 'C'}`," "or change the global 'array.order' configuration variable." diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index a4503ce64e..dac2270a53 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -459,8 +459,8 @@ async def open( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). + store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. 
@@ -468,7 +468,7 @@ async def open( To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allowed configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. """ @@ -514,7 +514,7 @@ async def open( ) if zarr_json_bytes is not None and zgroup_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (Zarr v3) and .zgroup (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + msg = f"Both zarr.json (Zarr format 3) and .zgroup (Zarr format 2) metadata objects exist at {store_path}. Zarr format 3 will be used." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( @@ -548,7 +548,7 @@ async def open( # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None if not isinstance(use_consolidated, bool | None): - raise TypeError("use_consolidated must be a bool or None for Zarr V3.") + raise TypeError("use_consolidated must be a bool or None for Zarr format 3.") return cls._from_bytes_v3( store_path, @@ -1048,16 +1048,16 @@ async def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -1067,16 +1067,16 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors my be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. @@ -1085,27 +1085,27 @@ async def create_array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. 
Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. 
@@ -2304,16 +2304,16 @@ def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -2323,16 +2323,16 @@ def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors my be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. 
+ For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. @@ -2341,27 +2341,27 @@ def create_array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. 
+ For Zarr format 3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -2693,16 +2693,16 @@ def array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -2712,16 +2712,16 @@ def array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. 
+ For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors my be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. @@ -2730,27 +2730,27 @@ def array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. 
The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bc7fd32cbf..b95433068a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -34,7 +34,7 @@ class ArrayV2MetadataDict(TypedDict): """ - A typed dictionary model for zarr v2 metadata. + A typed dictionary model for Zarr format 2 metadata. """ zarr_format: Literal[2] @@ -68,7 +68,7 @@ def __init__( attributes: dict[str, JSON] | None = None, ) -> None: """ - Metadata for a Zarr version 2 array. + Metadata for a Zarr format 2 array. """ shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) @@ -327,7 +327,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: stored in the Array metadata into an in-memory value. This only gives the default fill value for some type. 
- This is useful for reading Zarr V2 arrays, which allow the fill + This is useful for reading Zarr format 2 arrays, which allow the fill value to be unspecified. """ if dtype.kind == "S": diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 0821dd9bc9..1265c832b2 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -468,7 +468,7 @@ def parse_fill_value( fill_value : Any A potential fill value. dtype : str - A valid Zarr V3 DataType. + A valid Zarr format 3 DataType. Returns ------- @@ -676,10 +676,10 @@ def parse(cls, dtype: DataType | Any | None) -> DataType: try: dtype = np.dtype(dtype) except (ValueError, TypeError) as e: - raise ValueError(f"Invalid V3 data_type: {dtype}") from e + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e # check that this is a valid v3 data_type try: data_type = DataType.from_numpy(dtype) except KeyError as e: - raise ValueError(f"Invalid V3 data_type: {dtype}") from e + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e return data_type diff --git a/tests/test_array.py b/tests/test_array.py index 628b873e72..86885514a3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1219,7 +1219,7 @@ async def test_create_array_v2_no_shards(store: MemoryStore) -> None: Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. """ msg = re.escape( - "Zarr v2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." + "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." 
) with pytest.raises(ValueError, match=msg): _ = await create_array( diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 6f7fba6dd1..ef527f42ef 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -336,13 +336,13 @@ def test_invalid_dtype_raises() -> None: "codecs": (), "fill_value": np.datetime64(0, "ns"), } - with pytest.raises(ValueError, match=r"Invalid V3 data_type: .*"): + with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"): ArrayV3Metadata.from_dict(metadata_dict) @pytest.mark.parametrize("data", ["datetime64[s]", "foo", object()]) def test_parse_invalid_dtype_raises(data): - with pytest.raises(ValueError, match=r"Invalid V3 data_type: .*"): + with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"): DataType.parse(data) From 91385283bc9e0ddbd192d5d08df196d9bd90b8e7 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 Jan 2025 16:22:18 +0100 Subject: [PATCH 3/3] Adds filters, compressors and serializer props to Array (#2652) * adds filters, serializer, compressors properties to Array * adapt Array.info * fixes doctests * ugly numcodecs class names * always show filters and compressors in Array.info * format --- docs/user-guide/arrays.rst | 41 +++++--- docs/user-guide/consolidated_metadata.rst | 12 +-- docs/user-guide/groups.rst | 8 +- docs/user-guide/performance.rst | 12 ++- src/zarr/api/synchronous.py | 2 +- src/zarr/core/_info.py | 28 ++--- src/zarr/core/array.py | 119 +++++++++++++++++++--- src/zarr/core/group.py | 6 +- src/zarr/core/metadata/v3.py | 28 ++++- tests/test_array.py | 93 +++++++++++------ tests/test_config.py | 8 +- tests/test_info.py | 12 ++- 12 files changed, 265 insertions(+), 104 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 110e12c3be..ba85ce1cda 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -168,8 +168,8 @@ argument accepted by all array creation 
functions. For example:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=), BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0)] + >>> z.compressors + (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) This array above will use Blosc as the primary compressor, using the Zstandard algorithm (compression level 3) internally within Blosc, and with the @@ -188,7 +188,9 @@ which can be used to print useful diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) The :func:`zarr.Array.info_complete` method inspects the underlying store and @@ -203,7 +205,9 @@ prints additional diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) No. 
bytes stored : 9696302 Storage ratio : 41.3 @@ -223,8 +227,8 @@ here is an array using Gzip compression, level 1:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=), GzipCodec(level=1)] + >>> z.compressors + (GzipCodec(level=1),) Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's built-in delta filter:: @@ -236,23 +240,24 @@ built-in delta filter:: >>> compressors = LZMA(filters=lzma_filters) >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) - >>> z.metadata.codecs - [BytesCodec(endian=), _make_bytes_bytes_codec.._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})] + >>> z.compressors + (_make_bytes_bytes_codec.._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) The default compressor can be changed by setting the value of the using Zarr's :ref:`user-guide-config`, e.g.:: >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): ... 
z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.metadata.filters - >>> z.metadata.compressor - Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + >>> z.filters + () + >>> z.compressors + (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) - >>> z.metadata.codecs - [BytesCodec(endian=)] + >>> z.compressors + () .. _user-guide-filters: @@ -287,7 +292,9 @@ Here is an example using a delta filter with the Blosc compressor:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': }, {'typesize': 4, 'cname': , 'clevel': 1, 'shuffle': , 'blocksize': 0}] + Filters : (_make_array_array_codec.._Codec(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),) + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=1, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) For more information about available filter codecs, see the `Numcodecs @@ -600,11 +607,13 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Order : C Read-only : False Store type : LocalStore - Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) No. bytes stored : 3981060 Storage ratio : 25.1 - Chunks Initialized : 100 + Shards Initialized : 100 In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. 
This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 511761d34e..3c015dcfca 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -52,8 +52,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -65,8 +65,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -78,8 +78,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 62160ffde5..da5f393246 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -109,7 +109,9 @@ property. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 8000000 (7.6M) No. bytes stored : 1432 Storage ratio : 5586.6 @@ -123,7 +125,9 @@ property. 
E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 4000000 (3.8M) Groups also have the :func:`zarr.Group.tree` method, e.g.:: diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index f56b642fb1..265bef8efe 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -98,7 +98,9 @@ To use sharding, you need to specify the ``shards`` parameter when creating the Order : C Read-only : False Store type : MemoryStore - Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000000 (93.1G) .. _user-guide-chunks-order: @@ -125,7 +127,9 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. bytes stored : 342588717 Storage ratio : 1.2 @@ -142,7 +146,9 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : F Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. 
bytes stored : 342588717 Storage ratio : 1.2 diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 7b3d842832..1a8e6df649 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -802,7 +802,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 807e940508..845552c8be 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,7 +5,7 @@ import numcodecs.abc import numpy as np -from zarr.abc.codec import Codec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat from zarr.core.metadata.v3 import DataType @@ -85,9 +85,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _compressor: numcodecs.abc.Codec | None = None - _filters: tuple[numcodecs.abc.Codec, ...] | None = None - _codecs: list[Codec] | None = None + _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _serializer: ArrayBytesCodec | None = None + _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None @@ -109,18 +109,19 @@ def __repr__(self) -> str: Read-only : {_read_only} Store type : {_store_type}""") - kwargs = dataclasses.asdict(self) + # We can't use dataclasses.asdict, because we only want a shallow dict + kwargs = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} + if self._chunk_shape is None: # for non-regular chunk grids kwargs["chunk_shape"] = "" - if self._compressor is not None: - template += "\nCompressor : {_compressor}" - if self._filters is not None: - template += "\nFilters : {_filters}" + template += "\nFilters : {_filters}" + + if self._serializer is not None: + template += "\nSerializer : {_serializer}" - if self._codecs is not None: - template += "\nCodecs : {_codecs}" + template += "\nCompressors : {_compressors}" if self._count_bytes is not None: template += "\nNo. bytes : {_count_bytes}" @@ -139,5 +140,8 @@ def __repr__(self) -> str: kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}" if self._count_chunks_initialized is not None: - template += "\nChunks Initialized : {_count_chunks_initialized}" + if self._shard_shape is not None: + template += "\nShards Initialized : {_count_chunks_initialized}" + else: + template += "\nChunks Initialized : {_count_chunks_initialized}" return template.format(**kwargs) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 87ec4e48bc..2fa342ce16 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -20,6 +20,7 @@ from warnings import warn import numcodecs +import numcodecs.abc import numpy as np import numpy.typing as npt from typing_extensions import deprecated @@ -911,6 +912,63 @@ def size(self) -> int: """ return np.prod(self.metadata.shape).item() + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] 
| tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + if self.metadata.zarr_format == 2: + filters = self.metadata.filters + if filters is None: + return () + return filters + + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayArrayCodec) + ) + + @property + def serializer(self) -> ArrayBytesCodec | None: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + if self.metadata.zarr_format == 2: + return None + + return next( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayBytesCodec) + ) + + @property + @deprecated("Use AsyncArray.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + if self.metadata.zarr_format == 2: + return self.metadata.compressor + raise TypeError("`compressor` is not available for Zarr format 3 arrays.") + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + if self.metadata.zarr_format == 2: + if self.metadata.compressor is not None: + return (self.metadata.compressor,) + return () + + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) + @property def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. 
@@ -1561,31 +1619,27 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - kwargs: dict[str, Any] = {} - if self.metadata.zarr_format == 2: - assert isinstance(self.metadata, ArrayV2Metadata) - if self.metadata.compressor is not None: - kwargs["_compressor"] = self.metadata.compressor - if self.metadata.filters is not None: - kwargs["_filters"] = self.metadata.filters - kwargs["_data_type"] = self.metadata.dtype - kwargs["_chunk_shape"] = self.metadata.chunks + _data_type: np.dtype[Any] | DataType + if isinstance(self.metadata, ArrayV2Metadata): + _data_type = self.metadata.dtype else: - kwargs["_codecs"] = self.metadata.codecs - kwargs["_data_type"] = self.metadata.data_type - kwargs["_chunk_shape"] = self.chunks - kwargs["_shard_shape"] = self.shards + _data_type = self.metadata.data_type return ArrayInfo( _zarr_format=self.metadata.zarr_format, + _data_type=_data_type, _shape=self.shape, _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, _store_type=type(self.store_path.store).__name__, _count_bytes=self.nbytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, - **kwargs, ) @@ -1967,6 +2021,41 @@ def read_only(self) -> bool: def fill_value(self) -> Any: return self.metadata.fill_value + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + return self._async_array.filters + + @property + def serializer(self) -> None | ArrayBytesCodec: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. 
+ """ + return self._async_array.serializer + + @property + @deprecated("Use Array.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + return self._async_array.compressor + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + return self._async_array.compressors + @property def cdata_shape(self) -> ChunkCoords: """ @@ -3710,7 +3799,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index dac2270a53..d100e30492 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1065,7 +1065,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. 
@@ -2321,7 +2321,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. @@ -2710,7 +2710,7 @@ def array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 1265c832b2..13a275a6a1 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -81,9 +81,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: return out -def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: - """Check that the codecs are valid for the given dtype""" - +def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: # ensure that we have at least one ArrayBytesCodec abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] if len(abcs) == 0: @@ -91,7 +89,18 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: elif len(abcs) > 1: raise ValueError("Only one ArrayBytesCodec is allowed.") - abc = abcs[0] + return abcs[0] + + +def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: + """Check that the codecs are valid for the given dtype""" + from zarr.codecs.sharding 
import ShardingCodec + + abc = validate_array_bytes_codec(codecs) + + # Recursively resolve array-bytes codecs within sharding codecs + while isinstance(abc, ShardingCodec): + abc = validate_array_bytes_codec(abc.codecs) # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name @@ -254,7 +263,7 @@ def __init__( config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. ) - codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] + codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) validate_codecs(codecs_parsed_partial, data_type_parsed) object.__setattr__(self, "shape", shape_parsed) @@ -330,6 +339,15 @@ def shards(self) -> ChunkCoords | None: ) raise NotImplementedError(msg) + @property + def inner_codecs(self) -> tuple[Codec, ...]: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.codecs[0].codecs + return self.codecs + def get_chunk_spec( self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: diff --git a/tests/test_array.py b/tests/test_array.py index 86885514a3..410b2e58d0 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -20,7 +20,6 @@ VLenUTF8Codec, ZstdCodec, ) -from zarr.codecs.sharding import ShardingCodec from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsLike, @@ -494,7 +493,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -510,9 +509,8 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) 
_order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] - if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -536,7 +534,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -572,7 +570,7 @@ async def test_info_v2_async( _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -596,9 +594,8 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] - if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -624,7 +621,7 @@ async def test_info_complete_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? 
@@ -839,7 +836,8 @@ def test_array_create_metadata_order_v2( arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") - assert arr.metadata.order == expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.order == expected @pytest.mark.parametrize("order_config", ["C", "F", None]) @@ -1048,10 +1046,15 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + # Test metadata explicitly + assert arr.metadata.zarr_format == 2 # guard for mypy # The v2 metadata stores None and () separately - assert arr.metadata.filters == empty_value # type: ignore[union-attr] + assert arr.metadata.filters == empty_value # The v2 metadata does not allow tuple for compressor, therefore it is turned into None - assert arr.metadata.compressor is None # type: ignore[union-attr] + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () # v3 arr = await create_array( @@ -1061,10 +1064,13 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + assert arr.metadata.zarr_format == 3 # guard for mypy if dtype == "str": - assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() else: - assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1116,8 +1122,14 @@ async def test_create_array_no_filters_compressors( ({"name": "transpose", "configuration": {"order": [0]}},), ], ) +@pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) async def test_create_array_v3_chunk_encoding( - store: MemoryStore, compressors: 
CompressorsLike, filters: FiltersLike, dtype: str + store: MemoryStore, + compressors: CompressorsLike, + filters: FiltersLike, + dtype: str, + chunks: tuple[int, ...], + shards: tuple[int, ...] | None, ) -> None: """ Test various possibilities for the compressors and filters parameter to create_array @@ -1125,17 +1137,18 @@ async def test_create_array_v3_chunk_encoding( arr = await create_array( store=store, dtype=dtype, - shape=(10,), + shape=(12,), + chunks=chunks, + shards=shards, zarr_format=3, filters=filters, compressors=compressors, ) - aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters / compressors from the array. - assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + assert arr.filters == filters_expected + assert arr.compressors == compressors_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1167,9 +1180,16 @@ async def test_create_array_v2_chunk_encoding( filters_expected, compressor_expected = _parse_chunk_encoding_v2( filters=filters, compressor=compressors, dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters/compressor from the array. 
- assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] - assert arr.metadata.filters == filters_expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.compressor == compressor_expected + assert arr.metadata.filters == filters_expected + + # Normalize for property getters + compressor_expected = () if compressor_expected is None else (compressor_expected,) + filters_expected = () if filters_expected is None else filters_expected + + assert arr.compressors == compressor_expected + assert arr.filters == filters_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1185,12 +1205,12 @@ async def test_create_array_v3_default_filters_compressors(store: MemoryStore, d shape=(10,), zarr_format=3, ) - expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) - # TODO: define the codec pipeline class such that these fields are required, which will obviate the - # type ignore statements - assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] - assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3( + np_dtype=np.dtype(dtype) + ) + assert arr.filters == expected_filters + assert arr.serializer == expected_serializer + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1209,8 +1229,15 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d expected_filters, expected_compressors = _get_default_chunk_encoding_v2( np_dtype=np.dtype(dtype) ) - assert arr.metadata.filters == expected_filters # type: ignore[union-attr] - assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] + 
assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.filters == expected_filters + assert arr.metadata.compressor == expected_compressors + + # Normalize for property getters + expected_filters = () if expected_filters is None else expected_filters + expected_compressors = () if expected_compressors is None else (expected_compressors,) + assert arr.filters == expected_filters + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_config.py b/tests/test_config.py index 20e3c6044f..ca65c62166 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -305,12 +305,12 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize( ("dtype", "expected_codecs"), [ - ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec(), GzipCodec()]), - ("str", [VLenUTF8Codec(), GzipCodec()]), + ("int", (BytesCodec(), GzipCodec())), + ("bytes", (VLenBytesCodec(), GzipCodec())), + ("str", (VLenUTF8Codec(), GzipCodec())), ], ) -async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: +async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None: with config.set( { "array.v3_default_codecs": { # test setting non-standard codecs diff --git a/tests/test_info.py b/tests/test_info.py index 5d9264aa13..db0fd0ef76 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: _order="C", _read_only=True, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _serializer=BytesCodec(), ) result = repr(info) assert result == textwrap.dedent(f"""\ @@ -71,7 +71,9 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Order : C Read-only : True Store type : MemoryStore - Codecs : [{{'endian': }}]""") + Filters : () + Serializer : BytesCodec(endian=) + Compressors : ()""") @pytest.mark.parametrize("zarr_format", ZARR_FORMATS) @@ -95,7 +97,7 @@ def test_array_info_complete( 
_order="C", _read_only=True, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _serializer=BytesCodec(), _count_bytes=count_bytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, @@ -110,7 +112,9 @@ def test_array_info_complete( Order : C Read-only : True Store type : MemoryStore - Codecs : [{{'endian': }}] + Filters : () + Serializer : BytesCodec(endian=) + Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored_formatted} Storage ratio : {storage_ratio_formatted}