diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c85648e0ff..7d82a95662 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -102,6 +102,11 @@ jobs: - name: Run Tests run: | hatch env run --env ${{ matrix.dependency-set }} run + - name: Upload coverage + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true # optional (default = false) doctests: name: doctests diff --git a/changes/2755.bugfix.rst b/changes/2755.bugfix.rst deleted file mode 100644 index 2555369544..0000000000 --- a/changes/2755.bugfix.rst +++ /dev/null @@ -1,3 +0,0 @@ -The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the -specified fill value, or with zeros if no fill value is provided. -This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes. \ No newline at end of file diff --git a/changes/2758.bugfix.rst b/changes/2758.bugfix.rst deleted file mode 100644 index 6b80f8a626..0000000000 --- a/changes/2758.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix zip-store path checking for stores with directories listed as files. \ No newline at end of file diff --git a/changes/2778.bugfix.rst b/changes/2778.bugfix.rst deleted file mode 100644 index 2968c4441c..0000000000 --- a/changes/2778.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Use removeprefix rather than replace when removing filename prefixes in `FsspecStore.list` \ No newline at end of file diff --git a/changes/2781.bugfix.rst b/changes/2781.bugfix.rst deleted file mode 100644 index 3673eeece7..0000000000 --- a/changes/2781.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Enable automatic removal of `needs release notes` with labeler action \ No newline at end of file diff --git a/changes/2785.bugfix.rst b/changes/2785.bugfix.rst deleted file mode 100644 index 3f2b3111ea..0000000000 --- a/changes/2785.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Use the proper label config \ No newline at end of file diff --git a/changes/2799.bugfix.rst b/changes/2799.bugfix.rst deleted file mode 100644 index f22b7074bb..0000000000 --- a/changes/2799.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types \ No newline at end of file diff --git a/changes/2801.bugfix.rst b/changes/2801.bugfix.rst deleted file mode 100644 index 294934aacf..0000000000 --- a/changes/2801.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Ensure utf8 compliant strings are used to construct numpy arrays in property-based tests diff --git a/changes/2804.feature.rst b/changes/2804.feature.rst deleted file mode 100644 index 5a707752a0..0000000000 --- a/changes/2804.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:py:class:`LocalStore` learned to ``delete_dir``. This makes array and group deletes more efficient. diff --git a/changes/2807.bugfix.rst b/changes/2807.bugfix.rst deleted file mode 100644 index ae0eb2f6ac..0000000000 --- a/changes/2807.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix pickling for ZipStore diff --git a/changes/2811.bugfix.rst b/changes/2811.bugfix.rst deleted file mode 100644 index ef4e8eb7ed..0000000000 --- a/changes/2811.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Update numcodecs to not overwrite codec configuration ever. Closes :issue:`2800`. 
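The news fragments deleted above are consolidated into the 3.0.3 section of ``docs/release-notes.rst`` below. For illustration, the ``2804.feature.rst`` entry refers to the new ``LocalStore.delete_dir``; a minimal sketch of what it enables (not code from this PR; the store root and key prefix are invented)::

    import asyncio

    from zarr.storage import LocalStore


    async def main() -> None:
        # delete_dir removes every key under a prefix in one call, rather
        # than listing keys and issuing one delete per key; this is what
        # makes array and group deletes more efficient.
        store = LocalStore("example.zarr")  # hypothetical store root
        await store.delete_dir("group/array")  # prune the whole subtree


    asyncio.run(main())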
diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst
index 220e24eced..de10fab2c6 100644
--- a/docs/developers/contributing.rst
+++ b/docs/developers/contributing.rst
@@ -230,6 +230,27 @@ during development at `http://0.0.0.0:8000/ `_. This can b
     $ hatch --env docs run serve
 
+.. _changelog:
+
+Changelog
+~~~~~~~~~
+
+zarr-python uses `towncrier`_ to manage release notes. Most pull requests should
+include at least one news fragment describing the changes. To add a release
+note, you'll need the GitHub issue or pull request number and the type of your
+change (``feature``, ``bugfix``, ``doc``, ``removal``, ``misc``). With those, run
+``towncrier create`` from your development environment, which will prompt you
+for the issue number, change type, and the news text::
+
+    towncrier create
+
+Alternatively, you can manually create the files in the ``changes`` directory
+using the naming convention ``{issue-number}.{change-type}.rst``.
+
+See the `towncrier`_ docs for more details.
+
+.. _towncrier: https://towncrier.readthedocs.io/en/stable/tutorial.html
+
 Development best practices, policies and procedures
 ---------------------------------------------------
diff --git a/docs/release-notes.rst b/docs/release-notes.rst
index 08c64eb899..93466a0992 100644
--- a/docs/release-notes.rst
+++ b/docs/release-notes.rst
@@ -3,6 +3,45 @@ Release notes
 
 .. towncrier release notes start
 
+3.0.3 (2025-02-14)
+------------------
+
+Features
+~~~~~~~~
+
+- Improves performance of FsspecStore.delete_dir for remote filesystems supporting concurrent/batched deletes, e.g., s3fs. (:issue:`2661`)
+- Added :meth:`zarr.config.enable_gpu` to update Zarr's configuration to use GPUs. (:issue:`2751`)
+- Avoid reading chunks during writes where possible. :issue:`757` (:issue:`2784`)
+- :py:class:`LocalStore` learned to ``delete_dir``. This makes array and group deletes more efficient. (:issue:`2804`)
+- Add `zarr.testing.strategies.array_metadata` to generate ArrayV2Metadata and ArrayV3Metadata instances. (:issue:`2813`)
+- Add arbitrary `shards` to Hypothesis strategy for generating arrays. (:issue:`2822`)
+
+
+Bugfixes
+~~~~~~~~
+
+- Fixed bug with Zarr using device memory, instead of host memory, for storing metadata when using GPUs. (:issue:`2751`)
+- The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the
+  specified fill value, or with zeros if no fill value is provided.
+  This fixes a bug where Zarr format 2 data with no fill value was written with unpredictable chunk sizes. (:issue:`2755`)
+- Fix zip-store path checking for stores with directories listed as files. (:issue:`2758`)
+- Use removeprefix rather than replace when removing filename prefixes in `FsspecStore.list` (:issue:`2778`)
+- Enable automatic removal of `needs release notes` with labeler action (:issue:`2781`)
+- Use the proper label config (:issue:`2785`)
+- Alters the behavior of ``create_array`` to ensure that any groups implied by the array's name are created if they do not already exist. Also simplifies the type signature for any function that takes an ArrayConfig-like object.
(:issue:`2795`)
+- Initialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types (:issue:`2799`)
+- Ensure utf8 compliant strings are used to construct numpy arrays in property-based tests (:issue:`2801`)
+- Fix pickling for ZipStore (:issue:`2807`)
+- Update numcodecs to never overwrite codec configuration. Closes :issue:`2800`. (:issue:`2811`)
+- Fix fancy indexing (e.g. arr[5, [0, 1]]) with the sharding codec (:issue:`2817`)
+
+
+Improved Documentation
+~~~~~~~~~~~~~~~~~~~~~~
+
+- Added new user guide on :ref:`user-guide-gpu`. (:issue:`2751`)
+
+
 3.0.2 (2025-01-31)
 ------------------
diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index 3662f75dff..91ffe50b91 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -32,6 +32,7 @@ Configuration options include the following:
 - Whether empty chunks are written to storage ``array.write_empty_chunks``
 - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
 - Selections of implementations of codecs, codec pipelines and buffers
+- Enabling GPU support with ``zarr.config.enable_gpu()``. See :ref:`user-guide-gpu` for more.
 
 For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
 first register the implementations in the registry and then select them in the config.
diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
new file mode 100644
index 0000000000..4d3492f8bd
--- /dev/null
+++ b/docs/user-guide/gpu.rst
@@ -0,0 +1,37 @@
+.. _user-guide-gpu:
+
+Using GPUs with Zarr
+====================
+
+Zarr can use GPUs to accelerate your workload by calling
+:meth:`zarr.config.enable_gpu`.
+
+.. note::
+
+   `zarr-python` currently supports reading the ndarray data into device (GPU)
+   memory as the final stage of the codec pipeline. Data will still be read into
+   or copied to host (CPU) memory for encoding and decoding.
+
+   In the future, codecs will be available for compressing and decompressing data
+   on the GPU, avoiding the need to move data between the host and device.
+
+Reading data into device memory
+-------------------------------
+
+:meth:`zarr.config.enable_gpu` configures Zarr to use GPU memory for the data
+buffers used internally by Zarr.
+
+.. code-block:: python
+
+   >>> import zarr
+   >>> import cupy as cp  # doctest: +SKIP
+   >>> zarr.config.enable_gpu()  # doctest: +SKIP
+   >>> store = zarr.storage.MemoryStore()  # doctest: +SKIP
+   >>> z = zarr.create_array(  # doctest: +SKIP
+   ...     store=store, shape=(100, 100), chunks=(10, 10), dtype="float32",
+   ... )
+   >>> type(z[:10, :10])  # doctest: +SKIP
+   cupy.ndarray
+
+Note that the output type is a ``cupy.ndarray`` rather than a NumPy array.
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index a7bbd12453..c50713332b 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -23,6 +23,7 @@ Advanced Topics
    performance
    consolidated_metadata
    extending
+   gpu
 
 ..
Coming soon diff --git a/pyproject.toml b/pyproject.toml index ab285ff7ff..0137927039 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,11 +212,7 @@ dependencies = [ 'typing_extensions @ git+https://github.com/python/typing_extensions', 'donfig @ git+https://github.com/pytroll/donfig', # test deps - 'hypothesis', - 'pytest', - 'pytest-cov', - 'pytest-asyncio', - 'moto[s3]', + 'zarr[test]', ] [tool.hatch.envs.upstream.env-vars] @@ -228,6 +224,9 @@ PIP_PRE = "1" run = "pytest --verbose" run-mypy = "mypy src" run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" +run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-html = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report html --cov=src" list-env = "pip list" [tool.hatch.envs.min_deps] @@ -247,18 +246,16 @@ dependencies = [ 'typing_extensions==4.9.*', 'donfig==0.8.*', # test deps - 'hypothesis', - 'pytest', - 'pytest-cov', - 'pytest-asyncio', - 'moto[s3]', + 'zarr[test]', ] [tool.hatch.envs.min_deps.scripts] run = "pytest --verbose" run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" list-env = "pip list" - +run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-html = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report html --cov=src" [tool.ruff] line-length = 100 diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index fabd042dbe..16400f5f4b 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -357,7 +357,7 @@ async def encode( @abstractmethod async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -379,7 +379,7 @@ async def read( @abstractmethod async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], value: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> None: diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f562a995ce..8c032a7805 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,7 +10,7 @@ from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -857,7 +857,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -1019,7 +1019,7 @@ async def create( mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - config_dict: ArrayConfigLike = {} + config_dict: ArrayConfigParams = {} if write_empty_chunks is not None: if config is not None: diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 20c016ab51..b9f8ac0ae7 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -25,7 +25,7 @@ SerializerLike, ShardsLike, ) - from zarr.core.array_spec import ArrayConfig, ArrayConfigLike + from zarr.core.array_spec import ArrayConfigLike from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ( @@ -625,7 +625,7 @@ def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, **kwargs: Any, ) -> Array: """Create an array. @@ -695,7 +695,7 @@ def create( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. - config : ArrayConfig or ArrayConfigLike, optional + config : ArrayConfigLike, optional Runtime configuration of the array. If provided, will override the default values from `zarr.config.array`. @@ -761,7 +761,7 @@ def create_array( dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> Array: """Create an array. @@ -853,7 +853,7 @@ def create_array( Ignored otherwise. overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. - config : ArrayConfig or ArrayConfigLike, optional + config : ArrayConfigLike, optional Runtime configuration for the array. 
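The extra ``bool`` appended to each ``batch_info`` tuple above is the ``is_complete_chunk`` flag computed by the indexers later in this diff. A minimal sketch of the write-path rule it enables (hypothetical helper, not part of this PR)::

    def needs_read_before_write(
        is_complete_chunk: bool,
        value_shape: tuple[int, ...],
        chunk_shape: tuple[int, ...],
    ) -> bool:
        # A write covering an entire chunk can be encoded directly from the
        # incoming value; only partial-chunk writes must first read the stored
        # chunk and merge the selection into it. The flag replaces the old
        # is_total_slice() inspection of the selection, which is removed from
        # zarr.core.indexing further down.
        return not (is_complete_chunk and value_shape == chunk_shape)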
Returns diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index e8730c86dd..42b1313fac 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -455,8 +455,9 @@ async def _decode_single( chunk_spec, chunk_selection, out_selection, + is_complete_shard, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], out, ) @@ -486,7 +487,7 @@ async def _decode_partial_single( ) indexed_chunks = list(indexer) - all_chunk_coords = {chunk_coords for chunk_coords, _, _ in indexed_chunks} + all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks} # reading bytes of all requested chunks shard_dict: ShardMapping = {} @@ -524,12 +525,17 @@ async def _decode_partial_single( chunk_spec, chunk_selection, out_selection, + is_complete_shard, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], out, ) - return out + + if hasattr(indexer, "sel_shape"): + return out.reshape(indexer.sel_shape) + else: + return out async def _encode_single( self, @@ -558,8 +564,9 @@ async def _encode_single( chunk_spec, chunk_selection, out_selection, + is_complete_shard, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], shard_array, ) @@ -601,8 +608,9 @@ async def _encode_partial_single( chunk_spec, chunk_selection, out_selection, + is_complete_shard, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], shard_array, ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d6a580320d..0e9d74ba8a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -39,6 +39,7 @@ NDBuffer, default_buffer_prototype, ) +from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, @@ -164,19 +165,20 @@ async def get_array_metadata( ) -> dict[str, JSON]: if zarr_format == 2: zarray_bytes, zattrs_bytes = await gather( - (store_path / ZARRAY_JSON).get(), (store_path / ZATTRS_JSON).get() + (store_path / ZARRAY_JSON).get(prototype=cpu_buffer_prototype), + (store_path / ZATTRS_JSON).get(prototype=cpu_buffer_prototype), ) if zarray_bytes is None: raise FileNotFoundError(store_path) elif zarr_format == 3: - zarr_json_bytes = await (store_path / ZARR_JSON).get() + zarr_json_bytes = await (store_path / ZARR_JSON).get(prototype=cpu_buffer_prototype) if zarr_json_bytes is None: raise FileNotFoundError(store_path) elif zarr_format is None: zarr_json_bytes, zarray_bytes, zattrs_bytes = await gather( - (store_path / ZARR_JSON).get(), - (store_path / ZARRAY_JSON).get(), - (store_path / ZATTRS_JSON).get(), + (store_path / ZARR_JSON).get(prototype=cpu_buffer_prototype), + (store_path / ZARRAY_JSON).get(prototype=cpu_buffer_prototype), + (store_path / ZATTRS_JSON).get(prototype=cpu_buffer_prototype), ) if zarr_json_bytes is not None and zarray_bytes is not None: # warn and favor v3 @@ -220,7 +222,7 @@ class AsyncArray(Generic[T_ArrayMetadata]): The metadata of the array. store_path : StorePath The path to the Zarr store. - config : ArrayConfig, optional + config : ArrayConfigLike, optional The runtime configuration of the array, by default None. 
Attributes @@ -245,7 +247,7 @@ def __init__( self: AsyncArray[ArrayV2Metadata], metadata: ArrayV2Metadata | ArrayV2MetadataDict, store_path: StorePath, - config: ArrayConfig | None = None, + config: ArrayConfigLike | None = None, ) -> None: ... @overload @@ -253,14 +255,14 @@ def __init__( self: AsyncArray[ArrayV3Metadata], metadata: ArrayV3Metadata | ArrayV3MetadataDict, store_path: StorePath, - config: ArrayConfig | None = None, + config: ArrayConfigLike | None = None, ) -> None: ... def __init__( self, metadata: ArrayMetadata | ArrayMetadataDict, store_path: StorePath, - config: ArrayConfig | None = None, + config: ArrayConfigLike | None = None, ) -> None: if isinstance(metadata, dict): zarr_format = metadata["zarr_format"] @@ -274,12 +276,11 @@ def __init__( raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3") metadata_parsed = parse_array_metadata(metadata) - - config = ArrayConfig.from_dict({}) if config is None else config + config_parsed = parse_array_config(config) object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) - object.__setattr__(self, "_config", config) + object.__setattr__(self, "_config", config_parsed) object.__setattr__(self, "codec_pipeline", create_codec_pipeline(metadata=metadata_parsed)) # this overload defines the function signature when zarr_format is 2 @@ -303,7 +304,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... # this overload defines the function signature when zarr_format is 3 @@ -332,7 +333,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @@ -360,7 +361,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @@ -394,7 +395,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod @@ -429,7 +430,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Method to create a new asynchronous array instance. @@ -507,7 +508,7 @@ async def create( Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional The data to be inserted into the array (default is None). - config : ArrayConfig or ArrayConfigLike, optional + config : ArrayConfigLike, optional Runtime configuration for the array. Returns @@ -570,7 +571,7 @@ async def _create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Method to create a new asynchronous array instance. See :func:`AsyncArray.create` for more details. 
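With ``config`` widened to the new ``ArrayConfigLike`` alias (``ArrayConfig | ArrayConfigParams``, defined in ``array_spec.py`` below), callers can pass a partial dict directly and ``parse_array_config`` fills in any unset keys from the ``array`` namespace of ``zarr.config``. A minimal sketch (store and values chosen for illustration)::

    import zarr

    # A plain dict with a subset of ArrayConfig's fields is accepted wherever
    # a config is taken; missing keys fall back to zarr.config defaults.
    arr = zarr.create_array(
        {},  # in-memory store, as used in the tests touched below
        shape=(4, 4),
        chunks=(2, 2),
        dtype="i4",
        config={"order": "F", "write_empty_chunks": False},
    )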
@@ -1291,8 +1292,9 @@ async def _get_selection( self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, + is_complete_chunk, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer ], out_buffer, drop_axes=indexer.drop_axes, @@ -1350,7 +1352,7 @@ async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = F """ Asynchronously save the array metadata. """ - to_save = metadata.to_buffer_dict(default_buffer_prototype()) + to_save = metadata.to_buffer_dict(cpu_buffer_prototype) awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] if ensure_parents: @@ -1362,7 +1364,7 @@ async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = F [ (parent.store_path / key).set_if_not_exists(value) for key, value in parent.metadata.to_buffer_dict( - default_buffer_prototype() + cpu_buffer_prototype ).items() ] ) @@ -1420,8 +1422,9 @@ async def _set_selection( self.metadata.get_chunk_spec(chunk_coords, _config, prototype), chunk_selection, out_selection, + is_complete_chunk, ) - for chunk_coords, chunk_selection, out_selection in indexer + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer ], value_buffer, drop_axes=indexer.drop_axes, @@ -1744,7 +1747,7 @@ def create( compressor: dict[str, JSON] | None = None, # runtime overwrite: bool = False, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> Array: """Creates a new Array instance from an initialized store. @@ -1873,7 +1876,7 @@ def _create( compressor: dict[str, JSON] | None = None, # runtime overwrite: bool = False, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, ) -> Array: """Creates a new Array instance from an initialized store. See :func:`Array.create` for more details. @@ -3814,7 +3817,8 @@ async def init_array( chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, overwrite: bool = False, -) -> ArrayV3Metadata | ArrayV2Metadata: + config: ArrayConfigLike | None, +) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: """Create and persist an array metadata document. Parameters @@ -3893,11 +3897,13 @@ async def init_array( Zarr format 3 only. Zarr format 2 arrays should not use this parameter. overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfigLike or None, optional + Configuration for this array. Returns ------- - ArrayV3Metadata | ArrayV2Metadata - The array metadata document. + AsyncArray + The AsyncArray. 
""" if zarr_format is None: @@ -3997,14 +4003,9 @@ async def init_array( attributes=attributes, ) - # save the metadata to disk - # TODO: make this easier -- it should be a simple function call that takes a {key: buffer} - coros = ( - (store_path / key).set(value) - for key, value in meta.to_buffer_dict(default_buffer_prototype()).items() - ) - await gather(*coros) - return meta + arr = AsyncArray(metadata=meta, store_path=store_path, config=config) + await arr._save_metadata(meta, ensure_parents=True) + return arr async def create_array( @@ -4027,7 +4028,7 @@ async def create_array( dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigLike | None = None, + config: ArrayConfigLike | None = None, write_data: bool = True, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -4117,7 +4118,7 @@ async def create_array( Ignored otherwise. overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. - config : ArrayConfig or ArrayConfigLike, optional + config : ArrayConfigLike, optional Runtime configuration for the array. write_data : bool If a pre-existing array-like object was provided to this function via the ``data`` parameter @@ -4143,13 +4144,12 @@ async def create_array( """ mode: Literal["a"] = "a" - config_parsed = parse_array_config(config) store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) data_parsed, shape_parsed, dtype_parsed = _parse_data_params( data=data, shape=shape, dtype=dtype ) - meta = await init_array( + result = await init_array( store_path=store_path, shape=shape_parsed, dtype=dtype_parsed, @@ -4165,9 +4165,9 @@ async def create_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, overwrite=overwrite, + config=config, ) - result = AsyncArray(metadata=meta, store_path=store_path, config=config_parsed) if write_data is True and data_parsed is not None: await result._set_selection( BasicIndexer(..., shape=result.shape, chunk_grid=result.metadata.chunk_grid), diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index b1a6a3cad0..59d3cc6b40 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -21,7 +21,7 @@ from zarr.core.common import ChunkCoords -class ArrayConfigLike(TypedDict): +class ArrayConfigParams(TypedDict): """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -56,13 +56,13 @@ def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) @classmethod - def from_dict(cls, data: ArrayConfigLike) -> Self: + def from_dict(cls, data: ArrayConfigParams) -> Self: """ Create an ArrayConfig from a dict. The keys of that dict are a subset of the attributes of the ArrayConfig class. Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. 
""" - kwargs_out: ArrayConfigLike = {} + kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: @@ -72,7 +72,10 @@ def from_dict(cls, data: ArrayConfigLike) -> Self: return cls(**kwargs_out) -def parse_array_config(data: ArrayConfig | ArrayConfigLike | None) -> ArrayConfig: +ArrayConfigLike = ArrayConfig | ArrayConfigParams + + +def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: """ Convert various types of data to an ArrayConfig. """ diff --git a/src/zarr/core/buffer/gpu.py b/src/zarr/core/buffer/gpu.py index 6941c8897e..aac6792cff 100644 --- a/src/zarr/core/buffer/gpu.py +++ b/src/zarr/core/buffer/gpu.py @@ -13,6 +13,10 @@ from zarr.core.buffer import core from zarr.core.buffer.core import ArrayLike, BufferPrototype, NDArrayLike +from zarr.registry import ( + register_buffer, + register_ndbuffer, +) if TYPE_CHECKING: from collections.abc import Iterable @@ -215,3 +219,6 @@ def __setitem__(self, key: Any, value: Any) -> None: buffer_prototype = BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer) + +register_buffer(Buffer) +register_ndbuffer(NDBuffer) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index a35c5ca210..0c53cda96c 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -16,7 +16,7 @@ ) from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config -from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice +from zarr.core.indexing import SelectorTuple, is_scalar from zarr.core.metadata.v2 import _default_fill_value from zarr.registry import register_pipeline @@ -243,7 +243,7 @@ async def encode_partial_batch( async def read_batch( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], out: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> None: @@ -251,10 +251,10 @@ async def read_batch( chunk_array_batch = await self.decode_partial_batch( [ (byte_getter, chunk_selection, chunk_spec) - for byte_getter, chunk_spec, chunk_selection, _ in batch_info + for byte_getter, chunk_spec, chunk_selection, *_ in batch_info ] ) - for chunk_array, (_, chunk_spec, _, out_selection) in zip( + for chunk_array, (_, chunk_spec, _, out_selection, _) in zip( chunk_array_batch, batch_info, strict=False ): if chunk_array is not None: @@ -263,22 +263,19 @@ async def read_batch( out[out_selection] = fill_value_or_default(chunk_spec) else: chunk_bytes_batch = await concurrent_map( - [ - (byte_getter, array_spec.prototype) - for byte_getter, array_spec, _, _ in batch_info - ], + [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], lambda byte_getter, prototype: byte_getter.get(prototype), config.get("async.concurrency"), ) chunk_array_batch = await self.decode_batch( [ (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, _, _) in zip( + for chunk_bytes, (_, chunk_spec, *_) in zip( chunk_bytes_batch, batch_info, strict=False ) ], ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( chunk_array_batch, batch_info, strict=False ): if chunk_array is not None: @@ -296,9 +293,10 @@ def _merge_chunk_array( out_selection: SelectorTuple, chunk_spec: ArraySpec, chunk_selection: SelectorTuple, + is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape: + if is_complete_chunk and value.shape == chunk_spec.shape: return value if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( @@ -327,7 +325,7 @@ def _merge_chunk_array( async def write_batch( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], value: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> None: @@ -337,14 +335,14 @@ async def write_batch( await self.encode_partial_batch( [ (byte_setter, value, chunk_selection, chunk_spec) - for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info ], ) else: await self.encode_partial_batch( [ (byte_setter, value[out_selection], chunk_selection, chunk_spec) - for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info ], ) @@ -361,10 +359,10 @@ async def _read_key( chunk_bytes_batch = await concurrent_map( [ ( - None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter, + None if is_complete_chunk else byte_setter, chunk_spec.prototype, ) - for byte_setter, chunk_spec, chunk_selection, _ in batch_info + for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info ], _read_key, config.get("async.concurrency"), @@ -372,7 +370,7 @@ async def _read_key( chunk_array_decoded = await self.decode_batch( [ (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, _, _) in zip( + for chunk_bytes, (_, chunk_spec, *_) in zip( chunk_bytes_batch, batch_info, strict=False ) ], @@ -380,14 +378,24 @@ async def _read_key( chunk_array_merged = [ self._merge_chunk_array( - chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes - ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_decoded, batch_info, strict=False + chunk_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, ) + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + ) in zip(chunk_array_decoded, batch_info, strict=False) ] chunk_array_batch: list[NDBuffer | None] = [] - for chunk_array, (_, chunk_spec, _, _) in zip( + for chunk_array, (_, chunk_spec, *_) in zip( chunk_array_merged, batch_info, strict=False ): if chunk_array is None: @@ -403,7 +411,7 @@ async def _read_key( chunk_bytes_batch = await self.encode_batch( [ (chunk_array, chunk_spec) - for chunk_array, (_, chunk_spec, _, _) in zip( + for chunk_array, (_, chunk_spec, *_) in zip( chunk_array_batch, batch_info, strict=False ) ], @@ -418,7 +426,7 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> Non await concurrent_map( [ (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, _, _, _) in zip( + for chunk_bytes, (byte_setter, *_) in zip( chunk_bytes_batch, batch_info, strict=False ) ], @@ -446,7 +454,7 @@ async def encode( async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -461,7 +469,7 @@ async def read( async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], value: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> None: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 051e8c68e1..c565cb0708 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -29,10 +29,13 @@ from __future__ import annotations -from typing import Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast from donfig import Config as DConfig +if TYPE_CHECKING: + from donfig.config_obj import ConfigSet + class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -56,6 +59,14 @@ def reset(self) -> None: self.clear() self.refresh() + def enable_gpu(self) -> ConfigSet: + """ + Configure Zarr to use GPUs where possible. + """ + return self.set( + {"buffer": "zarr.core.buffer.gpu.Buffer", "ndbuffer": "zarr.core.buffer.gpu.NDBuffer"} + ) + # The default configuration for zarr config = Config( diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 41b6c26bb6..998fe156a1 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -321,12 +321,12 @@ class ChunkDimProjection(NamedTuple): Selection of items from chunk array. dim_out_sel Selection of items in target (output) array. - """ dim_chunk_ix: int dim_chunk_sel: Selector dim_out_sel: Selector | None + is_complete_chunk: bool @dataclass(frozen=True) @@ -346,7 +346,8 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel - dim_offset dim_out_sel = None - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + is_complete_chunk = self.dim_chunk_len == 1 + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) @dataclass(frozen=True) @@ -420,7 +421,10 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + is_complete_chunk = ( + dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None] + ) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) def check_selection_length(selection: SelectionNormalized, shape: ChunkCoords) -> None: @@ -493,12 +497,14 @@ class ChunkProjection(NamedTuple): Selection of items from chunk array. out_selection Selection of items in target (output) array. - + is_complete_chunk: + True if a complete chunk is indexed """ chunk_coords: ChunkCoords chunk_selection: tuple[Selector, ...] | npt.NDArray[np.intp] out_selection: tuple[Selector, ...] 
| npt.NDArray[np.intp] | slice + is_complete_chunk: bool def is_slice(s: Any) -> TypeGuard[slice]: @@ -574,8 +580,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: out_selection = tuple( p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + is_complete_chunk = all(p.is_complete_chunk for p in dim_projections) + yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk) @dataclass(frozen=True) @@ -643,8 +649,9 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] stop = self.chunk_nitems_cumsum[dim_chunk_ix] dim_out_sel = slice(start, stop) + is_complete_chunk = False # TODO - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) class Order(Enum): @@ -783,8 +790,8 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel[start:stop] - dim_offset - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + is_complete_chunk = False # TODO + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk) def slice_to_range(s: slice, length: int) -> range: @@ -921,7 +928,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: if not is_basic_selection(out_selection): out_selection = ix_(out_selection, self.shape) - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + is_complete_chunk = all(p.is_complete_chunk for p in dim_projections) + yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk) @dataclass(frozen=True) @@ -1030,8 +1038,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: out_selection = tuple( p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + is_complete_chunk = all(p.is_complete_chunk for p in dim_projections) + yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk) @dataclass(frozen=True) @@ -1198,7 +1206,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True) ) - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + is_complete_chunk = False # TODO + yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk) @dataclass(frozen=True) @@ -1363,32 +1372,6 @@ def c_order_iter(chunks_per_shard: ChunkCoords) -> Iterator[ChunkCoords]: return itertools.product(*(range(x) for x in chunks_per_shard)) -def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: - """Determine whether `item` specifies a complete slice of array with the - given `shape`. 
Used to optimize __setitem__ operations on the Chunk - class.""" - - # N.B., assume shape is normalized - if item == slice(None): - return True - if isinstance(item, slice): - item = (item,) - if isinstance(item, tuple): - return all( - (isinstance(dim_sel, int) and dim_len == 1) - or ( - isinstance(dim_sel, slice) - and ( - (dim_sel == slice(None)) - or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None])) - ) - ) - for dim_sel, dim_len in zip(item, shape, strict=False) - ) - else: - raise TypeError(f"expected slice or tuple of slices, found {item!r}") - - def get_indexer( selection: SelectionWithFields, shape: ChunkCoords, chunk_grid: ChunkGrid ) -> Indexer: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 25697c4545..3d292c81b4 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -353,7 +353,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return dtype.type("nat") elif dtype.kind == "V": if dtype.fields is not None: - default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()]) + default = tuple(_default_fill_value(field[0]) for field in dtype.fields.values()) return np.array([default], dtype=dtype) else: return np.zeros(1, dtype=dtype) diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 92c14fcc76..1cc7039e68 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -1,6 +1,7 @@ from __future__ import annotations import warnings +from contextlib import suppress from typing import TYPE_CHECKING, Any from zarr.abc.store import ( @@ -286,6 +287,19 @@ async def delete(self, key: str) -> None: except self.allowed_exceptions: pass + async def delete_dir(self, prefix: str) -> None: + # docstring inherited + if not self.supports_deletes: + raise NotImplementedError( + "This method is only available for stores that support deletes." 
+ ) + self._check_writable() + + path_to_delete = _dereference_path(self.path, prefix) + + with suppress(*self.allowed_exceptions): + await self.fs._rm(path_to_delete, recursive=True) + async def exists(self, key: str) -> bool: # docstring inherited path = _dereference_path(self.path, key) diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index e9d6211588..5f1a97acd9 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -88,7 +88,7 @@ def log(self, hint: Any = "") -> Generator[None, None, None]: op = f"{type(self._store).__name__}.{method}" if hint: op = f"{op}({hint})" - self.logger.info("Calling %s", op) + self.logger.info(" Calling %s", op) start_time = time.time() try: self.counter[method] += 1 diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 5722f3c99e..8847b49020 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -1,5 +1,5 @@ import sys -from typing import Any +from typing import Any, Literal import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -8,9 +8,13 @@ from hypothesis.strategies import SearchStrategy import zarr -from zarr.abc.store import RangeByteRequest +from zarr.abc.store import RangeByteRequest, Store +from zarr.codecs.bytes import BytesCodec from zarr.core.array import Array +from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import ZarrFormat +from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike from zarr.storage._common import _dereference_path @@ -67,6 +71,11 @@ def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]: ) +def clear_store(x: Store) -> Store: + sync(x.clear()) + return x + + # From https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#node-names # 1. must not be the empty string ("") # 2. must not include the character "/" @@ -85,12 +94,59 @@ def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]: # st.builds will only call a new store constructor for different keyword arguments # i.e. stores.examples() will always return the same object per Store class. # So we map a clear to reset the store. 
-stores = st.builds(MemoryStore, st.just({})).map(lambda x: sync(x.clear())) +stores = st.builds(MemoryStore, st.just({})).map(clear_store) compressors = st.sampled_from([None, "default"]) -zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([2, 3]) +zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([3, 2]) array_shapes = npst.array_shapes(max_dims=4, min_side=0) +@st.composite # type: ignore[misc] +def dimension_names(draw: st.DrawFn, *, ndim: int | None = None) -> list[None | str] | None: + simple_text = st.text(zarr_key_chars, min_size=0) + return draw(st.none() | st.lists(st.none() | simple_text, min_size=ndim, max_size=ndim)) # type: ignore[no-any-return] + + +@st.composite # type: ignore[misc] +def array_metadata( + draw: st.DrawFn, + *, + array_shapes: st.SearchStrategy[tuple[int, ...]] = npst.array_shapes, + zarr_formats: st.SearchStrategy[Literal[2, 3]] = zarr_formats, + attributes: st.SearchStrategy[dict[str, Any]] = attrs, +) -> ArrayV2Metadata | ArrayV3Metadata: + zarr_format = draw(zarr_formats) + # separator = draw(st.sampled_from(['/', '\\'])) + shape = draw(array_shapes()) + ndim = len(shape) + chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) + dtype = draw(v3_dtypes()) + fill_value = draw(npst.from_dtype(dtype)) + if zarr_format == 2: + return ArrayV2Metadata( + shape=shape, + chunks=chunk_shape, + dtype=dtype, + fill_value=fill_value, + order=draw(st.sampled_from(["C", "F"])), + attributes=draw(attributes), + dimension_separator=draw(st.sampled_from([".", "/"])), + filters=None, + compressor=None, + ) + else: + return ArrayV3Metadata( + shape=shape, + data_type=dtype, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + fill_value=fill_value, + attributes=draw(attributes), + dimension_names=draw(dimension_names(ndim=ndim)), + chunk_key_encoding=DefaultChunkKeyEncoding(separator="/"), # FIXME + codecs=[BytesCodec()], + storage_transformers=(), + ) + + @st.composite # type: ignore[misc] def numpy_arrays( draw: st.DrawFn, @@ -110,6 +166,32 @@ def numpy_arrays( return draw(npst.arrays(dtype=dtype, shape=shapes)) +@st.composite # type: ignore[misc] +def chunk_shapes(draw: st.DrawFn, *, shape: tuple[int, ...]) -> tuple[int, ...]: + # We want this strategy to shrink towards arrays with smaller number of chunks + # 1. st.integers() shrinks towards smaller values. So we use that to generate number of chunks + numchunks = draw( + st.tuples(*[st.integers(min_value=0 if size == 0 else 1, max_value=size) for size in shape]) + ) + # 2. and now generate the chunks tuple + return tuple( + size // nchunks if nchunks > 0 else 0 + for size, nchunks in zip(shape, numchunks, strict=True) + ) + + +@st.composite # type: ignore[misc] +def shard_shapes( + draw: st.DrawFn, *, shape: tuple[int, ...], chunk_shape: tuple[int, ...] +) -> tuple[int, ...]: + # We want this strategy to shrink towards arrays with smaller number of shards + # shards must be an integral number of chunks + assert all(c != 0 for c in chunk_shape) + numchunks = tuple(s // c for s, c in zip(shape, chunk_shape, strict=True)) + multiples = tuple(draw(st.integers(min_value=1, max_value=nc)) for nc in numchunks) + return tuple(m * c for m, c in zip(multiples, chunk_shape, strict=True)) + + @st.composite # type: ignore[misc] def np_array_and_chunks( draw: st.DrawFn, *, arrays: st.SearchStrategy[np.ndarray] = numpy_arrays @@ -119,19 +201,7 @@ def np_array_and_chunks( Returns: a tuple of the array and a suitable random chunking for it. 
""" array = draw(arrays) - # We want this strategy to shrink towards arrays with smaller number of chunks - # 1. st.integers() shrinks towards smaller values. So we use that to generate number of chunks - numchunks = draw( - st.tuples( - *[st.integers(min_value=0 if size == 0 else 1, max_value=size) for size in array.shape] - ) - ) - # 2. and now generate the chunks tuple - chunks = tuple( - size // nchunks if nchunks > 0 else 0 - for size, nchunks in zip(array.shape, numchunks, strict=True) - ) - return (array, chunks) + return (array, draw(chunk_shapes(shape=array.shape))) @st.composite # type: ignore[misc] @@ -154,7 +224,12 @@ def arrays( zarr_format = draw(zarr_formats) if arrays is None: arrays = numpy_arrays(shapes=shapes, zarr_formats=st.just(zarr_format)) - nparray, chunks = draw(np_array_and_chunks(arrays=arrays)) + nparray = draw(arrays) + chunk_shape = draw(chunk_shapes(shape=nparray.shape)) + if zarr_format == 3 and all(c > 0 for c in chunk_shape): + shard_shape = draw(st.none() | shard_shapes(shape=nparray.shape, chunk_shape=chunk_shape)) + else: + shard_shape = None # test that None works too. fill_value = draw(st.one_of([st.none(), npst.from_dtype(nparray.dtype)])) # compressor = draw(compressors) @@ -167,7 +242,8 @@ def arrays( a = root.create_array( array_path, shape=nparray.shape, - chunks=chunks, + chunks=chunk_shape, + shards=shard_shape, dtype=nparray.dtype, attributes=attributes, # compressor=compressor, # FIXME @@ -180,7 +256,8 @@ def arrays( assert a.name is not None assert isinstance(root[array_path], Array) assert nparray.shape == a.shape - assert chunks == a.chunks + assert chunk_shape == a.chunks + assert shard_shape == a.shards assert array_path == a.path, (path, name, array_path, a.name, a.path) assert a.basename == name, (a.basename, name) assert dict(a.attrs) == expected_attrs @@ -209,6 +286,43 @@ def basic_indices(draw: st.DrawFn, *, shape: tuple[int], **kwargs: Any) -> Any: ) +@st.composite # type: ignore[misc] +def orthogonal_indices( + draw: st.DrawFn, *, shape: tuple[int] +) -> tuple[tuple[np.ndarray[Any, Any], ...], tuple[np.ndarray[Any, Any], ...]]: + """ + Strategy that returns + (1) a tuple of integer arrays used for orthogonal indexing of Zarr arrays. 
+    (2) a tuple of integer arrays that can be used for equivalent indexing of numpy arrays
+    """
+    zindexer = []
+    npindexer = []
+    ndim = len(shape)
+    for axis, size in enumerate(shape):
+        val = draw(
+            npst.integer_array_indices(
+                shape=(size,), result_shape=npst.array_shapes(min_side=1, max_side=size, max_dims=1)
+            )
+            | basic_indices(min_dims=1, shape=(size,), allow_ellipsis=False)
+            .map(lambda x: (x,) if not isinstance(x, tuple) else x)  # bare ints, slices
+            .filter(lambda x: bool(x))  # skip empty tuple
+        )
+        (idxr,) = val
+        if isinstance(idxr, int):
+            idxr = np.array([idxr])
+        zindexer.append(idxr)
+        if isinstance(idxr, slice):
+            idxr = np.arange(*idxr.indices(size))
+        elif isinstance(idxr, (tuple, int)):
+            idxr = np.array(idxr)
+        newshape = [1] * ndim
+        newshape[axis] = idxr.size
+        npindexer.append(idxr.reshape(newshape))
+
+    # casting the output of broadcast_arrays is needed for numpy 1.25
+    return tuple(zindexer), tuple(np.broadcast_arrays(*npindexer))
+
+
 def key_ranges(
     keys: SearchStrategy = node_names, max_size: int = sys.maxsize
 ) -> SearchStrategy[list[int]]:
diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py
index c7b6e7939c..0a93b93fdb 100644
--- a/src/zarr/testing/utils.py
+++ b/src/zarr/testing/utils.py
@@ -38,7 +38,7 @@ def has_cupy() -> bool:
     return False
 
-T_Callable = TypeVar("T_Callable", bound=Callable[[], Coroutine[Any, Any, None]])
+T_Callable = TypeVar("T_Callable", bound=Callable[..., Coroutine[Any, Any, None] | None])
 
 # Decorator for GPU tests
diff --git a/tests/test_api.py b/tests/test_api.py
index 2cfed2c52a..341245404c 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -28,6 +28,7 @@
 from zarr.errors import MetadataValidationError
 from zarr.storage import MemoryStore
 from zarr.storage._utils import normalize_path
+from zarr.testing.utils import gpu_test
 
 
 def test_create(memory_store: Store) -> None:
@@ -1132,3 +1133,40 @@ def test_open_array_with_mode_r_plus(store: Store) -> None:
     assert isinstance(result, NDArrayLike)
     assert (result == 1).all()
     z2[:] = 3
+
+
+@gpu_test
+@pytest.mark.parametrize(
+    "store",
+    ["local", "memory", "zip"],
+    indirect=True,
+)
+@pytest.mark.parametrize("zarr_format", [None, 2, 3])
+def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None:
+    import cupy as cp
+
+    if zarr_format == 2:
+        # Without this, the zstd codec attempts to convert the cupy
+        # array to bytes.
+ compressors = None + else: + compressors = "auto" + + with zarr.config.enable_gpu(): + src = cp.random.uniform(size=(100, 100)) # allocate on the device + z = zarr.create_array( + store, + name="a", + shape=src.shape, + chunks=(10, 10), + dtype=src.dtype, + overwrite=True, + zarr_format=zarr_format, + compressors=compressors, + ) + z[:10, :10] = src[:10, :10] + + result = z[:10, :10] + # assert_array_equal doesn't check the type + assert isinstance(result, type(src)) + cp.testing.assert_array_equal(result, src[:10, :10]) diff --git a/tests/test_array.py b/tests/test_array.py index 8ff39de70a..c9b2cbce91 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -849,66 +849,6 @@ def test_append_bad_shape(store: MemoryStore, zarr_format: ZarrFormat) -> None: z.append(b) -@pytest.mark.parametrize("order", ["C", "F", None]) -@pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_array_create_metadata_order_v2( - order: MemoryOrder | None, zarr_format: int, store: MemoryStore -) -> None: - """ - Test that the ``order`` attribute in zarr v2 array metadata is set correctly via the ``order`` - keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the - ``array.order`` config is used. - """ - arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") - - expected = order or zarr.config.get("array.order") - assert arr.metadata.zarr_format == 2 # guard for mypy - assert arr.metadata.order == expected - - -@pytest.mark.parametrize("order_config", ["C", "F", None]) -@pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_array_create_order( - order_config: MemoryOrder | None, - zarr_format: ZarrFormat, - store: MemoryStore, -) -> None: - """ - Test that the arrays generated by array indexing have a memory order defined by the config order - value - """ - config: ArrayConfigLike = {} - if order_config is None: - config = {} - expected = zarr.config.get("array.order") - else: - config = {"order": order_config} - expected = order_config - - arr = zarr.create_array( - store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config - ) - - vals = np.asarray(arr) - if expected == "C": - assert vals.flags.c_contiguous - elif expected == "F": - assert vals.flags.f_contiguous - else: - raise AssertionError - - -@pytest.mark.parametrize("write_empty_chunks", [True, False]) -def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: - """ - Test that the value of write_empty_chunks is sensitive to the global config when not set - explicitly - """ - with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): - arr = zarr.create_array({}, shape=(2, 2), dtype="i4") - assert arr._async_array._config.write_empty_chunks == write_empty_chunks - - @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("write_empty_chunks", [True, False]) @pytest.mark.parametrize("fill_value", [0, 5]) @@ -1012,339 +952,396 @@ def test_auto_partition_auto_shards( assert auto_shards == expected_shards -def test_chunks_and_shards() -> None: - store = StorePath(MemoryStore()) - shape = (100, 100) - chunks = (5, 5) - shards = (10, 10) - - arr_v3 = zarr.create_array(store=store / "v3", shape=shape, chunks=chunks, dtype="i4") - assert arr_v3.chunks == chunks - assert arr_v3.shards is None - - arr_v3_sharding = zarr.create_array( - store=store / "v3_sharding", - shape=shape, - chunks=chunks, - shards=shards, - dtype="i4", - ) - assert arr_v3_sharding.chunks == chunks - assert 
arr_v3_sharding.shards == shards
-
-    arr_v2 = zarr.create_array(
-        store=store / "v2", shape=shape, chunks=chunks, zarr_format=2, dtype="i4"
-    )
-    assert arr_v2.chunks == chunks
-    assert arr_v2.shards is None
-
-
-def test_create_array_default_fill_values() -> None:
-    a = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype="<U4")
-    assert a.fill_value == ""
-
-    b = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype="<S4")
-    assert b.fill_value == b""
-
-    c = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype="<i4")
-    assert c.fill_value == 0
-
-    d = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype="<f8")
-    assert d.fill_value == 0.0
-
-
-@pytest.mark.parametrize("store", ["memory"], indirect=True)
-@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
-@pytest.mark.parametrize("empty_value", [None, ()])
-async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None:
-    """
-    Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
-    """
+class TestCreateArray:
+    @staticmethod
+    def test_chunks_and_shards(store: Store) -> None:
+        spath = StorePath(store)
+        shape = (100, 100)
+        chunks = (5, 5)
+        shards = (10, 10)
+
+        arr_v3 = zarr.create_array(store=spath / "v3", shape=shape, chunks=chunks, dtype="i4")
+        assert arr_v3.chunks == chunks
+        assert arr_v3.shards is None
+
+        arr_v3_sharding = zarr.create_array(
+            store=spath / "v3_sharding",
+            shape=shape,
+            chunks=chunks,
+            shards=shards,
+            dtype="i4",
+        )
+        assert arr_v3_sharding.chunks == chunks
+        assert arr_v3_sharding.shards == shards
+
+        arr_v2 = zarr.create_array(
+            store=spath / "v2", shape=shape, chunks=chunks, zarr_format=2, dtype="i4"
+        )
+        assert arr_v2.chunks == chunks
+        assert arr_v2.shards is None
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        ("dtype", "fill_value_expected"), [("<U4", ""), ("<S4", b""), ("<i4", 0), ("<f8", 0.0)]
+    )
+    def test_default_fill_values(dtype: str, fill_value_expected: Any, store: Store) -> None:
+        a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype)
+        assert a.fill_value == fill_value_expected
+
+    @staticmethod
+    @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
+    @pytest.mark.parametrize("empty_value", [None, ()])
+    async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None:
+        """
+        Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
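+        An empty value here is ``None`` or ``()``, per the ``empty_value`` parametrization.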
+ """ + + # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=empty_value, + filters=empty_value, + ) + # Test metadata explicitly + assert arr.metadata.zarr_format == 2 # guard for mypy + # The v2 metadata stores None and () separately + assert arr.metadata.filters == empty_value + # The v2 metadata does not allow tuple for compressor, therefore it is turned into None + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () + + # v3 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + compressors=empty_value, + filters=empty_value, + ) + assert arr.metadata.zarr_format == 3 # guard for mypy + if dtype == "str": + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() + else: + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() + + @staticmethod + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + (), + (ZstdCodec(level=3),), + (ZstdCodec(level=3), GzipCodec(level=0)), + ZstdCodec(level=3), + {"name": "zstd", "configuration": {"level": 3}}, + ({"name": "zstd", "configuration": {"level": 3}},), + ], ) - assert arr.metadata.zarr_format == 3 # guard for mypy - if dtype == "str": - assert arr.metadata.codecs == (VLenUTF8Codec(),) - assert arr.serializer == VLenUTF8Codec() - else: - assert arr.metadata.codecs == (BytesCodec(),) - assert arr.serializer == BytesCodec() - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) -@pytest.mark.parametrize( - "compressors", - [ - "auto", - None, - (), - (ZstdCodec(level=3),), - (ZstdCodec(level=3), GzipCodec(level=0)), - ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), - ], -) -@pytest.mark.parametrize( - "filters", - [ - "auto", - None, - (), - ( - TransposeCodec( - order=[ - 0, - ] + @pytest.mark.parametrize( + "filters", + [ + "auto", + None, + (), + ( + TransposeCodec( + order=[ + 0, + ] + ), ), - ), - ( - TransposeCodec( - order=[ - 0, - ] + ( + TransposeCodec( + order=[ + 0, + ] + ), + TransposeCodec( + order=[ + 0, + ] + ), ), TransposeCodec( order=[ 0, ] ), - ), - TransposeCodec( - order=[ - 0, - ] - ), - {"name": "transpose", "configuration": {"order": [0]}}, - ({"name": "transpose", "configuration": {"order": [0]}},), - ], -) -@pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) -async def test_create_array_v3_chunk_encoding( - store: MemoryStore, - compressors: CompressorsLike, - filters: FiltersLike, - dtype: str, - chunks: tuple[int, ...], - shards: tuple[int, ...] 
| None, -) -> None: - """ - Test various possibilities for the compressors and filters parameter to create_array - """ - arr = await create_array( - store=store, - dtype=dtype, - shape=(12,), - chunks=chunks, - shards=shards, - zarr_format=3, - filters=filters, - compressors=compressors, - ) - filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( - filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) - ) - assert arr.filters == filters_expected - assert arr.compressors == compressors_expected - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) -@pytest.mark.parametrize( - "compressors", - [ - "auto", - None, - numcodecs.Zstd(level=3), - (), - (numcodecs.Zstd(level=3),), - ], -) -@pytest.mark.parametrize( - "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] -) -async def test_create_array_v2_chunk_encoding( - store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str -) -> None: - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - zarr_format=2, - compressors=compressors, - filters=filters, - ) - filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=np.dtype(dtype) - ) - assert arr.metadata.zarr_format == 2 # guard for mypy - assert arr.metadata.compressor == compressor_expected - assert arr.metadata.filters == filters_expected - - # Normalize for property getters - compressor_expected = () if compressor_expected is None else (compressor_expected,) - filters_expected = () if filters_expected is None else filters_expected - - assert arr.compressors == compressor_expected - assert arr.filters == filters_expected - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) -async def test_create_array_v3_default_filters_compressors(store: MemoryStore, dtype: str) -> None: - """ - Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with - ``zarr_format`` = 3 and ``filters`` and ``compressors`` are not specified. - """ - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - zarr_format=3, + {"name": "transpose", "configuration": {"order": [0]}}, + ({"name": "transpose", "configuration": {"order": [0]}},), + ], ) - expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3( - np_dtype=np.dtype(dtype) - ) - assert arr.filters == expected_filters - assert arr.serializer == expected_serializer - assert arr.compressors == expected_compressors - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) -async def test_create_array_v2_default_filters_compressors(store: MemoryStore, dtype: str) -> None: - """ - Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with - ``zarr_format`` = 2 and ``filters`` and ``compressors`` are not specified. 
- """ - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - zarr_format=2, - ) - expected_filters, expected_compressors = _get_default_chunk_encoding_v2( - np_dtype=np.dtype(dtype) + @pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) + async def test_v3_chunk_encoding( + store: MemoryStore, + compressors: CompressorsLike, + filters: FiltersLike, + dtype: str, + chunks: tuple[int, ...], + shards: tuple[int, ...] | None, + ) -> None: + """ + Test various possibilities for the compressors and filters parameter to create_array + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(12,), + chunks=chunks, + shards=shards, + zarr_format=3, + filters=filters, + compressors=compressors, + ) + filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( + filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) + ) + assert arr.filters == filters_expected + assert arr.compressors == compressors_expected + + @staticmethod + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + numcodecs.Zstd(level=3), + (), + (numcodecs.Zstd(level=3),), + ], ) - assert arr.metadata.zarr_format == 2 # guard for mypy - assert arr.metadata.filters == expected_filters - assert arr.metadata.compressor == expected_compressors - - # Normalize for property getters - expected_filters = () if expected_filters is None else expected_filters - expected_compressors = () if expected_compressors is None else (expected_compressors,) - assert arr.filters == expected_filters - assert arr.compressors == expected_compressors - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_v2_no_shards(store: MemoryStore) -> None: - """ - Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. - """ - msg = re.escape( - "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." + @pytest.mark.parametrize( + "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] ) - with pytest.raises(ValueError, match=msg): - _ = await create_array( + async def test_v2_chunk_encoding( + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str + ) -> None: + arr = await create_array( store=store, - dtype="uint8", + dtype=dtype, shape=(10,), - shards=(5,), zarr_format=2, + compressors=compressors, + filters=filters, ) - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("impl", ["sync", "async"]) -async def test_create_array_data(impl: Literal["sync", "async"], store: Store) -> None: - """ - Test that we can invoke ``create_array`` with a ``data`` parameter. 
- """ - data = np.arange(10) - name = "foo" - arr: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array - if impl == "sync": - arr = sync_api.create_array(store, name=name, data=data) - stored = arr[:] - elif impl == "async": - arr = await create_array(store, name=name, data=data, zarr_format=3) - stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), - prototype=default_buffer_prototype(), + filters_expected, compressor_expected = _parse_chunk_encoding_v2( + filters=filters, compressor=compressors, dtype=np.dtype(dtype) ) - else: - raise ValueError(f"Invalid impl: {impl}") - - assert np.array_equal(stored, data) - + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.compressor == compressor_expected + assert arr.metadata.filters == filters_expected -@pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_data_invalid_params(store: Store) -> None: - """ - Test that failing to specify data AND shape / dtype results in a ValueError - """ - with pytest.raises(ValueError, match="shape was not specified"): - await create_array(store, data=None, shape=None, dtype=None) + # Normalize for property getters + compressor_expected = () if compressor_expected is None else (compressor_expected,) + filters_expected = () if filters_expected is None else filters_expected - # we catch shape=None first, so specifying a dtype should raise the same exception as before - with pytest.raises(ValueError, match="shape was not specified"): - await create_array(store, data=None, shape=None, dtype="uint8") - - with pytest.raises(ValueError, match="dtype was not specified"): - await create_array(store, data=None, shape=(10, 10)) + assert arr.compressors == compressor_expected + assert arr.filters == filters_expected + @staticmethod + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + async def test_default_filters_compressors( + store: MemoryStore, dtype: str, zarr_format: ZarrFormat + ) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=zarr_format, + ) + if zarr_format == 3: + expected_filters, expected_serializer, expected_compressors = ( + _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) + ) -@pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_data_ignored_params(store: Store) -> None: - """ - Test that specify data AND shape AND dtype results in a warning - """ - data = np.arange(10) - with pytest.raises( - ValueError, match="The data parameter was used, but the shape parameter was also used." 
-    ):
-        await create_array(store, data=data, shape=data.shape, dtype=None, overwrite=True)
+        elif zarr_format == 2:
+            default_filters, default_compressors = _get_default_chunk_encoding_v2(
+                np_dtype=np.dtype(dtype)
+            )
+            if default_filters is None:
+                expected_filters = ()
+            else:
+                expected_filters = default_filters
+            if default_compressors is None:
+                expected_compressors = ()
+            else:
+                expected_compressors = (default_compressors,)
+            expected_serializer = None
+        else:
+            raise ValueError(f"Invalid zarr_format: {zarr_format}")
+
+        assert arr.filters == expected_filters
+        assert arr.serializer == expected_serializer
+        assert arr.compressors == expected_compressors
+
+    @staticmethod
+    async def test_v2_no_shards(store: Store) -> None:
+        """
+        Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error.
+        """
+        msg = re.escape(
+            "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead."
+        )
+        with pytest.raises(ValueError, match=msg):
+            _ = await create_array(
+                store=store,
+                dtype="uint8",
+                shape=(10,),
+                shards=(5,),
+                zarr_format=2,
+            )
-
-    # we catch shape first, so specifying a dtype should raise the same warning as before
-    with pytest.raises(
-        ValueError, match="The data parameter was used, but the shape parameter was also used."
-    ):
-        await create_array(store, data=data, shape=data.shape, dtype=data.dtype, overwrite=True)
+
+    @staticmethod
+    @pytest.mark.parametrize("impl", ["sync", "async"])
+    async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None:
+        """
+        Test that we can invoke ``create_array`` with a ``data`` parameter.
+        """
+        data = np.arange(10)
+        name = "foo"
+        arr: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array
+        if impl == "sync":
+            arr = sync_api.create_array(store, name=name, data=data)
+            stored = arr[:]
+        elif impl == "async":
+            arr = await create_array(store, name=name, data=data, zarr_format=3)
+            stored = await arr._get_selection(
+                BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid),
+                prototype=default_buffer_prototype(),
+            )
+        else:
+            raise ValueError(f"Invalid impl: {impl}")
+
+        assert np.array_equal(stored, data)
+
+    @staticmethod
+    async def test_with_data_invalid_params(store: Store) -> None:
+        """
+        Test that failing to specify data AND shape / dtype results in a ValueError
+        """
+        with pytest.raises(ValueError, match="shape was not specified"):
+            await create_array(store, data=None, shape=None, dtype=None)
+
+        # we catch shape=None first, so specifying a dtype should raise the same exception as before
+        with pytest.raises(ValueError, match="shape was not specified"):
+            await create_array(store, data=None, shape=None, dtype="uint8")
+
+        with pytest.raises(ValueError, match="dtype was not specified"):
+            await create_array(store, data=None, shape=(10, 10))
+
+    @staticmethod
+    async def test_data_ignored_params(store: Store) -> None:
+        """
+        Test that specifying data AND shape AND dtype results in a ValueError
+        """
+        data = np.arange(10)
+        with pytest.raises(
+            ValueError, match="The data parameter was used, but the shape parameter was also used."
+        ):
+            await create_array(store, data=data, shape=data.shape, dtype=None, overwrite=True)
+
+        # we catch shape first, so specifying a dtype should raise the same error as before
+        with pytest.raises(
+            ValueError, match="The data parameter was used, but the shape parameter was also used."
+ ): + await create_array(store, data=data, shape=data.shape, dtype=data.dtype, overwrite=True) + + with pytest.raises( + ValueError, match="The data parameter was used, but the dtype parameter was also used." + ): + await create_array(store, data=data, shape=None, dtype=data.dtype, overwrite=True) + + @staticmethod + @pytest.mark.parametrize("order_config", ["C", "F", None]) + def test_order( + order_config: MemoryOrder | None, + zarr_format: ZarrFormat, + store: MemoryStore, + ) -> None: + """ + Test that the arrays generated by array indexing have a memory order defined by the config order + value, and that for zarr v2 arrays, the ``order`` field in the array metadata is set correctly. + """ + config: ArrayConfigLike = {} + if order_config is None: + config = {} + expected = zarr.config.get("array.order") + else: + config = {"order": order_config} + expected = order_config + if zarr_format == 2: + arr = zarr.create_array( + store=store, + shape=(2, 2), + zarr_format=zarr_format, + dtype="i4", + order=expected, + config=config, + ) + # guard for type checking + assert arr.metadata.zarr_format == 2 + assert arr.metadata.order == expected + else: + arr = zarr.create_array( + store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config + ) + vals = np.asarray(arr) + if expected == "C": + assert vals.flags.c_contiguous + elif expected == "F": + assert vals.flags.f_contiguous + else: + raise AssertionError + + @staticmethod + @pytest.mark.parametrize("write_empty_chunks", [True, False]) + async def test_write_empty_chunks_config(write_empty_chunks: bool, store: Store) -> None: + """ + Test that the value of write_empty_chunks is sensitive to the global config when not set + explicitly + """ + with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): + arr = await create_array(store, shape=(2, 2), dtype="i4") + assert arr._config.write_empty_chunks == write_empty_chunks + + @staticmethod + @pytest.mark.parametrize("path", [None, "", "/", "/foo", "foo", "foo/bar"]) + async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) -> None: + arr = await create_array( + store, shape=(2, 2), dtype="i4", name=path, zarr_format=zarr_format + ) + if path is None: + expected_path = "" + elif path.startswith("/"): + expected_path = path.lstrip("/") + else: + expected_path = path + assert arr.path == expected_path + assert arr.name == "/" + expected_path - with pytest.raises( - ValueError, match="The data parameter was used, but the dtype parameter was also used." 
- ): - await create_array(store, data=data, shape=None, dtype=data.dtype, overwrite=True) + # test that implicit groups were created + path_parts = expected_path.split("/") + if len(path_parts) > 1: + *parents, _ = ["", *accumulate(path_parts, lambda x, y: "/".join([x, y]))] # noqa: FLY002 + for parent_path in parents: + # this will raise if these groups were not created + _ = await zarr.api.asynchronous.open_group( + store=store, path=parent_path, mode="r", zarr_format=zarr_format + ) @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) @@ -1361,9 +1358,18 @@ async def test_orthogonal_set_total_slice() -> None: """Ensure that a whole chunk overwrite does not read chunks""" store = MemoryStore() array = zarr.create_array(store, shape=(20, 20), chunks=(1, 2), dtype=int, fill_value=-1) - with mock.patch("zarr.storage.MemoryStore.get", side_effect=ValueError): + with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError): array[0, slice(4, 10)] = np.arange(6) + array = zarr.create_array( + store, shape=(20, 21), chunks=(1, 2), dtype=int, fill_value=-1, overwrite=True + ) + with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError): + array[0, :] = np.arange(21) + + with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError): + array[:] = 1 + @pytest.mark.skipif( Version(numcodecs.__version__) < Version("0.15.1"), @@ -1443,3 +1449,18 @@ def test_multiprocessing(store: Store, method: Literal["fork", "spawn", "forkser results = pool.starmap(_index_array, [(arr, slice(len(data)))]) assert all(np.array_equal(r, data) for r in results) + + +async def test_sharding_coordinate_selection() -> None: + store = MemoryStore() + g = zarr.open_group(store, mode="w") + arr = g.create_array( + name="a", + shape=(2, 3, 4), + chunks=(1, 2, 2), + overwrite=True, + dtype=np.float32, + shards=(2, 4, 4), + ) + arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() # type: ignore[index] diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 932c32f1ae..30d0d75f22 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -19,7 +19,6 @@ OrthogonalSelection, Selection, _iter_grid, - is_total_slice, make_slice_selection, normalize_integer_selection, oindex, @@ -1954,8 +1953,3 @@ def test_vectorized_indexing_incompatible_shape(store) -> None: ) with pytest.raises(ValueError, match="Attempting to set"): arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]]) - - -def test_is_total_slice(): - assert is_total_slice((0, slice(4, 6)), (1, 2)) - assert is_total_slice((slice(0, 1, None), slice(4, 6)), (1, 2)) diff --git a/tests/test_properties.py b/tests/test_properties.py index 2e60c951dd..acecd44810 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,14 +1,26 @@ -import numpy as np import pytest from numpy.testing import assert_array_equal +from zarr.core.buffer import default_buffer_prototype + pytest.importorskip("hypothesis") import hypothesis.extra.numpy as npst import hypothesis.strategies as st -from hypothesis import given - -from zarr.testing.strategies import arrays, basic_indices, numpy_arrays, zarr_formats +from hypothesis import assume, given + +from zarr.abc.store import Store +from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.sync import sync +from zarr.testing.strategies import ( + array_metadata, + arrays, + basic_indices, + numpy_arrays, + orthogonal_indices, + stores, + zarr_formats, +) 
@given(data=st.data(), zarr_format=zarr_formats) @@ -18,6 +30,24 @@ def test_roundtrip(data: st.DataObject, zarr_format: int) -> None: assert_array_equal(nparray, zarray[:]) +@given(array=arrays()) +def test_array_creates_implicit_groups(array): + path = array.path + ancestry = path.split("/")[:-1] + for i in range(len(ancestry)): + parent = "/".join(ancestry[: i + 1]) + if array.metadata.zarr_format == 2: + assert ( + sync(array.store.get(f"{parent}/.zgroup", prototype=default_buffer_prototype())) + is not None + ) + elif array.metadata.zarr_format == 3: + assert ( + sync(array.store.get(f"{parent}/zarr.json", prototype=default_buffer_prototype())) + is not None + ) + + @given(data=st.data()) def test_basic_indexing(data: st.DataObject) -> None: zarray = data.draw(arrays()) @@ -26,12 +56,29 @@ def test_basic_indexing(data: st.DataObject) -> None: actual = zarray[indexer] assert_array_equal(nparray[indexer], actual) - new_data = np.ones_like(actual) + new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) zarray[indexer] = new_data nparray[indexer] = new_data assert_array_equal(nparray, zarray[:]) +@given(data=st.data()) +def test_oindex(data: st.DataObject) -> None: + # integer_array_indices can't handle 0-size dimensions. + zarray = data.draw(arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) + nparray = zarray[:] + + zindexer, npindexer = data.draw(orthogonal_indices(shape=nparray.shape)) + actual = zarray.oindex[zindexer] + assert_array_equal(nparray[npindexer], actual) + + assume(zarray.shards is None) # GH2834 + new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) + nparray[npindexer] = new_data + zarray.oindex[zindexer] = new_data + assert_array_equal(nparray, zarray[:]) + + @given(data=st.data()) def test_vindex(data: st.DataObject) -> None: # integer_array_indices can't handle 0-size dimensions. @@ -46,6 +93,25 @@ def test_vindex(data: st.DataObject) -> None: actual = zarray.vindex[indexer] assert_array_equal(nparray[indexer], actual) + # FIXME! + # when the indexer is such that a value gets overwritten multiple times, + # I think the output depends on chunking. 
+ # new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) + # nparray[indexer] = new_data + # zarray.vindex[indexer] = new_data + # assert_array_equal(nparray, zarray[:]) + + +@given(store=stores, meta=array_metadata()) # type: ignore[misc] +async def test_roundtrip_array_metadata( + store: Store, meta: ArrayV2Metadata | ArrayV3Metadata +) -> None: + asdict = meta.to_buffer_dict(prototype=default_buffer_prototype()) + for key, expected in asdict.items(): + await store.set(f"0/{key}", expected) + actual = await store.get(f"0/{key}", prototype=default_buffer_prototype()) + assert actual == expected + # @st.composite # def advanced_indices(draw, *, shape): diff --git a/tests/test_store/test_fsspec.py b/tests/test_store/test_fsspec.py index 929de37869..2e9620f29d 100644 --- a/tests/test_store/test_fsspec.py +++ b/tests/test_store/test_fsspec.py @@ -110,7 +110,12 @@ class TestFsspecStoreS3(StoreTests[FsspecStore, cpu.Buffer]): @pytest.fixture def store_kwargs(self, request) -> dict[str, str | bool]: - fs, path = fsspec.url_to_fs( + try: + from fsspec import url_to_fs + except ImportError: + # before fsspec==2024.3.1 + from fsspec.core import url_to_fs + fs, path = url_to_fs( f"s3://{test_bucket_name}", endpoint_url=endpoint_url, anon=False, asynchronous=True ) return {"fs": fs, "path": path} @@ -182,6 +187,10 @@ async def test_fsspec_store_from_uri(self, store: FsspecStore) -> None: ) assert dict(group.attrs) == {"key": "value-3"} + @pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.03.01"), + reason="Prior bug in from_upath", + ) def test_from_upath(self) -> None: upath = pytest.importorskip("upath") path = upath.UPath( @@ -204,7 +213,12 @@ def test_init_raises_if_path_has_scheme(self, store_kwargs) -> None: self.store_cls(**store_kwargs) def test_init_warns_if_fs_asynchronous_is_false(self) -> None: - fs, path = fsspec.url_to_fs( + try: + from fsspec import url_to_fs + except ImportError: + # before fsspec==2024.3.1 + from fsspec.core import url_to_fs + fs, path = url_to_fs( f"s3://{test_bucket_name}", endpoint_url=endpoint_url, anon=False, asynchronous=False ) store_kwargs = {"fs": fs, "path": path} @@ -217,6 +231,14 @@ async def test_empty_nonexistent_path(self, store_kwargs) -> None: store = await self.store_cls.open(**store_kwargs) assert await store.is_empty("") + async def test_delete_dir_unsupported_deletes(self, store: FsspecStore) -> None: + store.supports_deletes = False + with pytest.raises( + NotImplementedError, + match="This method is only available for stores that support deletes.", + ): + await store.delete_dir("test_prefix") + @pytest.mark.skipif( parse_version(fsspec.__version__) < parse_version("2024.12.0"), @@ -244,3 +266,28 @@ def test_no_wrap_async_filesystem(): assert not isinstance(store.fs, AsyncFileSystemWrapper) assert store.fs.async_impl + + +@pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.12.0"), + reason="No AsyncFileSystemWrapper", +) +async def test_delete_dir_wrapped_filesystem(tmpdir) -> None: + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + from fsspec.implementations.local import LocalFileSystem + + wrapped_fs = AsyncFileSystemWrapper(LocalFileSystem(auto_mkdir=True)) + store = FsspecStore(wrapped_fs, read_only=False, path=f"{tmpdir}/test/path") + + assert isinstance(store.fs, AsyncFileSystemWrapper) + assert store.fs.asynchronous + + await store.set("zarr.json", cpu.Buffer.from_bytes(b"root")) + await store.set("foo-bar/zarr.json", 
cpu.Buffer.from_bytes(b"root")) + await store.set("foo/zarr.json", cpu.Buffer.from_bytes(b"bar")) + await store.set("foo/c/0", cpu.Buffer.from_bytes(b"chunk")) + await store.delete_dir("foo") + assert await store.exists("zarr.json") + assert await store.exists("foo-bar/zarr.json") + assert not await store.exists("foo/zarr.json") + assert not await store.exists("foo/c/0")