diff --git a/src/zarr/v2/__init__.py b/src/zarr/v2/__init__.py deleted file mode 100644 index 27c7595580..0000000000 --- a/src/zarr/v2/__init__.py +++ /dev/null @@ -1,54 +0,0 @@ -# flake8: noqa -from zarr.v2.codecs import * -from zarr.v2.convenience import ( - consolidate_metadata, - copy, - copy_all, - copy_store, - load, - open, - open_consolidated, - save, - save_array, - save_group, - tree, -) -from zarr.v2.core import Array -from zarr.v2.creation import ( - array, - create, - empty, - empty_like, - full, - full_like, - ones, - ones_like, - open_array, - open_like, - zeros, - zeros_like, -) -from zarr.v2.errors import CopyError, MetadataError -from zarr.v2.hierarchy import Group, group, open_group -from zarr.v2.n5 import N5Store, N5FSStore -from zarr.v2.storage import ( - ABSStore, - DBMStore, - DictStore, - DirectoryStore, - KVStore, - LMDBStore, - LRUStoreCache, - MemoryStore, - MongoDBStore, - NestedDirectoryStore, - RedisStore, - SQLiteStore, - TempStore, - ZipStore, -) -from zarr.v2.sync import ProcessSynchronizer, ThreadSynchronizer -from zarr._version import version as __version__ - -# in case setuptools scm screw up and find version to be 0.0.0 -assert not __version__.startswith("0.0.0") diff --git a/src/zarr/v2/_storage/__init__.py b/src/zarr/v2/_storage/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/zarr/v2/_storage/absstore.py b/src/zarr/v2/_storage/absstore.py deleted file mode 100644 index c04ad240da..0000000000 --- a/src/zarr/v2/_storage/absstore.py +++ /dev/null @@ -1,224 +0,0 @@ -"""This module contains storage classes related to Azure Blob Storage (ABS)""" - -import warnings -from numcodecs.compat import ensure_bytes -from zarr.v2.util import normalize_storage_path -from zarr.v2._storage.store import Store - -__doctest_requires__ = { - ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], -} - - -class ABSStore(Store): - """Storage class using Azure Blob Storage (ABS). - - Parameters - ---------- - container : string - The name of the ABS container to use. - - .. deprecated:: - Use ``client`` instead. - - prefix : string - Location of the "directory" to use as the root of the storage hierarchy - within the container. - - account_name : string - The Azure blob storage account name. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - account_key : string - The Azure blob storage account access key. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - blob_service_kwargs : dictionary - Extra arguments to be passed into the azure blob client, for e.g. when - using the emulator, pass in blob_service_kwargs={'is_emulated': True}. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - client : azure.storage.blob.ContainerClient, optional - And ``azure.storage.blob.ContainerClient`` to connect with. See - `here `_ # noqa - for more. - - .. versionadded:: 2.8.3 - - Notes - ----- - In order to use this store, you must install the Microsoft Azure Storage SDK for Python, - ``azure-storage-blob>=12.5.0``. 
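A minimal construction sketch, assuming the azure-storage-blob SDK is installed; the connection string, container name and prefix below are hypothetical placeholders::

    import zarr.v2
    from azure.storage.blob import ContainerClient

    # from_connection_string is part of the public azure-storage-blob API;
    # the connection string here is a placeholder, not a working credential
    client = ContainerClient.from_connection_string(
        "DefaultEndpointsProtocol=https;AccountName=demo;AccountKey=secret",
        container_name="mycontainer",
    )
    store = zarr.v2.ABSStore(client=client, prefix="example")
    z = zarr.v2.open(store, mode="w", shape=(100,), chunks=(10,), dtype="i4")
    z[:] = 42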
- """ - - def __init__( - self, - container=None, - prefix="", - account_name=None, - account_key=None, - blob_service_kwargs=None, - dimension_separator=None, - client=None, - ): - self._dimension_separator = dimension_separator - self.prefix = normalize_storage_path(prefix) - if client is None: - # deprecated option, try to construct the client for them - msg = ( - "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs'" - "is deprecated. Provide and instance of 'azure.storage.blob.ContainerClient' " - "'client' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - from azure.storage.blob import ContainerClient - - blob_service_kwargs = blob_service_kwargs or {} - client = ContainerClient( - "https://{}.blob.core.windows.net/".format(account_name), - container, - credential=account_key, - **blob_service_kwargs, - ) - - self.client = client - self._container = container - self._account_name = account_name - self._account_key = account_key - - @staticmethod - def _warn_deprecated(property_): - msg = ( - "The {} property is deprecated and will be removed in a future " - "version. Get the property from 'ABSStore.client' instead." - ) - warnings.warn(msg.format(property_), FutureWarning, stacklevel=3) - - @property - def container(self): - self._warn_deprecated("container") - return self._container - - @property - def account_name(self): - self._warn_deprecated("account_name") - return self._account_name - - @property - def account_key(self): - self._warn_deprecated("account_key") - return self._account_key - - def _append_path_to_prefix(self, path): - if self.prefix == "": - return normalize_storage_path(path) - else: - return "/".join([self.prefix, normalize_storage_path(path)]) - - @staticmethod - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - if prefix: - return path_norm[(len(prefix_norm) + 1) :] - else: - return path_norm - - def __getitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - blob_name = self._append_path_to_prefix(key) - try: - return self.client.download_blob(blob_name).readall() - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % blob_name) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - blob_name = self._append_path_to_prefix(key) - self.client.upload_blob(blob_name, value, overwrite=True) - - def __delitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - try: - self.client.delete_blob(self._append_path_to_prefix(key)) - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % key) - - def __eq__(self, other): - return ( - isinstance(other, ABSStore) - and self.client == other.client - and self.prefix == other.prefix - ) - - def keys(self): - return list(self.__iter__()) - - def __iter__(self): - if self.prefix: - list_blobs_prefix = self.prefix + "/" - else: - list_blobs_prefix = None - for blob in self.client.list_blobs(list_blobs_prefix): - yield self._strip_prefix_from_path(blob.name, self.prefix) - - def __len__(self): - return len(self.keys()) - - def __contains__(self, key): - blob_name = self._append_path_to_prefix(key) - return self.client.get_blob_client(blob_name).exists() - - def listdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - items = [ - self._strip_prefix_from_path(blob.name, 
dir_path) - for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") - ] - return items - - def rmdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - for blob in self.client.list_blobs(name_starts_with=dir_path): - self.client.delete_blob(blob) - - def getsize(self, path=None): - store_path = normalize_storage_path(path) - fs_path = self._append_path_to_prefix(store_path) - if fs_path: - blob_client = self.client.get_blob_client(fs_path) - else: - blob_client = None - - if blob_client and blob_client.exists(): - return blob_client.get_blob_properties().size - else: - size = 0 - if fs_path == "": - fs_path = None - elif not fs_path.endswith("/"): - fs_path += "/" - for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): - blob_client = self.client.get_blob_client(blob) - if blob_client.exists(): - size += blob_client.get_blob_properties().size - return size - - def clear(self): - self.rmdir() diff --git a/src/zarr/v2/_storage/store.py b/src/zarr/v2/_storage/store.py deleted file mode 100644 index ec1dbf0565..0000000000 --- a/src/zarr/v2/_storage/store.py +++ /dev/null @@ -1,226 +0,0 @@ -from collections.abc import MutableMapping -from typing import Any, List, Mapping, Optional, Sequence, Union - -from zarr.v2.meta import Metadata2 -from zarr.v2.util import normalize_storage_path -from zarr.v2.context import Context - - -# v2 store keys -array_meta_key = ".zarray" -group_meta_key = ".zgroup" -attrs_key = ".zattrs" - -DEFAULT_ZARR_VERSION = 2 - - -class BaseStore(MutableMapping[str, Any]): - """Abstract base class for store implementations. - - This is a thin wrapper over MutableMapping that provides methods to check - whether a store is readable, writeable, eraseable and or listable. - - Stores cannot be mutable mapping as they do have a couple of other - requirements that would break Liskov substitution principle (stores only - allow strings as keys, mutable mapping are more generic). - - Having no-op base method also helps simplifying store usage and do not need - to check the presence of attributes and methods, like `close()`. - - Stores can be used as context manager to make sure they close on exit. - - .. added: 2.11.0 - - """ - - _readable = True - _writeable = True - _erasable = True - _listable = True - _store_version = 2 - _metadata_class = Metadata2 - - def is_readable(self): - return self._readable - - def is_writeable(self): - return self._writeable - - def is_listable(self): - return self._listable - - def is_erasable(self): - return self._erasable - - def __enter__(self): - if not hasattr(self, "_open_count"): - self._open_count = 0 - self._open_count += 1 - return self - - def __exit__(self, exc_type, exc_value, traceback): - self._open_count -= 1 - if self._open_count == 0: - self.close() - - def close(self) -> None: - """Do nothing by default""" - pass - - def rename(self, src_path: str, dst_path: str) -> None: - if not self.is_erasable(): - raise NotImplementedError( - f'{type(self)} is not erasable, cannot call "rename"' - ) # pragma: no cover - _rename_from_keys(self, src_path, dst_path) - - @staticmethod - def _ensure_store(store: Any): - """ - We want to make sure internally that zarr stores are always a class - with a specific interface derived from ``BaseStore``, which is slightly - different than ``MutableMapping``. 
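A sketch of the coercion rules implemented below (module paths as in this deleted tree)::

    from zarr.v2._storage.store import BaseStore
    from zarr.v2.storage import KVStore

    s = BaseStore._ensure_store({})          # plain MutableMapping is wrapped in a KVStore
    assert isinstance(s, KVStore)
    assert BaseStore._ensure_store(s) is s   # BaseStore instances pass through unchanged
    try:
        BaseStore._ensure_store(42)          # no mapping interface at all
    except ValueError:
        pass                                 # rejected, as described above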
- - We'll do this conversion in a few places automatically - """ - from zarr.v2.storage import KVStore # avoid circular import - - if isinstance(store, BaseStore): - if not store._store_version == 2: - raise ValueError( - f"cannot initialize a v2 store with a v{store._store_version} store" - ) - return store - elif isinstance(store, MutableMapping): - return KVStore(store) - else: - for attr in [ - "keys", - "values", - "get", - "__setitem__", - "__getitem__", - "__delitem__", - "__contains__", - ]: - if not hasattr(store, attr): - break - else: - return KVStore(store) - - raise ValueError( - "Starting with Zarr 2.11.0, stores must be subclasses of " - "BaseStore, if your store exposes the MutableMapping interface " - f"wrap it in zarr.v2.storage.KVStore. Got {store}" - ) - - def getitems( - self, keys: Sequence[str], *, contexts: Mapping[str, Context] - ) -> Mapping[str, Any]: - """Retrieve data from multiple keys. - - Parameters - ---------- - keys : Iterable[str] - The keys to retrieve - contexts: Mapping[str, Context] - A mapping of keys to their context. Each context is a mapping of store - specific information. E.g. a context could be a dict telling the store - the preferred output array type: `{"meta_array": cupy.empty(())}` - - Returns - ------- - Mapping - A collection mapping the input keys to their results. - - Notes - ----- - This default implementation uses __getitem__() to read each key sequentially and - ignores contexts. Overwrite this method to implement concurrent reads of multiple - keys and/or to utilize the contexts. - """ - return {k: self[k] for k in keys if k in self} - - -class Store(BaseStore): - """Abstract store class used by implementations following the Zarr v2 spec. - - Adds public `listdir`, `rename`, and `rmdir` methods on top of BaseStore. - - .. 
added: 2.11.0 - - """ - - def listdir(self, path: str = "") -> List[str]: - path = normalize_storage_path(path) - return _listdir_from_keys(self, path) - - def rmdir(self, path: str = "") -> None: - if not self.is_erasable(): - raise NotImplementedError( - f'{type(self)} is not erasable, cannot call "rmdir"' - ) # pragma: no cover - path = normalize_storage_path(path) - _rmdir_from_keys(self, path) - - -# allow MutableMapping for backwards compatibility -StoreLike = Union[BaseStore, MutableMapping[str, Any]] - - -def _path_to_prefix(path: Optional[str]) -> str: - # assume path already normalized - if path: - prefix = path + "/" - else: - prefix = "" - return prefix - - -def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: - # assume path already normalized - src_prefix = _path_to_prefix(src_path) - dst_prefix = _path_to_prefix(dst_path) - version = getattr(store, "_store_version", 2) - if version == 2: - for key in list(store.keys()): - if key.startswith(src_prefix): - new_key = dst_prefix + key.lstrip(src_prefix) - store[new_key] = store.pop(key) - else: - raise NotImplementedError("This function only supports Zarr version 2.") - - -def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: - # assume path already normalized - prefix = _path_to_prefix(path) - for key in list(store.keys()): - if key.startswith(prefix): - del store[key] - - -def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str]: - # assume path already normalized - prefix = _path_to_prefix(path) - children = set() - for key in list(store.keys()): - if key.startswith(prefix) and len(key) > len(prefix): - suffix = key[len(prefix) :] - child = suffix.split("/")[0] - children.add(child) - return sorted(children) - - -def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: - key = prefix + array_meta_key - return key - - -def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: - key = prefix + group_meta_key - return key - - -def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: - key = prefix + attrs_key - return key diff --git a/src/zarr/v2/attrs.py b/src/zarr/v2/attrs.py deleted file mode 100644 index af23d43b9e..0000000000 --- a/src/zarr/v2/attrs.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import Any -import warnings -from collections.abc import MutableMapping - -from zarr.v2._storage.store import Store -from zarr.v2.util import json_dumps - - -class Attributes(MutableMapping[str, Any]): - """Class providing access to user attributes on an array or group. Should not be - instantiated directly, will be available via the `.attrs` property of an array or - group. - - Parameters - ---------- - store : MutableMapping - The store in which to store the attributes. - key : str, optional - The key under which the attributes will be stored. - read_only : bool, optional - If True, attributes cannot be modified. - cache : bool, optional - If True (default), attributes will be cached locally. - synchronizer : Synchronizer - Only necessary if attributes may be modified from multiple threads or processes. 
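A usage sketch; the array and attribute names are illustrative::

    import zarr.v2 as zarr

    z = zarr.zeros((10,), chunks=(5,))
    z.attrs["units"] = "metres"          # serialised as JSON under the .zattrs key
    z.attrs.update(origin=0, scale=2)    # several keys in one write operation
    dict(z.attrs)                        # {'units': 'metres', 'origin': 0, 'scale': 2}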
- - """ - - def __init__(self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None): - _Store = Store - self.store = _Store._ensure_store(store) - self.key = key - self.read_only = read_only - self.cache = cache - self._cached_asdict = None - self.synchronizer = synchronizer - - def _get_nosync(self): - try: - data = self.store[self.key] - except KeyError: - d: dict[str, Any] = dict() - else: - d = self.store._metadata_class.parse_metadata(data) - return d - - def asdict(self): - """Retrieve all attributes as a dictionary.""" - if self.cache and self._cached_asdict is not None: - return self._cached_asdict - d = self._get_nosync() - if self.cache: - self._cached_asdict = d - return d - - def refresh(self): - """Refresh cached attributes from the store.""" - if self.cache: - self._cached_asdict = self._get_nosync() - - def __contains__(self, x): - return x in self.asdict() - - def __getitem__(self, item): - return self.asdict()[item] - - def _write_op(self, f, *args, **kwargs): - # guard condition - if self.read_only: - raise PermissionError("attributes are read-only") - - # synchronization - if self.synchronizer is None: - return f(*args, **kwargs) - else: - with self.synchronizer[self.key]: - return f(*args, **kwargs) - - def __setitem__(self, item, value): - self._write_op(self._setitem_nosync, item, value) - - def _setitem_nosync(self, item, value): - # load existing data - d = self._get_nosync() - - # set key value - - d[item] = value - - # _put modified data - self._put_nosync(d) - - def __delitem__(self, item): - self._write_op(self._delitem_nosync, item) - - def _delitem_nosync(self, key): - # load existing data - d = self._get_nosync() - - # delete key value - del d[key] - - # _put modified data - self._put_nosync(d) - - def put(self, d): - """Overwrite all attributes with the key/value pairs in the provided dictionary - `d` in a single operation.""" - self._write_op(self._put_nosync, d) - - def _put_nosync(self, d): - d_to_check = d - if not all(isinstance(item, str) for item in d_to_check): - # TODO: Raise an error for non-string keys - # raise TypeError("attribute keys must be strings") - warnings.warn( - "only attribute keys of type 'string' will be allowed in the future", - DeprecationWarning, - stacklevel=2, - ) - - try: - d_to_check = {str(k): v for k, v in d_to_check.items()} - except TypeError as ex: # pragma: no cover - raise TypeError("attribute keys can not be stringified") from ex - - d = d_to_check - - self.store[self.key] = json_dumps(d) - if self.cache: - self._cached_asdict = d - - # noinspection PyMethodOverriding - def update(self, *args, **kwargs): - """Update the values of several attributes in a single operation.""" - self._write_op(self._update_nosync, *args, **kwargs) - - def _update_nosync(self, *args, **kwargs): - # load existing data - d = self._get_nosync() - - # update - d.update(*args, **kwargs) - - # _put modified data - self._put_nosync(d) - - def keys(self): - return self.asdict().keys() - - def __iter__(self): - return iter(self.asdict()) - - def __len__(self): - return len(self.asdict()) - - def _ipython_key_completions_(self): - return sorted(self) diff --git a/src/zarr/v2/codecs.py b/src/zarr/v2/codecs.py deleted file mode 100644 index 4ad68b8627..0000000000 --- a/src/zarr/v2/codecs.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa -from numcodecs import * -from numcodecs import get_codec, Blosc, Pickle, Zlib, Delta, AsType, BZ2 -from numcodecs.registry import codec_registry diff --git a/src/zarr/v2/context.py 
b/src/zarr/v2/context.py deleted file mode 100644 index 4eb1db7491..0000000000 --- a/src/zarr/v2/context.py +++ /dev/null @@ -1,19 +0,0 @@ -from typing import TypedDict - -from numcodecs.compat import NDArrayLike - - -class Context(TypedDict, total=False): - """A context for component specific information - - All keys are optional. Any component reading the context must provide - a default implementation in the case a key cannot be found. - - Attributes - ---------- - meta_array : array-like, optional - An array-like instance to use for determining the preferred output - array type. - """ - - meta_array: NDArrayLike diff --git a/src/zarr/v2/convenience.py b/src/zarr/v2/convenience.py deleted file mode 100644 index c066ee59e0..0000000000 --- a/src/zarr/v2/convenience.py +++ /dev/null @@ -1,1284 +0,0 @@ -"""Convenience functions for storing and loading data.""" - -import itertools -import os -import re -from collections.abc import Mapping, MutableMapping -from zarr.v2.core import Array -from zarr.v2.creation import array as _create_array -from zarr.v2.creation import open_array -from zarr.v2.errors import CopyError, PathNotFoundError -from zarr.v2.hierarchy import Group -from zarr.v2.hierarchy import group as _create_group -from zarr.v2.hierarchy import open_group -from zarr.v2.meta import json_dumps, json_loads -from zarr.v2.storage import ( - contains_array, - contains_group, - normalize_store_arg, - BaseStore, - ConsolidatedMetadataStore, -) -from zarr.v2.util import TreeViewer, buffer_size, normalize_storage_path - -from typing import Any, Union - -StoreLike = Union[BaseStore, MutableMapping[str, Any], str, None] - -_builtin_open = open # builtin open is later shadowed by a local open function - - -def _check_and_update_path(store: BaseStore, path): - if getattr(store, "_store_version", 2) > 2 and not path: - raise ValueError("path must be provided for v3 stores") - return normalize_storage_path(path) - - -# noinspection PyShadowingBuiltins -def open(store: StoreLike = None, mode: str = "a", *, path=None, **kwargs): - """Convenience function to open a group or array using file-mode-like semantics. - - Parameters - ---------- - store : Store or string, optional - Store or path to directory in file system or name of zip file. - mode : {'r', 'r+', 'a', 'w', 'w-'}, optional - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist); 'a' means read/write (create if doesn't - exist); 'w' means create (overwrite if exists); 'w-' means create - (fail if exists). - path : str or None, optional - The path within the store to open. - **kwargs - Additional parameters are passed through to :func:`zarr.v2.creation.open_array` or - :func:`zarr.v2.hierarchy.open_group`. - - Returns - ------- - z : :class:`zarr.v2.core.Array` or :class:`zarr.v2.hierarchy.Group` - Array or group, depending on what exists in the given store. 
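In short, the dispatch implemented below is: a ``shape`` keyword (or an existing array at ``path``) routes to :func:`zarr.v2.creation.open_array`, anything else to :func:`zarr.v2.hierarchy.open_group`. A compact sketch, with a hypothetical path::

    import zarr.v2 as zarr

    g = zarr.open('data/demo.zarr', mode='w')             # no shape: new group
    a = zarr.open('data/demo.zarr', mode='w', shape=100)  # shape given: new array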
- - See Also - -------- - zarr.v2.creation.open_array, zarr.v2.hierarchy.open_group - - Examples - -------- - - Storing data in a directory 'data/example.zarr' on the local file system:: - - >>> import zarr - >>> store = 'data/example.zarr' - >>> zw = zarr.v2.open(store, mode='w', shape=100, dtype='i4') # open new array - >>> zw - - >>> za = zarr.v2.open(store, mode='a') # open existing array for reading and writing - >>> za - - >>> zr = zarr.v2.open(store, mode='r') # open existing array read-only - >>> zr - - >>> gw = zarr.v2.open(store, mode='w') # open new group, overwriting previous data - >>> gw - - >>> ga = zarr.v2.open(store, mode='a') # open existing group for reading and writing - >>> ga - - >>> gr = zarr.v2.open(store, mode='r') # open existing group read-only - >>> gr - - - """ - - # handle polymorphic store arg - # we pass storage options explicitly, since normalize_store_arg might construct - # a store if the input is a fsspec-compatible URL - _store: BaseStore = normalize_store_arg( - store, storage_options=kwargs.pop("storage_options", {}), mode=mode - ) - # path = _check_and_update_path(_store, path) - path = normalize_storage_path(path) - kwargs["path"] = path - - if mode in {"w", "w-", "x"}: - if "shape" in kwargs: - return open_array(_store, mode=mode, **kwargs) - else: - return open_group(_store, mode=mode, **kwargs) - - elif mode == "a": - if "shape" in kwargs or contains_array(_store, path): - return open_array(_store, mode=mode, **kwargs) - else: - return open_group(_store, mode=mode, **kwargs) - - else: - if contains_array(_store, path): - return open_array(_store, mode=mode, **kwargs) - elif contains_group(_store, path): - return open_group(_store, mode=mode, **kwargs) - else: - raise PathNotFoundError(path) - - -def _might_close(path): - return isinstance(path, (str, os.PathLike)) - - -def save_array(store: StoreLike, arr, *, path=None, **kwargs): - """Convenience function to save a NumPy array to the local file system, following a - similar API to the NumPy save() function. - - Parameters - ---------- - store : MutableMapping or string - Store or path to directory in file system or name of zip file. - arr : ndarray - NumPy array with data to save. - path : str or None, optional - The path within the store where the array will be saved. - kwargs - Passed through to :func:`create`, e.g., compressor. - - Examples - -------- - Save an array to a directory on the file system (uses a :class:`DirectoryStore`):: - - >>> import zarr - >>> import numpy as np - >>> arr = np.arange(10000) - >>> zarr.v2.save_array('data/example.zarr', arr) - >>> zarr.v2.load('data/example.zarr') - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - - Save an array to a single file (uses a :class:`ZipStore`):: - - >>> zarr.v2.save_array('data/example.zip', arr) - >>> zarr.v2.load('data/example.zip') - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - - """ - may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w") - path = _check_and_update_path(_store, path) - try: - _create_array(arr, store=_store, overwrite=True, path=path, **kwargs) - finally: - if may_need_closing: - # needed to ensure zip file records are written - _store.close() - - -def save_group(store: StoreLike, *args, path=None, **kwargs): - """Convenience function to save several NumPy arrays to the local file system, following a - similar API to the NumPy savez()/savez_compressed() functions. 
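One nuance shared with :func:`save_array` above: when a path string is given, the store is constructed internally and closed on completion (required to flush :class:`ZipStore` records), whereas a store object passed in directly is left open. A sketch with hypothetical file names::

    import numpy as np
    import zarr.v2 as zarr

    zarr.save_group('data/example.zip', a=np.arange(3))  # internal ZipStore, closed for us
    store = zarr.ZipStore('data/example2.zip', mode='w')
    zarr.save_group(store, a=np.arange(3))
    store.close()                                        # caller-owned store, caller closes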
- - Parameters - ---------- - store : MutableMapping or string - Store or path to directory in file system or name of zip file. - args : ndarray - NumPy arrays with data to save. - path : str or None, optional - Path within the store where the group will be saved. - kwargs - NumPy arrays with data to save. - - Examples - -------- - Save several arrays to a directory on the file system (uses a - :class:`DirectoryStore`): - - >>> import zarr - >>> import numpy as np - >>> a1 = np.arange(10000) - >>> a2 = np.arange(10000, 0, -1) - >>> zarr.v2.save_group('data/example.zarr', a1, a2) - >>> loader = zarr.v2.load('data/example.zarr') - >>> loader - - >>> loader['arr_0'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['arr_1'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - Save several arrays using named keyword arguments:: - - >>> zarr.v2.save_group('data/example.zarr', foo=a1, bar=a2) - >>> loader = zarr.v2.load('data/example.zarr') - >>> loader - - >>> loader['foo'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['bar'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - Store several arrays in a single zip file (uses a :class:`ZipStore`):: - - >>> zarr.v2.save_group('data/example.zip', foo=a1, bar=a2) - >>> loader = zarr.v2.load('data/example.zip') - >>> loader - - >>> loader['foo'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['bar'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - Notes - ----- - Default compression options will be used. - - """ - if len(args) == 0 and len(kwargs) == 0: - raise ValueError("at least one array must be provided") - # handle polymorphic store arg - may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w") - path = _check_and_update_path(_store, path) - try: - grp = _create_group(_store, path=path, overwrite=True) - for i, arr in enumerate(args): - k = "arr_{}".format(i) - grp.create_dataset(k, data=arr, overwrite=True) - for k, arr in kwargs.items(): - grp.create_dataset(k, data=arr, overwrite=True) - finally: - if may_need_closing: - # needed to ensure zip file records are written - _store.close() - - -def save(store: StoreLike, *args, path=None, **kwargs): - """Convenience function to save an array or group of arrays to the local file system. - - Parameters - ---------- - store : MutableMapping or string - Store or path to directory in file system or name of zip file. - args : ndarray - NumPy arrays with data to save. - path : str or None, optional - The path within the group where the arrays will be saved. - kwargs - NumPy arrays with data to save. 
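The dispatch rule, implemented at the end of this function, is simply: a single positional array with no keywords delegates to :func:`save_array`, anything else to :func:`save_group`. For example (paths are placeholders)::

    zarr.v2.save('data/a.zarr', a1)          # one array: save_array
    zarr.v2.save('data/g.zarr', a1, a2)      # several: save_group, keys arr_0, arr_1
    zarr.v2.save('data/g.zarr', foo=a1)      # keywords: save_group, key foo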
- - Examples - -------- - Save an array to a directory on the file system (uses a :class:`DirectoryStore`):: - - >>> import zarr - >>> import numpy as np - >>> arr = np.arange(10000) - >>> zarr.v2.save('data/example.zarr', arr) - >>> zarr.v2.load('data/example.zarr') - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - - Save an array to a Zip file (uses a :class:`ZipStore`):: - - >>> zarr.v2.save('data/example.zip', arr) - >>> zarr.v2.load('data/example.zip') - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - - Save several arrays to a directory on the file system (uses a - :class:`DirectoryStore` and stores arrays in a group):: - - >>> import zarr - >>> import numpy as np - >>> a1 = np.arange(10000) - >>> a2 = np.arange(10000, 0, -1) - >>> zarr.v2.save('data/example.zarr', a1, a2) - >>> loader = zarr.v2.load('data/example.zarr') - >>> loader - - >>> loader['arr_0'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['arr_1'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - Save several arrays using named keyword arguments:: - - >>> zarr.v2.save('data/example.zarr', foo=a1, bar=a2) - >>> loader = zarr.v2.load('data/example.zarr') - >>> loader - - >>> loader['foo'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['bar'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - Store several arrays in a single zip file (uses a :class:`ZipStore`):: - - >>> zarr.v2.save('data/example.zip', foo=a1, bar=a2) - >>> loader = zarr.v2.load('data/example.zip') - >>> loader - - >>> loader['foo'] - array([ 0, 1, 2, ..., 9997, 9998, 9999]) - >>> loader['bar'] - array([10000, 9999, 9998, ..., 3, 2, 1]) - - See Also - -------- - save_array, save_group - - """ - if len(args) == 0 and len(kwargs) == 0: - raise ValueError("at least one array must be provided") - if len(args) == 1 and len(kwargs) == 0: - save_array(store, args[0], path=path) - else: - save_group(store, *args, path=path, **kwargs) - - -class LazyLoader(Mapping): - def __init__(self, grp): - self.grp = grp - self.cache = dict() - - def __getitem__(self, item): - try: - return self.cache[item] - except KeyError: - arr = self.grp[item][...] - self.cache[item] = arr - return arr - - def __len__(self): - return len(self.grp) - - def __iter__(self): - return iter(self.grp) - - def __contains__(self, item): - return item in self.grp - - def __repr__(self): - r = ">> import zarr - >>> g1 = zarr.v2.group() - >>> g2 = g1.create_group('foo') - >>> g3 = g1.create_group('bar') - >>> g4 = g3.create_group('baz') - >>> g5 = g3.create_group('qux') - >>> d1 = g5.create_dataset('baz', shape=100, chunks=10) - >>> g1.tree() - / - ├── bar - │ ├── baz - │ └── qux - │ └── baz (100,) float64 - └── foo - >>> import h5py - >>> h5f = h5py.File('data/example.h5', mode='w') - >>> zarr.v2.copy_all(g1, h5f) - (5, 0, 800) - >>> zarr.v2.tree(h5f) - / - ├── bar - │ ├── baz - │ └── qux - │ └── baz (100,) float64 - └── foo - - See Also - -------- - zarr.v2.hierarchy.Group.tree - - Notes - ----- - Please note that this is an experimental feature. The behaviour of this - function is still evolving and the default output and/or parameters may change - in future versions. 
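The ``level`` parameter bounds the depth of the rendered tree; for the hierarchy built in the example above, a sketch of the expected effect (output not verified)::

    g1.tree(level=1)
    # /
    # ├── bar
    # └── foo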
- - """ - - return TreeViewer(grp, expand=expand, level=level) - - -class _LogWriter: - def __init__(self, log): - self.log_func = None - self.log_file = None - self.needs_closing = False - if log is None: - # don't do any logging - pass - elif callable(log): - self.log_func = log - elif isinstance(log, str): - self.log_file = _builtin_open(log, mode="w") - self.needs_closing = True - elif hasattr(log, "write"): - self.log_file = log - else: - raise TypeError( - "log must be a callable function, file path or file-like object, found %r" % log - ) - - def __enter__(self): - return self - - def __exit__(self, *args): - if self.log_file is not None and self.needs_closing: - self.log_file.close() - - def __call__(self, *args, **kwargs): - if self.log_file is not None: - kwargs["file"] = self.log_file - print(*args, **kwargs) - if hasattr(self.log_file, "flush"): - # get immediate feedback - self.log_file.flush() - elif self.log_func is not None: - self.log_func(*args, **kwargs) - - -def _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied): - # log a final message with a summary of what happened - if dry_run: - message = "dry run: " - else: - message = "all done: " - message += "{:,} copied, {:,} skipped".format(n_copied, n_skipped) - if not dry_run: - message += ", {:,} bytes copied".format(n_bytes_copied) - log(message) - - -def copy_store( - source, - dest, - source_path="", - dest_path="", - excludes=None, - includes=None, - flags=0, - if_exists="raise", - dry_run=False, - log=None, -): - """Copy data directly from the `source` store to the `dest` store. Use this - function when you want to copy a group or array in the most efficient way, - preserving all configuration and attributes. This function is more efficient - than the copy() or copy_all() functions because it avoids de-compressing and - re-compressing data, rather the compressed chunk data for each array are - copied directly between stores. - - Parameters - ---------- - source : Mapping - Store to copy data from. - dest : MutableMapping - Store to copy data into. - source_path : str, optional - Only copy data from under this path in the source store. - dest_path : str, optional - Copy data into this path in the destination store. - excludes : sequence of str, optional - One or more regular expressions which will be matched against keys in - the source store. Any matching key will not be copied. - includes : sequence of str, optional - One or more regular expressions which will be matched against keys in - the source store and will override any excludes also matching. - flags : int, optional - Regular expression flags used for matching excludes and includes. - if_exists : {'raise', 'replace', 'skip'}, optional - How to handle keys that already exist in the destination store. If - 'raise' then a CopyError is raised on the first key already present - in the destination store. If 'replace' then any data will be replaced in - the destination. If 'skip' then any existing keys will not be copied. - dry_run : bool, optional - If True, don't actually copy anything, just log what would have - happened. - log : callable, file path or file-like object, optional - If provided, will be used to log progress information. - - Returns - ------- - n_copied : int - Number of items copied. - n_skipped : int - Number of items skipped. - n_bytes_copied : int - Number of bytes of data that were actually copied. 
- - Examples - -------- - - >>> import zarr - >>> store1 = zarr.v2.DirectoryStore('data/example.zarr') - >>> root = zarr.v2.group(store1, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.create_group('bar') - >>> baz = bar.create_dataset('baz', shape=100, chunks=50, dtype='i8') - >>> import numpy as np - >>> baz[:] = np.arange(100) - >>> root.tree() - / - └── foo - └── bar - └── baz (100,) int64 - >>> from sys import stdout - >>> store2 = zarr.v2.ZipStore('data/example.zip', mode='w') - >>> zarr.v2.copy_store(store1, store2, log=stdout) - copy .zgroup - copy foo/.zgroup - copy foo/bar/.zgroup - copy foo/bar/baz/.zarray - copy foo/bar/baz/0 - copy foo/bar/baz/1 - all done: 6 copied, 0 skipped, 566 bytes copied - (6, 0, 566) - >>> new_root = zarr.v2.group(store2) - >>> new_root.tree() - / - └── foo - └── bar - └── baz (100,) int64 - >>> new_root['foo/bar/baz'][:] - array([ 0, 1, 2, ..., 97, 98, 99]) - >>> store2.close() # zip stores need to be closed - - Notes - ----- - Please note that this is an experimental feature. The behaviour of this - function is still evolving and the default behaviour and/or parameters may change - in future versions. - - """ - - # normalize paths - source_path = normalize_storage_path(source_path) - dest_path = normalize_storage_path(dest_path) - if source_path: - source_path = source_path + "/" - if dest_path: - dest_path = dest_path + "/" - - # normalize excludes and includes - if excludes is None: - excludes = [] - elif isinstance(excludes, str): - excludes = [excludes] - if includes is None: - includes = [] - elif isinstance(includes, str): - includes = [includes] - excludes = [re.compile(e, flags) for e in excludes] - includes = [re.compile(i, flags) for i in includes] - - # check if_exists parameter - valid_if_exists = ["raise", "replace", "skip"] - if if_exists not in valid_if_exists: - raise ValueError( - "if_exists must be one of {!r}; found {!r}".format(valid_if_exists, if_exists) - ) - - # setup counting variables - n_copied = n_skipped = n_bytes_copied = 0 - - source_store_version = getattr(source, "_store_version", 2) - dest_store_version = getattr(dest, "_store_version", 2) - if source_store_version != dest_store_version: - raise ValueError("zarr stores must share the same protocol version") - - if source_store_version > 2: - raise NotImplementedError("This function only supports Zarr version 2.") - - # setup logging - with _LogWriter(log) as log: - # iterate over source keys - for source_key in sorted(source.keys()): - # filter to keys under source path - if source_store_version == 2: - if not source_key.startswith(source_path): - continue - elif source_store_version == 3: - raise NotImplementedError("This function only supports Zarr version 2.") - # process excludes and includes - exclude = False - for prog in excludes: - if prog.search(source_key): - exclude = True - break - if exclude: - for prog in includes: - if prog.search(source_key): - exclude = False - break - if exclude: - continue - - # map key to destination path - if source_store_version == 2: - key_suffix = source_key[len(source_path) :] - dest_key = dest_path + key_suffix - elif source_store_version == 3: - raise NotImplementedError("This function only supports Zarr version 2.") - # create a descriptive label for this operation - descr = source_key - if dest_key != source_key: - descr = descr + " -> " + dest_key - - # decide what to do - do_copy = True - if if_exists != "replace" and dest_key in dest: - if if_exists == "raise": - raise CopyError("key {!r} exists in 
destination".format(dest_key)) - elif if_exists == "skip": - do_copy = False - - # take action - if do_copy: - log("copy {}".format(descr)) - if not dry_run: - data = source[source_key] - n_bytes_copied += buffer_size(data) - dest[dest_key] = data - n_copied += 1 - else: - log("skip {}".format(descr)) - n_skipped += 1 - - # log a final message with a summary of what happened - _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) - - return n_copied, n_skipped, n_bytes_copied - - -def _check_dest_is_group(dest): - if not hasattr(dest, "create_dataset"): - raise ValueError("dest must be a group, got {!r}".format(dest)) - - -def copy( - source, - dest, - name=None, - shallow=False, - without_attrs=False, - log=None, - if_exists="raise", - dry_run=False, - **create_kws, -): - """Copy the `source` array or group into the `dest` group. - - Parameters - ---------- - source : group or array/dataset - A zarr group or array, or an h5py group or dataset. - dest : group - A zarr or h5py group. - name : str, optional - Name to copy the object to. - shallow : bool, optional - If True, only copy immediate children of `source`. - without_attrs : bool, optional - Do not copy user attributes. - log : callable, file path or file-like object, optional - If provided, will be used to log progress information. - if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}, optional - How to handle arrays that already exist in the destination group. If - 'raise' then a CopyError is raised on the first array already present - in the destination group. If 'replace' then any array will be - replaced in the destination. If 'skip' then any existing arrays will - not be copied. If 'skip_initialized' then any existing arrays with - all chunks initialized will not be copied (not available when copying to - h5py). - dry_run : bool, optional - If True, don't actually copy anything, just log what would have - happened. - **create_kws - Passed through to the create_dataset method when copying an array/dataset. - - Returns - ------- - n_copied : int - Number of items copied. - n_skipped : int - Number of items skipped. - n_bytes_copied : int - Number of bytes of data that were actually copied. - - Examples - -------- - Here's an example of copying a group named 'foo' from an HDF5 file to a - Zarr group:: - - >>> import h5py - >>> import zarr - >>> import numpy as np - >>> source = h5py.File('data/example.h5', mode='w') - >>> foo = source.create_group('foo') - >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> zarr.v2.tree(source) - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> dest = zarr.v2.group() - >>> from sys import stdout - >>> zarr.v2.copy(source['foo'], dest, log=stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - all done: 3 copied, 0 skipped, 800 bytes copied - (3, 0, 800) - >>> dest.tree() # N.B., no spam - / - └── foo - └── bar - └── baz (100,) int64 - >>> source.close() - - The ``if_exists`` parameter provides options for how to handle pre-existing data in - the destination. 
Here are some examples of these options, also using - ``dry_run=True`` to find out what would happen without actually copying anything:: - - >>> source = zarr.v2.group() - >>> dest = zarr.v2.group() - >>> baz = source.create_dataset('foo/bar/baz', data=np.arange(100)) - >>> spam = source.create_dataset('foo/spam', data=np.arange(1000)) - >>> existing_spam = dest.create_dataset('foo/spam', data=np.arange(1000)) - >>> from sys import stdout - >>> try: - ... zarr.v2.copy(source['foo'], dest, log=stdout, dry_run=True) - ... except zarr.v2.CopyError as e: - ... print(e) - ... - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - an object 'spam' already exists in destination '/foo' - >>> zarr.v2.copy(source['foo'], dest, log=stdout, if_exists='replace', dry_run=True) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - copy /foo/spam (1000,) int64 - dry run: 4 copied, 0 skipped - (4, 0, 0) - >>> zarr.v2.copy(source['foo'], dest, log=stdout, if_exists='skip', dry_run=True) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - skip /foo/spam (1000,) int64 - dry run: 3 copied, 1 skipped - (3, 1, 0) - - Notes - ----- - Please note that this is an experimental feature. The behaviour of this - function is still evolving and the default behaviour and/or parameters may change - in future versions. - - """ - - # value checks - _check_dest_is_group(dest) - - # setup logging - with _LogWriter(log) as log: - # do the copying - n_copied, n_skipped, n_bytes_copied = _copy( - log, - source, - dest, - name=name, - root=True, - shallow=shallow, - without_attrs=without_attrs, - if_exists=if_exists, - dry_run=dry_run, - **create_kws, - ) - - # log a final message with a summary of what happened - _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) - - return n_copied, n_skipped, n_bytes_copied - - -def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_run, **create_kws): - # N.B., if this is a dry run, dest may be None - - # setup counting variables - n_copied = n_skipped = n_bytes_copied = 0 - - # are we copying to/from h5py? 
- source_h5py = source.__module__.startswith("h5py.") - dest_h5py = dest is not None and dest.__module__.startswith("h5py.") - - # check if_exists parameter - valid_if_exists = ["raise", "replace", "skip", "skip_initialized"] - if if_exists not in valid_if_exists: - raise ValueError( - "if_exists must be one of {!r}; found {!r}".format(valid_if_exists, if_exists) - ) - if dest_h5py and if_exists == "skip_initialized": - raise ValueError("{!r} can only be used when copying to zarr".format(if_exists)) - - # determine name to copy to - if name is None: - name = source.name.split("/")[-1] - if not name: - # this can happen if source is the root group - raise TypeError( - "source has no name, please provide the `name` " - "parameter to indicate a name to copy to" - ) - - if hasattr(source, "shape"): - # copy a dataset/array - - # check if already exists, decide what to do - do_copy = True - exists = dest is not None and name in dest - if exists: - if if_exists == "raise": - raise CopyError( - "an object {!r} already exists in destination {!r}".format(name, dest.name) - ) - elif if_exists == "skip": - do_copy = False - elif if_exists == "skip_initialized": - ds = dest[name] - if ds.nchunks_initialized == ds.nchunks: - do_copy = False - - # take action - if do_copy: - # log a message about what we're going to do - log("copy {} {} {}".format(source.name, source.shape, source.dtype)) - - if not dry_run: - # clear the way - if exists: - del dest[name] - - # setup creation keyword arguments - kws = create_kws.copy() - - # setup chunks option, preserve by default - kws.setdefault("chunks", source.chunks) - - # setup compression options - if source_h5py: - if dest_h5py: - # h5py -> h5py; preserve compression options by default - kws.setdefault("compression", source.compression) - kws.setdefault("compression_opts", source.compression_opts) - kws.setdefault("shuffle", source.shuffle) - kws.setdefault("fletcher32", source.fletcher32) - kws.setdefault("fillvalue", source.fillvalue) - else: - # h5py -> zarr; use zarr default compression options - kws.setdefault("fill_value", source.fillvalue) - else: - if dest_h5py: - # zarr -> h5py; use some vaguely sensible defaults - kws.setdefault("chunks", True) - kws.setdefault("compression", "gzip") - kws.setdefault("compression_opts", 1) - kws.setdefault("shuffle", False) - kws.setdefault("fillvalue", source.fill_value) - else: - # zarr -> zarr; preserve compression options by default - kws.setdefault("compressor", source.compressor) - kws.setdefault("filters", source.filters) - kws.setdefault("order", source.order) - kws.setdefault("fill_value", source.fill_value) - - # create new dataset in destination - ds = dest.create_dataset(name, shape=source.shape, dtype=source.dtype, **kws) - - # copy data - N.B., go chunk by chunk to avoid loading - # everything into memory - shape = ds.shape - chunks = ds.chunks - chunk_offsets = [range(0, s, c) for s, c in zip(shape, chunks)] - for offset in itertools.product(*chunk_offsets): - sel = tuple(slice(o, min(s, o + c)) for o, s, c in zip(offset, shape, chunks)) - ds[sel] = source[sel] - n_bytes_copied += ds.size * ds.dtype.itemsize - - # copy attributes - if not without_attrs: - if dest_h5py and "filters" in source.attrs: - # No filters key in v3 metadata so it was stored in the - # attributes instead. We cannot copy this key to - # HDF5 attrs, though! 
- source_attrs = source.attrs.asdict().copy() - source_attrs.pop("filters", None) - else: - source_attrs = source.attrs - ds.attrs.update(source_attrs) - - n_copied += 1 - - else: - log("skip {} {} {}".format(source.name, source.shape, source.dtype)) - n_skipped += 1 - - elif root or not shallow: - # copy a group - - # check if an array is in the way - do_copy = True - exists_array = dest is not None and name in dest and hasattr(dest[name], "shape") - if exists_array: - if if_exists == "raise": - raise CopyError( - "an array {!r} already exists in destination {!r}".format(name, dest.name) - ) - elif if_exists == "skip": - do_copy = False - - # take action - if do_copy: - # log action - log("copy {}".format(source.name)) - - if not dry_run: - # clear the way - if exists_array: - del dest[name] - - # require group in destination - grp = dest.require_group(name) - - # copy attributes - if not without_attrs: - grp.attrs.update(source.attrs) - - else: - # setup for dry run without creating any groups in the - # destination - if dest is not None: - grp = dest.get(name, None) - else: - grp = None - - # recurse - for k in source.keys(): - c, s, b = _copy( - log, - source[k], - grp, - name=k, - root=False, - shallow=shallow, - without_attrs=without_attrs, - if_exists=if_exists, - dry_run=dry_run, - **create_kws, - ) - n_copied += c - n_skipped += s - n_bytes_copied += b - - n_copied += 1 - - else: - log("skip {}".format(source.name)) - n_skipped += 1 - - return n_copied, n_skipped, n_bytes_copied - - -def copy_all( - source, - dest, - shallow=False, - without_attrs=False, - log=None, - if_exists="raise", - dry_run=False, - **create_kws, -): - """Copy all children of the `source` group into the `dest` group. - - Parameters - ---------- - source : group or array/dataset - A zarr group or array, or an h5py group or dataset. - dest : group - A zarr or h5py group. - shallow : bool, optional - If True, only copy immediate children of `source`. - without_attrs : bool, optional - Do not copy user attributes. - log : callable, file path or file-like object, optional - If provided, will be used to log progress information. - if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}, optional - How to handle arrays that already exist in the destination group. If - 'raise' then a CopyError is raised on the first array already present - in the destination group. If 'replace' then any array will be - replaced in the destination. If 'skip' then any existing arrays will - not be copied. If 'skip_initialized' then any existing arrays with - all chunks initialized will not be copied (not available when copying to - h5py). - dry_run : bool, optional - If True, don't actually copy anything, just log what would have - happened. - **create_kws - Passed through to the create_dataset method when copying an - array/dataset. - - Returns - ------- - n_copied : int - Number of items copied. - n_skipped : int - Number of items skipped. - n_bytes_copied : int - Number of bytes of data that were actually copied. 
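Note one behaviour visible at the end of the implementation below: after copying the children, the attributes of ``source`` itself are merged onto ``dest``. A sketch with illustrative names::

    import numpy as np
    import zarr.v2 as zarr

    source = zarr.group()
    source.attrs['owner'] = 'me'
    source.create_dataset('x', data=np.arange(3))
    dest = zarr.group()
    zarr.copy_all(source, dest)
    dest.attrs['owner']    # 'me': root attributes travelled as well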
- - Examples - -------- - >>> import h5py - >>> import zarr - >>> import numpy as np - >>> source = h5py.File('data/example.h5', mode='w') - >>> foo = source.create_group('foo') - >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> zarr.v2.tree(source) - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> dest = zarr.v2.group() - >>> import sys - >>> zarr.v2.copy_all(source, dest, log=sys.stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - copy /spam (100,) int64 - all done: 4 copied, 0 skipped, 1,600 bytes copied - (4, 0, 1600) - >>> dest.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> source.close() - - Notes - ----- - Please note that this is an experimental feature. The behaviour of this - function is still evolving and the default behaviour and/or parameters may change - in future versions. - - """ - - # value checks - _check_dest_is_group(dest) - - # setup counting variables - n_copied = n_skipped = n_bytes_copied = 0 - - # setup logging - with _LogWriter(log) as log: - for k in source.keys(): - c, s, b = _copy( - log, - source[k], - dest, - name=k, - root=False, - shallow=shallow, - without_attrs=without_attrs, - if_exists=if_exists, - dry_run=dry_run, - **create_kws, - ) - n_copied += c - n_skipped += s - n_bytes_copied += b - - dest.attrs.update(**source.attrs) - - # log a final message with a summary of what happened - _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) - - return n_copied, n_skipped, n_bytes_copied - - -def consolidate_metadata(store: BaseStore, metadata_key=".zmetadata", *, path=""): - """ - Consolidate all metadata for groups and arrays within the given store - into a single resource and put it under the given key. - - This produces a single object in the backend store, containing all the - metadata read from all the zarr-related keys that can be found. After - metadata have been consolidated, use :func:`open_consolidated` to open - the root group in optimised, read-only mode, using the consolidated - metadata to reduce the number of read operations on the backend store. - - Note, that if the metadata in the store is changed after this - consolidation, then the metadata read by :func:`open_consolidated` - would be incorrect unless this function is called again. - - .. note:: This is an experimental feature. - - Parameters - ---------- - store : MutableMapping or string - Store or path to directory in file system or name of zip file. - metadata_key : str - Key to put the consolidated metadata under. - path : str or None - Path corresponding to the group that is being consolidated. Not required - for zarr v2 stores. - - Returns - ------- - g : :class:`zarr.v2.hierarchy.Group` - Group instance, opened with the new consolidated metadata. 
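A typical round trip, using a hypothetical directory store::

    store = zarr.v2.DirectoryStore('data/example.zarr')
    root = zarr.v2.group(store)
    _ = root.create_dataset('x', shape=(100,), chunks=(10,))
    zarr.v2.consolidate_metadata(store)        # writes the .zmetadata key
    root = zarr.v2.open_consolidated(store)    # later reads hit one metadata key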
- - See Also - -------- - open_consolidated - - """ - store = normalize_store_arg(store, mode="w") - - version = store._store_version - - if version == 2: - - def is_zarr_key(key): - return key.endswith(".zarray") or key.endswith(".zgroup") or key.endswith(".zattrs") - - else: - raise NotImplementedError("This function only supports Zarr version 2.") - out = { - "zarr_consolidated_format": 1, - "metadata": {key: json_loads(store[key]) for key in store if is_zarr_key(key)}, - } - store[metadata_key] = json_dumps(out) - return open_consolidated(store, metadata_key=metadata_key, path=path) - - -def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", **kwargs): - """Open group using metadata previously consolidated into a single key. - - This is an optimised method for opening a Zarr group, where instead of - traversing the group/array hierarchy by accessing the metadata keys at - each level, a single key contains all of the metadata for everything. - For remote data sources where the overhead of accessing a key is large - compared to the time to read data. - - The group accessed must have already had its metadata consolidated into a - single key using the function :func:`consolidate_metadata`. - - This optimised method only works in modes which do not change the - metadata, although the data may still be written/updated. - - Parameters - ---------- - store : MutableMapping or string - Store or path to directory in file system or name of zip file. - metadata_key : str - Key to read the consolidated metadata from. The default (.zmetadata) - corresponds to the default used by :func:`consolidate_metadata`. - mode : {'r', 'r+'}, optional - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist) although only writes to data are allowed, - changes to metadata including creation of new arrays or group - are not allowed. - **kwargs - Additional parameters are passed through to :func:`zarr.v2.creation.open_array` or - :func:`zarr.v2.hierarchy.open_group`. - - Returns - ------- - g : :class:`zarr.v2.hierarchy.Group` - Group instance, opened with the consolidated metadata. 
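For orientation, the consolidated key written by :func:`consolidate_metadata` is plain JSON and can be inspected directly (key names here are illustrative)::

    import json

    meta = json.loads(store['.zmetadata'])
    meta['zarr_consolidated_format']    # 1
    sorted(meta['metadata'])            # ['.zgroup', 'x/.zarray', 'x/.zattrs', ...]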
- - See Also - -------- - consolidate_metadata - - """ - - # normalize parameters - store = normalize_store_arg(store, storage_options=kwargs.get("storage_options"), mode=mode) - if mode not in {"r", "r+"}: - raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}".format(mode)) - - path = kwargs.pop("path", None) - if store._store_version == 2: - ConsolidatedStoreClass = ConsolidatedMetadataStore - else: - raise NotImplementedError("This function only supports Zarr version 2.") - - # setup metadata store - meta_store = ConsolidatedStoreClass(store, metadata_key=metadata_key) - - # pass through - chunk_store = kwargs.pop("chunk_store", None) or store - return open(store=meta_store, chunk_store=chunk_store, mode=mode, path=path, **kwargs) diff --git a/src/zarr/v2/core.py b/src/zarr/v2/core.py deleted file mode 100644 index 9eeb467d68..0000000000 --- a/src/zarr/v2/core.py +++ /dev/null @@ -1,2855 +0,0 @@ -import binascii -import hashlib -import itertools -import math -import operator -import re -from functools import reduce -from typing import Any - -import numpy as np -from numcodecs import AsType, get_codec -from numcodecs.compat import ensure_bytes, ensure_ndarray_like - -from zarr.v2._storage.store import _prefix_to_attrs_key -from zarr.v2.attrs import Attributes -from zarr.v2.context import Context -from zarr.v2.errors import ArrayNotFoundError, ReadOnlyError, ArrayIndexError -from zarr.v2.indexing import ( - BasicIndexer, - CoordinateIndexer, - MaskIndexer, - OIndex, - OrthogonalIndexer, - VIndex, - BlockIndex, - BlockIndexer, - PartialChunkIterator, - check_fields, - check_no_multi_fields, - ensure_tuple, - err_too_many_indices, - is_contiguous_selection, - is_pure_fancy_indexing, - is_pure_orthogonal_indexing, - is_scalar, - pop_fields, -) -from zarr.v2.storage import ( - _prefix_to_array_key, - KVStore, - getsize, - listdir, - normalize_store_arg, -) -from zarr.v2.util import ( - ConstantMap, - UncompressedPartialReadBufferV3, - all_equal, - InfoReporter, - check_array_shape, - human_readable_size, - is_total_slice, - nolock, - normalize_chunks, - normalize_resize_args, - normalize_shape, - normalize_storage_path, - PartialReadBuffer, -) - -__all__ = ["Array"] - - -# noinspection PyUnresolvedReferences -class Array: - """Instantiate an array from an initialized store. - - Parameters - ---------- - store : MutableMapping - Array store, already initialized. - path : string, optional - Storage path. - read_only : bool, optional - True if array should be protected against modification. - chunk_store : MutableMapping, optional - Separate storage for chunks. If not provided, `store` will be used - for storage of both chunks and metadata. - synchronizer : object, optional - Array synchronizer. - cache_metadata : bool, optional - If True (default), array configuration metadata will be cached for the - lifetime of the object. If False, array metadata will be reloaded - prior to all data access and modification operations (may incur - overhead depending on storage and data access pattern). - cache_attrs : bool, optional - If True (default), user attributes will be cached for attribute read - operations. If False, user attributes are reloaded from the store prior - to all attribute read operations. - partial_decompress : bool, optional - If True and while the chunk_store is a FSStore and the compression used - is Blosc, when getting data from the array chunks will be partially - read and decompressed when possible. - - .. 
versionadded:: 2.7 - - write_empty_chunks : bool, optional - If True, all chunks will be stored regardless of their contents. If - False (default), each chunk is compared to the array's fill value prior - to storing. If a chunk is uniformly equal to the fill value, then that - chunk is not be stored, and the store entry for that chunk's key is - deleted. This setting enables sparser storage, as only chunks with - non-fill-value data are stored, at the expense of overhead associated - with checking the data of each chunk. - - .. versionadded:: 2.11 - - meta_array : array-like, optional - An array instance to use for determining arrays to create and return - to users. Use `numpy.empty(())` by default. - - .. versionadded:: 2.13 - """ - - def __init__( - self, - store: Any, # BaseStore not strictly required due to normalize_store_arg - path=None, - read_only=False, - chunk_store=None, - synchronizer=None, - cache_metadata=True, - cache_attrs=True, - partial_decompress=False, - write_empty_chunks=True, - meta_array=None, - ): - # N.B., expect at this point store is fully initialized with all - # configuration metadata fully specified and normalized - store = normalize_store_arg(store) - - if chunk_store is not None: - chunk_store = normalize_store_arg(chunk_store) - - self._store = store - self._chunk_store = chunk_store - self._transformed_chunk_store = None - self._path = normalize_storage_path(path) - if self._path: - self._key_prefix = self._path + "/" - else: - self._key_prefix = "" - self._read_only = bool(read_only) - self._synchronizer = synchronizer - self._cache_metadata = cache_metadata - self._is_view = False - self._partial_decompress = partial_decompress - self._write_empty_chunks = write_empty_chunks - if meta_array is not None: - self._meta_array = np.empty_like(meta_array, shape=()) - else: - self._meta_array = np.empty(()) - - # initialize metadata - self._load_metadata() - - # initialize attributes - akey = _prefix_to_attrs_key(self._store, self._key_prefix) - self._attrs = Attributes( - store, key=akey, read_only=read_only, synchronizer=synchronizer, cache=cache_attrs - ) - - # initialize info reporter - self._info_reporter = InfoReporter(self) - - # initialize indexing helpers - self._oindex = OIndex(self) - self._vindex = VIndex(self) - self._blocks = BlockIndex(self) - - def _load_metadata(self): - """(Re)load metadata from store.""" - if self._synchronizer is None: - self._load_metadata_nosync() - else: - mkey = _prefix_to_array_key(self._store, self._key_prefix) - with self._synchronizer[mkey]: - self._load_metadata_nosync() - - def _load_metadata_nosync(self): - try: - mkey = _prefix_to_array_key(self._store, self._key_prefix) - meta_bytes = self._store[mkey] - except KeyError: - raise ArrayNotFoundError(self._path) - else: - # decode and store metadata as instance members - meta = self._store._metadata_class.decode_array_metadata(meta_bytes) - self._meta = meta - self._shape = meta["shape"] - self._fill_value = meta["fill_value"] - dimension_separator = meta.get("dimension_separator", None) - - self._chunks = meta["chunks"] - self._dtype = meta["dtype"] - self._order = meta["order"] - if dimension_separator is None: - try: - dimension_separator = self._store._dimension_separator - except (AttributeError, KeyError): - pass - - # Fallback for any stores which do not choose a default - if dimension_separator is None: - dimension_separator = "." 
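The separator resolved above is what later joins chunk coordinates into store keys (see `_chunk_key` further down in this file). A standalone illustration, independent of zarr itself::

    # How a (row, col) chunk coordinate becomes a store key under each separator
    coords = (0, 1)
    ".".join(map(str, coords))  # '0.1' -> flat key layout
    "/".join(map(str, coords))  # '0/1' -> nested, directory-like layout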
- - self._dimension_separator = dimension_separator - - # setup compressor - compressor = meta.get("compressor", None) - if compressor is None: - self._compressor = None - else: - self._compressor = get_codec(compressor) - - # setup filters - - filters = meta.get("filters", []) - - if filters: - filters = [get_codec(config) for config in filters] - self._filters = filters - - def _refresh_metadata(self): - if not self._cache_metadata: - self._load_metadata() - - def _refresh_metadata_nosync(self): - if not self._cache_metadata and not self._is_view: - self._load_metadata_nosync() - - def _flush_metadata_nosync(self): - if self._is_view: - raise PermissionError("operation not permitted for views") - - if self._compressor: - compressor_config = self._compressor.get_config() - else: - compressor_config = None - if self._filters: - filters_config = [f.get_config() for f in self._filters] - else: - filters_config = None - _compressor = compressor_config - meta = dict( - shape=self._shape, - compressor=_compressor, - fill_value=self._fill_value, - filters=filters_config, - ) - - meta.update( - dict( - chunks=self._chunks, - dtype=self._dtype, - order=self._order, - dimension_separator=self._dimension_separator, - ) - ) - mkey = _prefix_to_array_key(self._store, self._key_prefix) - self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) - - @property - def store(self): - """A MutableMapping providing the underlying storage for the array.""" - return self._store - - @property - def path(self): - """Storage path.""" - return self._path - - @property - def name(self): - """Array name following h5py convention.""" - if self.path: - # follow h5py convention: add leading slash - name = self.path - if name[0] != "/": - name = "/" + name - return name - return None - - @property - def basename(self): - """Final component of name.""" - if self.name is not None: - return self.name.split("/")[-1] - return None - - @property - def read_only(self): - """A boolean, True if modification operations are not permitted.""" - return self._read_only - - @read_only.setter - def read_only(self, value): - self._read_only = bool(value) - - @property - def chunk_store(self): - """A MutableMapping providing the underlying storage for array chunks.""" - if self._transformed_chunk_store is not None: - return self._transformed_chunk_store - elif self._chunk_store is not None: - return self._chunk_store - else: - return self._store - - @property - def shape(self): - """A tuple of integers describing the length of each dimension of - the array.""" - # N.B., shape may change if array is resized, hence need to refresh - # metadata - self._refresh_metadata() - return self._shape - - @shape.setter - def shape(self, value): - self.resize(value) - - @property - def chunks(self): - """A tuple of integers describing the length of each dimension of a - chunk of the array.""" - return self._chunks - - @property - def dtype(self): - """The NumPy data type.""" - return self._dtype - - @property - def compressor(self): - """Primary compression codec.""" - return self._compressor - - @property - def fill_value(self): - """A value used for uninitialized portions of the array.""" - return self._fill_value - - @fill_value.setter - def fill_value(self, new): - self._fill_value = new - self._flush_metadata_nosync() - - @property - def order(self): - """A string indicating the order in which bytes are arranged within - chunks of the array.""" - return self._order - - @property - def filters(self): - """One or more codecs used 
to transform data prior to compression.""" - return self._filters - - @property - def synchronizer(self): - """Object used to synchronize write access to the array.""" - return self._synchronizer - - @property - def attrs(self): - """A MutableMapping containing user-defined attributes. Note that - attribute values must be JSON serializable.""" - return self._attrs - - @property - def ndim(self): - """Number of dimensions.""" - return len(self._shape) - - @property - def _size(self): - return reduce(operator.mul, self._shape, 1) - - @property - def size(self): - """The total number of elements in the array.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - self._refresh_metadata() - return self._size - - @property - def itemsize(self): - """The size in bytes of each item in the array.""" - return self.dtype.itemsize - - @property - def _nbytes(self): - return self._size * self.itemsize - - @property - def nbytes(self): - """The total number of bytes that would be required to store the - array without compression.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - self._refresh_metadata() - return self._nbytes - - @property - def nbytes_stored(self): - """The total number of stored bytes of data for the array. This - includes storage required for configuration metadata and user - attributes.""" - m = getsize(self._store, self._path) - if self._chunk_store is None: - return m - else: - n = getsize(self._chunk_store, self._path) - if m < 0 or n < 0: - return -1 - else: - return m + n - - @property - def _cdata_shape(self): - if self._shape == (): - return (1,) - else: - return tuple(math.ceil(s / c) for s, c in zip(self._shape, self._chunks)) - - @property - def cdata_shape(self): - """A tuple of integers describing the number of chunks along each - dimension of the array.""" - self._refresh_metadata() - return self._cdata_shape - - @property - def _nchunks(self): - return reduce(operator.mul, self._cdata_shape, 1) - - @property - def nchunks(self): - """Total number of chunks.""" - self._refresh_metadata() - return self._nchunks - - @property - def nchunks_initialized(self): - """The number of chunks that have been initialized with some data.""" - - # key pattern for chunk keys - prog = re.compile(r"\.".join([r"\d+"] * min(1, self.ndim))) - - # count chunk keys - return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) - - # backwards compatibility - initialized = nchunks_initialized - - @property - def is_view(self): - """A boolean, True if this array is a view on another array.""" - return self._is_view - - @property - def oindex(self): - """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and - :func:`set_orthogonal_selection` for documentation and examples.""" - return self._oindex - - @property - def vindex(self): - """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, - :func:`set_coordinate_selection`, :func:`get_mask_selection` and - :func:`set_mask_selection` for documentation and examples.""" - return self._vindex - - @property - def blocks(self): - """Shortcut for blocked chunked indexing, see :func:`get_block_selection` and - :func:`set_block_selection` for documentation and examples.""" - return self._blocks - - @property - def write_empty_chunks(self) -> bool: - """A Boolean, True if chunks composed of the array's fill value - will be stored. 
If False, such chunks will not be stored. - """ - return self._write_empty_chunks - - @property - def meta_array(self): - """An array-like instance to use for determining arrays to create and return - to users. - """ - return self._meta_array - - def __eq__(self, other): - return ( - isinstance(other, Array) - and self.store == other.store - and self.read_only == other.read_only - and self.path == other.path - and not self._is_view - # N.B., no need to compare other properties, should be covered by - # store comparison - ) - - def __array__(self, *args): - a = self[...] - if args: - a = a.astype(args[0]) - return a - - def islice(self, start=None, end=None): - """ - Yield a generator for iterating over the entire or parts of the - array. Uses a cache so chunks only have to be decompressed once. - - Parameters - ---------- - start : int, optional - Start index for the generator to start at. Defaults to 0. - end : int, optional - End index for the generator to stop at. Defaults to self.shape[0]. - - Yields - ------ - out : generator - A generator that can be used to iterate over the requested region - the array. - - Examples - -------- - Setup a 1-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100)) - - Iterate over part of the array: - >>> for value in z.islice(25, 30): value; - 25 - 26 - 27 - 28 - 29 - """ - - if len(self.shape) == 0: - # Same error as numpy - raise TypeError("iteration over a 0-d array") - if start is None: - start = 0 - if end is None or end > self.shape[0]: - end = self.shape[0] - - if not isinstance(start, int) or start < 0: - raise ValueError("start must be a nonnegative integer") - - if not isinstance(end, int) or end < 0: - raise ValueError("end must be a nonnegative integer") - - # Avoid repeatedly decompressing chunks by iterating over the chunks - # in the first dimension. - chunk_size = self.chunks[0] - chunk = None - for j in range(start, end): - if j % chunk_size == 0: - chunk = self[j : j + chunk_size] - # init chunk if we start offset of chunk borders - elif chunk is None: - chunk_start = j - j % chunk_size - chunk_end = chunk_start + chunk_size - chunk = self[chunk_start:chunk_end] - yield chunk[j % chunk_size] - - def __iter__(self): - return self.islice() - - def __len__(self): - if self.shape: - return self.shape[0] - else: - # 0-dimensional array, same error message as numpy - raise TypeError("len() of unsized object") - - def __getitem__(self, selection): - """Retrieve data for an item or region of the array. - - Parameters - ---------- - selection : tuple - An integer index or slice or tuple of int/slice objects specifying the - requested item or region for each dimension of the array. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested region. - - Examples - -------- - Setup a 1-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100)) - - Retrieve a single item:: - - >>> z[5] - 5 - - Retrieve a region via slicing:: - - >>> z[:5] - array([0, 1, 2, 3, 4]) - >>> z[-5:] - array([95, 96, 97, 98, 99]) - >>> z[5:10] - array([5, 6, 7, 8, 9]) - >>> z[5:10:2] - array([5, 7, 9]) - >>> z[::2] - array([ 0, 2, 4, ..., 94, 96, 98]) - - Load the entire array into memory:: - - >>> z[...] 
- array([ 0, 1, 2, ..., 97, 98, 99]) - - Setup a 2-dimensional array:: - - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10)) - - Retrieve an item:: - - >>> z[2, 2] - 22 - - Retrieve a region via slicing:: - - >>> z[1:3, 1:3] - array([[11, 12], - [21, 22]]) - >>> z[1:3, :] - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) - >>> z[:, 1:3] - array([[ 1, 2], - [11, 12], - [21, 22], - [31, 32], - [41, 42], - [51, 52], - [61, 62], - [71, 72], - [81, 82], - [91, 92]]) - >>> z[0:5:2, 0:5:2] - array([[ 0, 2, 4], - [20, 22, 24], - [40, 42, 44]]) - >>> z[::2, ::2] - array([[ 0, 2, 4, 6, 8], - [20, 22, 24, 26, 28], - [40, 42, 44, 46, 48], - [60, 62, 64, 66, 68], - [80, 82, 84, 86, 88]]) - - Load the entire array into memory:: - - >>> z[...] - array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], - [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], - [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], - [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], - [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], - [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) - - For arrays with a structured dtype, specific fields can be retrieved, e.g.:: - - >>> a = np.array([(b'aaa', 1, 4.2), - ... (b'bbb', 2, 8.4), - ... (b'ccc', 3, 12.6)], - ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - >>> z = zarr.v2.array(a) - >>> z['foo'] - array([b'aaa', b'bbb', b'ccc'], - dtype='|S3') - - Notes - ----- - Slices with step > 1 are supported, but slices with negative step are not. - - Currently the implementation for __getitem__ is provided by - :func:`vindex` if the indexing is pure fancy indexing (ie a - broadcast-compatible tuple of integer array indices), or by - :func:`set_basic_selection` otherwise. - - Effectively, this means that the following indexing modes are supported: - - - integer indexing - - slice indexing - - mixed slice and integer indexing - - boolean indexing - - fancy indexing (vectorized list of integers) - - For specific indexing options including outer indexing, see the - methods listed under See Also. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, - set_orthogonal_selection, get_block_selection, set_block_selection, - vindex, oindex, blocks, __setitem__ - - """ - fields, pure_selection = pop_fields(selection) - if is_pure_fancy_indexing(pure_selection, self.ndim): - result = self.vindex[selection] - elif is_pure_orthogonal_indexing(pure_selection, self.ndim): - result = self.get_orthogonal_selection(pure_selection, fields=fields) - else: - result = self.get_basic_selection(pure_selection, fields=fields) - return result - - def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): - """Retrieve data for an item or region of the array. - - Parameters - ---------- - selection : tuple - A tuple specifying the requested item or region for each dimension of the - array. May be any combination of int and/or slice for multidimensional arrays. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested region. 
- - Examples - -------- - Setup a 1-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100)) - - Retrieve a single item:: - - >>> z.get_basic_selection(5) - 5 - - Retrieve a region via slicing:: - - >>> z.get_basic_selection(slice(5)) - array([0, 1, 2, 3, 4]) - >>> z.get_basic_selection(slice(-5, None)) - array([95, 96, 97, 98, 99]) - >>> z.get_basic_selection(slice(5, 10)) - array([5, 6, 7, 8, 9]) - >>> z.get_basic_selection(slice(5, 10, 2)) - array([5, 7, 9]) - >>> z.get_basic_selection(slice(None, None, 2)) - array([ 0, 2, 4, ..., 94, 96, 98]) - - Setup a 2-dimensional array:: - - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10)) - - Retrieve an item:: - - >>> z.get_basic_selection((2, 2)) - 22 - - Retrieve a region via slicing:: - - >>> z.get_basic_selection((slice(1, 3), slice(1, 3))) - array([[11, 12], - [21, 22]]) - >>> z.get_basic_selection((slice(1, 3), slice(None))) - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) - >>> z.get_basic_selection((slice(None), slice(1, 3))) - array([[ 1, 2], - [11, 12], - [21, 22], - [31, 32], - [41, 42], - [51, 52], - [61, 62], - [71, 72], - [81, 82], - [91, 92]]) - >>> z.get_basic_selection((slice(0, 5, 2), slice(0, 5, 2))) - array([[ 0, 2, 4], - [20, 22, 24], - [40, 42, 44]]) - >>> z.get_basic_selection((slice(None, None, 2), slice(None, None, 2))) - array([[ 0, 2, 4, 6, 8], - [20, 22, 24, 26, 28], - [40, 42, 44, 46, 48], - [60, 62, 64, 66, 68], - [80, 82, 84, 86, 88]]) - - For arrays with a structured dtype, specific fields can be retrieved, e.g.:: - - >>> a = np.array([(b'aaa', 1, 4.2), - ... (b'bbb', 2, 8.4), - ... (b'ccc', 3, 12.6)], - ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - >>> z = zarr.v2.array(a) - >>> z.get_basic_selection(slice(2), fields='foo') - array([b'aaa', b'bbb'], - dtype='|S3') - - Notes - ----- - Slices with step > 1 are supported, but slices with negative step are not. - - Currently this method provides the implementation for accessing data via the - square bracket notation (__getitem__). See :func:`__getitem__` for examples - using the alternative notation. 
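One option the examples above do not show is reading into a caller-provided buffer via ``out``; a brief sketch, reusing the array from the examples::

    import numpy as np
    import zarr

    z = zarr.v2.array(np.arange(100))
    buf = np.empty(10, dtype=z.dtype)
    z.get_basic_selection(slice(10), out=buf)  # fills buf in place
    # buf is now array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])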
- - See Also - -------- - set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, - set_orthogonal_selection, get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # refresh metadata - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # handle zero-dimensional arrays - if self._shape == (): - return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) - else: - return self._get_basic_selection_nd(selection=selection, out=out, fields=fields) - - def _get_basic_selection_zd(self, selection, out=None, fields=None): - # special case basic selection for zero-dimensional array - - # check selection is valid - selection = ensure_tuple(selection) - if selection not in ((), (Ellipsis,)): - err_too_many_indices(selection, ()) - - try: - # obtain encoded data for chunk - ckey = self._chunk_key((0,)) - cdata = self.chunk_store[ckey] - - except KeyError: - # chunk not initialized - chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) - if self._fill_value is not None: - chunk.fill(self._fill_value) - - else: - chunk = self._decode_chunk(cdata) - - # handle fields - if fields: - chunk = chunk[fields] - - # handle selection of the scalar value via empty tuple - if out is None: - out = chunk[selection] - else: - out[selection] = chunk[selection] - - return out - - def _get_basic_selection_nd(self, selection, out=None, fields=None): - # implementation of basic selection for array with at least one dimension - - # setup indexer - indexer = BasicIndexer(selection, self) - - return self._get_selection(indexer=indexer, out=out, fields=fields) - - def get_orthogonal_selection(self, selection, out=None, fields=None): - """Retrieve data by making a selection for each dimension of the array. For - example, if an array has 2 dimensions, allows selecting specific rows and/or - columns. The selection for each dimension can be either an integer (indexing a - single item), a slice, an array of integers, or a Boolean array where True - values indicate a selection. - - Parameters - ---------- - selection : tuple - A selection for each dimension of the array. May be any combination of int, - slice, integer array or Boolean array. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested selection. 
- - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10)) - - Retrieve rows and columns via any combination of int, slice, integer array and/or - Boolean array:: - - >>> z.get_orthogonal_selection(([1, 4], slice(None))) - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) - >>> z.get_orthogonal_selection((slice(None), [1, 4])) - array([[ 1, 4], - [11, 14], - [21, 24], - [31, 34], - [41, 44], - [51, 54], - [61, 64], - [71, 74], - [81, 84], - [91, 94]]) - >>> z.get_orthogonal_selection(([1, 4], [1, 4])) - array([[11, 14], - [41, 44]]) - >>> sel = np.zeros(z.shape[0], dtype=bool) - >>> sel[1] = True - >>> sel[4] = True - >>> z.get_orthogonal_selection((sel, sel)) - array([[11, 14], - [41, 44]]) - - For convenience, the orthogonal selection functionality is also available via the - `oindex` property, e.g.:: - - >>> z.oindex[[1, 4], :] - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) - >>> z.oindex[:, [1, 4]] - array([[ 1, 4], - [11, 14], - [21, 24], - [31, 34], - [41, 44], - [51, 54], - [61, 64], - [71, 74], - [81, 84], - [91, 94]]) - >>> z.oindex[[1, 4], [1, 4]] - array([[11, 14], - [41, 44]]) - >>> sel = np.zeros(z.shape[0], dtype=bool) - >>> sel[1] = True - >>> sel[4] = True - >>> z.oindex[sel, sel] - array([[11, 14], - [41, 44]]) - - Notes - ----- - Orthogonal indexing is also known as outer indexing. - - Slices with step > 1 are supported, but slices with negative step are not. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, - get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # refresh metadata - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # setup indexer - indexer = OrthogonalIndexer(selection, self) - - return self._get_selection(indexer=indexer, out=out, fields=fields) - - def get_coordinate_selection(self, selection, out=None, fields=None): - """Retrieve a selection of individual items, by providing the indices - (coordinates) for each selected item. - - Parameters - ---------- - selection : tuple - An integer (coordinate) array for each dimension of the array. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested selection. - - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10)) - - Retrieve items by specifying their coordinates:: - - >>> z.get_coordinate_selection(([1, 4], [1, 4])) - array([11, 44]) - - For convenience, the coordinate selection functionality is also available via the - `vindex` property, e.g.:: - - >>> z.vindex[[1, 4], [1, 4]] - array([11, 44]) - - Notes - ----- - Coordinate indexing is also known as point selection, and is a form of vectorized - or inner indexing. - - Slices are not supported. Coordinate arrays must be provided for all dimensions - of the array. 
- - Coordinate arrays may be multidimensional, in which case the output array will - also be multidimensional. Coordinate arrays are broadcast against each other - before being applied. The shape of the output will be the same as the shape of - each coordinate array after broadcasting. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, - get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # refresh metadata - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # setup indexer - indexer = CoordinateIndexer(selection, self) - - # handle output - need to flatten - if out is not None: - out = out.reshape(-1) - - out = self._get_selection(indexer=indexer, out=out, fields=fields) - - # restore shape - out = out.reshape(indexer.sel_shape) - - return out - - def get_block_selection(self, selection, out=None, fields=None): - """Retrieve a selection of individual chunk blocks, by providing the indices - (coordinates) for each chunk block. - - Parameters - ---------- - selection : tuple - An integer (coordinate) or slice for each dimension of the array. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested selection. - - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10), chunks=(3, 3)) - - Retrieve items by specifying their block coordinates:: - - >>> z.get_block_selection((1, slice(None))) - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - - Which is equivalent to:: - - >>> z[3:6, :] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - - For convenience, the block selection functionality is also available via the - `blocks` property, e.g.:: - - >>> z.blocks[1] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - - Notes - ----- - Block indexing is a convenience indexing method to work on individual chunks - with chunk index slicing. It has the same concept as Dask's `Array.blocks` - indexing. - - Slices are supported. However, only with a step size of one. - - Block index arrays may be multidimensional to index multidimensional arrays. 
- For example:: - - >>> z.blocks[0, 1:3] - array([[ 3, 4, 5, 6, 7, 8], - [13, 14, 15, 16, 17, 18], - [23, 24, 25, 26, 27, 28]]) - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, - set_coordinate_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # setup indexer - indexer = BlockIndexer(selection, self) - - return self._get_selection(indexer=indexer, out=out, fields=fields) - - def get_mask_selection(self, selection, out=None, fields=None): - """Retrieve a selection of individual items, by providing a Boolean array of the - same shape as the array against which the selection is being made, where True - values indicate a selected item. - - Parameters - ---------- - selection : ndarray, bool - A Boolean array of the same shape as the array against which the selection is - being made. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested selection. - - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.array(np.arange(100).reshape(10, 10)) - - Retrieve items by specifying a mask:: - - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[1, 1] = True - >>> sel[4, 4] = True - >>> z.get_mask_selection(sel) - array([11, 44]) - - For convenience, the mask selection functionality is also available via the - `vindex` property, e.g.:: - - >>> z.vindex[sel] - array([11, 44]) - - Notes - ----- - Mask indexing is a form of vectorized or inner indexing, and is equivalent to - coordinate indexing. Internally the mask array is converted to coordinate - arrays by calling `np.nonzero`. - - See Also - -------- - get_basic_selection, set_basic_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, - set_coordinate_selection, get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - """ - - # refresh metadata - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # setup indexer - indexer = MaskIndexer(selection, self) - - return self._get_selection(indexer=indexer, out=out, fields=fields) - - def _get_selection(self, indexer, out=None, fields=None): - # We iterate over all chunks which overlap the selection and thus contain data - # that needs to be extracted. Each chunk is processed in turn, extracting the - # necessary data and storing into the correct location in the output array. - - # N.B., it is an important optimisation that we only visit chunks which overlap - # the selection. This minimises the number of iterations in the main for loop. 
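To make the chunk-visiting optimisation concrete, here is a standalone sketch (deliberately simplified; not zarr's actual indexer) of which chunk indices a 1-d slice overlaps::

    def chunks_overlapping(start, stop, chunk_len):
        # first through last chunk index touched by the half-open range [start, stop)
        return range(start // chunk_len, (stop - 1) // chunk_len + 1)

    list(chunks_overlapping(25, 75, 10))  # [2, 3, 4, 5, 6, 7]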
-
-        # check fields are sensible
-        out_dtype = check_fields(fields, self._dtype)
-
-        # determine output shape
-        out_shape = indexer.shape
-
-        # setup output array
-        if out is None:
-            out = np.empty_like(
-                self._meta_array, shape=out_shape, dtype=out_dtype, order=self._order
-            )
-        else:
-            check_array_shape("out", out, out_shape)
-
-        # iterate over chunks
-
-        if math.prod(out_shape) > 0:
-            # allow storage to get multiple items at once
-            lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
-            self._chunk_getitems(
-                lchunk_coords,
-                lchunk_selection,
-                out,
-                lout_selection,
-                drop_axes=indexer.drop_axes,
-                fields=fields,
-            )
-        if out.shape:
-            return out
-        else:
-            return out[()]
-
-    def __setitem__(self, selection, value):
-        """Modify data for an item or region of the array.
-
-        Parameters
-        ----------
-        selection : tuple
-            An integer index or slice or tuple of int/slice specifying the requested
-            region for each dimension of the array.
-        value : scalar or array-like
-            Value to be stored into the array.
-
-        Examples
-        --------
-        Set up a 1-dimensional array::
-
-            >>> import zarr
-            >>> import numpy as np
-            >>> z = zarr.v2.zeros(100, dtype=int)
-
-        Set all array elements to the same scalar value::
-
-            >>> z[...] = 42
-            >>> z[...]
-            array([42, 42, 42, ..., 42, 42, 42])
-
-        Set a portion of the array::
-
-            >>> z[:10] = np.arange(10)
-            >>> z[-10:] = np.arange(10)[::-1]
-            >>> z[...]
-            array([ 0,  1,  2, ...,  2,  1,  0])
-
-        Set up a 2-dimensional array::
-
-            >>> z = zarr.v2.zeros((5, 5), dtype=int)
-
-        Set all array elements to the same scalar value::
-
-            >>> z[...] = 42
-
-        Set a portion of the array::
-
-            >>> z[0, :] = np.arange(z.shape[1])
-            >>> z[:, 0] = np.arange(z.shape[0])
-            >>> z[...]
-            array([[ 0,  1,  2,  3,  4],
-                   [ 1, 42, 42, 42, 42],
-                   [ 2, 42, 42, 42, 42],
-                   [ 3, 42, 42, 42, 42],
-                   [ 4, 42, 42, 42, 42]])
-
-        For arrays with a structured dtype, specific fields can be modified, e.g.::
-
-            >>> a = np.array([(b'aaa', 1, 4.2),
-            ...               (b'bbb', 2, 8.4),
-            ...               (b'ccc', 3, 12.6)],
-            ...              dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
-            >>> z = zarr.v2.array(a)
-            >>> z['foo'] = b'zzz'
-            >>> z[...]
-            array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'zzz', 3, 12.6)],
-                  dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')])
-
-        Notes
-        -----
-        Slices with step > 1 are supported, but slices with negative step are not.
-
-        Currently the implementation for __setitem__ is provided by
-        :func:`vindex` if the indexing is pure fancy indexing (i.e. a
-        broadcast-compatible tuple of integer array indices), or by
-        :func:`set_basic_selection` otherwise.
-
-        Effectively, this means that the following indexing modes are supported:
-
-          - integer indexing
-          - slice indexing
-          - mixed slice and integer indexing
-          - boolean indexing
-          - fancy indexing (vectorized list of integers)
-
-        For specific indexing options including outer indexing, see the
-        methods listed under See Also.
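A short sketch of the dispatch rule described in the Notes above: integer-array selections route through ``vindex``, slice-based selections through basic selection::

    import zarr

    z = zarr.v2.zeros((5, 5), dtype=int)
    z[[0, 1], [0, 1]] = 9  # pure fancy indexing -> vindex: sets (0, 0) and (1, 1) only
    z[0:2, 0:2] = 7        # slices -> basic selection: sets the whole 2x2 block
    z[0, 0]                # -> 7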
-
-        See Also
-        --------
-        get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection,
-        get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection,
-        set_orthogonal_selection, get_block_selection, set_block_selection,
-        vindex, oindex, blocks, __getitem__
-
-        """
-        fields, pure_selection = pop_fields(selection)
-        if is_pure_fancy_indexing(pure_selection, self.ndim):
-            self.vindex[selection] = value
-        elif is_pure_orthogonal_indexing(pure_selection, self.ndim):
-            self.set_orthogonal_selection(pure_selection, value, fields=fields)
-        else:
-            self.set_basic_selection(pure_selection, value, fields=fields)
-
-    def set_basic_selection(self, selection, value, fields=None):
-        """Modify data for an item or region of the array.
-
-        Parameters
-        ----------
-        selection : tuple
-            An integer index or slice or tuple of int/slice specifying the requested
-            region for each dimension of the array.
-        value : scalar or array-like
-            Value to be stored into the array.
-        fields : str or sequence of str, optional
-            For arrays with a structured dtype, one or more fields can be specified to set
-            data for.
-
-        Examples
-        --------
-        Set up a 1-dimensional array::
-
-            >>> import zarr
-            >>> import numpy as np
-            >>> z = zarr.v2.zeros(100, dtype=int)
-
-        Set all array elements to the same scalar value::
-
-            >>> z.set_basic_selection(..., 42)
-            >>> z[...]
-            array([42, 42, 42, ..., 42, 42, 42])
-
-        Set a portion of the array::
-
-            >>> z.set_basic_selection(slice(10), np.arange(10))
-            >>> z.set_basic_selection(slice(-10, None), np.arange(10)[::-1])
-            >>> z[...]
-            array([ 0,  1,  2, ...,  2,  1,  0])
-
-        Set up a 2-dimensional array::
-
-            >>> z = zarr.v2.zeros((5, 5), dtype=int)
-
-        Set all array elements to the same scalar value::
-
-            >>> z.set_basic_selection(..., 42)
-
-        Set a portion of the array::
-
-            >>> z.set_basic_selection((0, slice(None)), np.arange(z.shape[1]))
-            >>> z.set_basic_selection((slice(None), 0), np.arange(z.shape[0]))
-            >>> z[...]
-            array([[ 0,  1,  2,  3,  4],
-                   [ 1, 42, 42, 42, 42],
-                   [ 2, 42, 42, 42, 42],
-                   [ 3, 42, 42, 42, 42],
-                   [ 4, 42, 42, 42, 42]])
-
-        For arrays with a structured dtype, the `fields` parameter can be used to set
-        data for a specific field, e.g.::
-
-            >>> a = np.array([(b'aaa', 1, 4.2),
-            ...               (b'bbb', 2, 8.4),
-            ...               (b'ccc', 3, 12.6)],
-            ...              dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
-            >>> z = zarr.v2.array(a)
-            >>> z.set_basic_selection(slice(0, 2), b'zzz', fields='foo')
-            >>> z[:]
-            array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'ccc', 3, 12.6)],
-                  dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')])
-
-        Notes
-        -----
-        This method provides the underlying implementation for modifying data via the
-        square bracket notation (__setitem__). See :func:`__setitem__` for equivalent
-        examples using the alternative notation.
-
-        See Also
-        --------
-        get_basic_selection, get_mask_selection, set_mask_selection,
-        get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection,
-        set_orthogonal_selection, get_block_selection, set_block_selection,
-        vindex, oindex, blocks, __getitem__, __setitem__
-
-        """
-
-        # guard conditions
-        if self._read_only:
-            raise ReadOnlyError
-
-        # refresh metadata
-        if not self._cache_metadata:
-            self._load_metadata()
-
-        # handle zero-dimensional arrays
-        if self._shape == ():
-            return self._set_basic_selection_zd(selection, value, fields=fields)
-        else:
-            return self._set_basic_selection_nd(selection, value, fields=fields)
-
-    def set_orthogonal_selection(self, selection, value, fields=None):
-        """Modify data via a selection for each dimension of the array.
-
-        Parameters
-        ----------
-        selection : tuple
-            A selection for each dimension of the array. May be any combination of int,
-            slice, integer array or Boolean array.
-        value : scalar or array-like
-            Value to be stored into the array.
-        fields : str or sequence of str, optional
-            For arrays with a structured dtype, one or more fields can be specified to set
-            data for.
-
-        Examples
-        --------
-        Set up a 2-dimensional array::
-
-            >>> import zarr
-            >>> import numpy as np
-            >>> z = zarr.v2.zeros((5, 5), dtype=int)
-
-        Set data for a selection of rows::
-
-            >>> z.set_orthogonal_selection(([1, 4], slice(None)), 1)
-            >>> z[...]
-            array([[0, 0, 0, 0, 0],
-                   [1, 1, 1, 1, 1],
-                   [0, 0, 0, 0, 0],
-                   [0, 0, 0, 0, 0],
-                   [1, 1, 1, 1, 1]])
-
-        Set data for a selection of columns::
-
-            >>> z.set_orthogonal_selection((slice(None), [1, 4]), 2)
-            >>> z[...]
-            array([[0, 2, 0, 0, 2],
-                   [1, 2, 1, 1, 2],
-                   [0, 2, 0, 0, 2],
-                   [0, 2, 0, 0, 2],
-                   [1, 2, 1, 1, 2]])
-
-        Set data for a selection of rows and columns::
-
-            >>> z.set_orthogonal_selection(([1, 4], [1, 4]), 3)
-            >>> z[...]
-            array([[0, 2, 0, 0, 2],
-                   [1, 3, 1, 1, 3],
-                   [0, 2, 0, 0, 2],
-                   [0, 2, 0, 0, 2],
-                   [1, 3, 1, 1, 3]])
-
-        For convenience, this functionality is also available via the `oindex` property.
-        E.g.::
-
-            >>> z.oindex[[1, 4], [1, 4]] = 4
-            >>> z[...]
- array([[0, 2, 0, 0, 2], - [1, 4, 1, 1, 4], - [0, 2, 0, 0, 2], - [0, 2, 0, 0, 2], - [1, 4, 1, 1, 4]]) - - Notes - ----- - Orthogonal indexing is also known as outer indexing. - - Slices with step > 1 are supported, but slices with negative step are not. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, - get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # guard conditions - if self._read_only: - raise ReadOnlyError - - # refresh metadata - if not self._cache_metadata: - self._load_metadata_nosync() - - # setup indexer - indexer = OrthogonalIndexer(selection, self) - - self._set_selection(indexer, value, fields=fields) - - def set_coordinate_selection(self, selection, value, fields=None): - """Modify a selection of individual items, by providing the indices (coordinates) - for each item to be modified. - - Parameters - ---------- - selection : tuple - An integer (coordinate) array for each dimension of the array. - value : scalar or array-like - Value to be stored into the array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to set - data for. - - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.zeros((5, 5), dtype=int) - - Set data for a selection of items:: - - >>> z.set_coordinate_selection(([1, 4], [1, 4]), 1) - >>> z[...] - array([[0, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 1]]) - - For convenience, this functionality is also available via the `vindex` property. - E.g.:: - - >>> z.vindex[[1, 4], [1, 4]] = 2 - >>> z[...] - array([[0, 0, 0, 0, 0], - [0, 2, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 2]]) - - Notes - ----- - Coordinate indexing is also known as point selection, and is a form of vectorized - or inner indexing. - - Slices are not supported. Coordinate arrays must be provided for all dimensions - of the array. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, - get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # guard conditions - if self._read_only: - raise ReadOnlyError - - # refresh metadata - if not self._cache_metadata: - self._load_metadata_nosync() - - # setup indexer - indexer = CoordinateIndexer(selection, self) - - # handle value - need ndarray-like flatten value - if not is_scalar(value, self._dtype): - try: - value = ensure_ndarray_like(value) - except TypeError: - # Handle types like `list` or `tuple` - value = np.array(value, like=self._meta_array) - if hasattr(value, "shape") and len(value.shape) > 1: - value = value.reshape(-1) - - self._set_selection(indexer, value, fields=fields) - - def set_block_selection(self, selection, value, fields=None): - """Modify a selection of individual blocks, by providing the chunk indices - (coordinates) for each block to be modified. - - Parameters - ---------- - selection : tuple - An integer (coordinate) or slice for each dimension of the array. - value : scalar or array-like - Value to be stored into the array. 
- fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to set - data for. - - Examples - -------- - Set up a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.zeros((6, 6), dtype=int, chunks=2) - - Set data for a selection of items:: - - >>> z.set_block_selection((1, 0), 1) - >>> z[...] - array([[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]]) - - For convenience, this functionality is also available via the `blocks` property. - E.g.:: - - >>> z.blocks[2, 1] = 4 - >>> z[...] - array([[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 4, 4, 0, 0], - [0, 0, 4, 4, 0, 0]]) - - >>> z.blocks[:, 2] = 7 - >>> z[...] - array([[0, 0, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [0, 0, 4, 4, 7, 7], - [0, 0, 4, 4, 7, 7]]) - - Notes - ----- - Block indexing is a convenience indexing method to work on individual chunks - with chunk index slicing. It has the same concept as Dask's `Array.blocks` - indexing. - - Slices are supported. However, only with a step size of one. - - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, - get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - # guard conditions - if self._read_only: - raise ReadOnlyError - - # refresh metadata - if not self._cache_metadata: - self._load_metadata_nosync() - - # setup indexer - indexer = BlockIndexer(selection, self) - - self._set_selection(indexer, value, fields=fields) - - def set_mask_selection(self, selection, value, fields=None): - """Modify a selection of individual items, by providing a Boolean array of the - same shape as the array against which the selection is being made, where True - values indicate a selected item. - - Parameters - ---------- - selection : ndarray, bool - A Boolean array of the same shape as the array against which the selection is - being made. - value : scalar or array-like - Value to be stored into the array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to set - data for. - - Examples - -------- - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.v2.zeros((5, 5), dtype=int) - - Set data for a selection of items:: - - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[1, 1] = True - >>> sel[4, 4] = True - >>> z.set_mask_selection(sel, 1) - >>> z[...] - array([[0, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 1]]) - - For convenience, this functionality is also available via the `vindex` property. - E.g.:: - - >>> z.vindex[sel] = 2 - >>> z[...] - array([[0, 0, 0, 0, 0], - [0, 2, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 2]]) - - Notes - ----- - Mask indexing is a form of vectorized or inner indexing, and is equivalent to - coordinate indexing. Internally the mask array is converted to coordinate - arrays by calling `np.nonzero`. 
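The equivalence mentioned above can be seen directly; a small sketch::

    import numpy as np

    mask = np.zeros((5, 5), dtype=bool)
    mask[1, 1] = mask[4, 4] = True
    np.nonzero(mask)  # (array([1, 4]), array([1, 4]))
    # so z.set_mask_selection(mask, v) behaves like
    # z.set_coordinate_selection(([1, 4], [1, 4]), v)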
- - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, - set_coordinate_selection, get_block_selection, set_block_selection, - vindex, oindex, blocks, __getitem__, __setitem__ - - """ - - # guard conditions - if self._read_only: - raise ReadOnlyError - - # refresh metadata - if not self._cache_metadata: - self._load_metadata_nosync() - - # setup indexer - indexer = MaskIndexer(selection, self) - - self._set_selection(indexer, value, fields=fields) - - def _set_basic_selection_zd(self, selection, value, fields=None): - # special case __setitem__ for zero-dimensional array - - # check selection is valid - selection = ensure_tuple(selection) - if selection not in ((), (Ellipsis,)): - err_too_many_indices(selection, self._shape) - - # check fields - check_fields(fields, self._dtype) - fields = check_no_multi_fields(fields) - - # obtain key for chunk - ckey = self._chunk_key((0,)) - - # setup chunk - try: - # obtain compressed data for chunk - cdata = self.chunk_store[ckey] - - except KeyError: - # chunk not initialized - chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) - if self._fill_value is not None: - chunk.fill(self._fill_value) - - else: - # decode chunk - chunk = self._decode_chunk(cdata).copy() - - # set value - if fields: - chunk[fields][selection] = value - else: - chunk[selection] = value - - # remove chunk if write_empty_chunks is false and it only contains the fill value - if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): - try: - del self.chunk_store[ckey] - return - except Exception: # pragma: no cover - # deleting failed, fallback to overwriting - pass - else: - # encode and store - cdata = self._encode_chunk(chunk) - self.chunk_store[ckey] = cdata - - def _set_basic_selection_nd(self, selection, value, fields=None): - # implementation of __setitem__ for array with at least one dimension - - # setup indexer - indexer = BasicIndexer(selection, self) - - self._set_selection(indexer, value, fields=fields) - - def _set_selection(self, indexer, value, fields=None): - # We iterate over all chunks which overlap the selection and thus contain data - # that needs to be replaced. Each chunk is processed in turn, extracting the - # necessary data from the value array and storing into the chunk array. - - # N.B., it is an important optimisation that we only visit chunks which overlap - # the selection. This minimises the number of iterations in the main for loop. 
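The fill-value short-circuit in `_set_basic_selection_zd` above is what the ``write_empty_chunks=False`` option enables; a hedged sketch of the observable effect (key names assume the default flat chunk layout)::

    import zarr

    z = zarr.v2.zeros(100, chunks=10, dtype=int, write_empty_chunks=False)
    z[0:10] = 0   # chunk uniformly equal to the fill value: key "0" is not stored
    z[10:20] = 1  # chunk holding real data: key "1" is written
    sorted(k for k in z.store if not k.startswith("."))  # ['1']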
- - # check fields are sensible - check_fields(fields, self._dtype) - fields = check_no_multi_fields(fields) - - # determine indices of chunks overlapping the selection - sel_shape = indexer.shape - - # check value shape - if sel_shape == (): - # setting a single item - pass - elif is_scalar(value, self._dtype): - # setting a scalar value - pass - else: - if not hasattr(value, "shape"): - value = np.asanyarray(value, like=self._meta_array) - check_array_shape("value", value, sel_shape) - - # iterate over chunks in range - if ( - not hasattr(self.chunk_store, "setitems") - or self._synchronizer is not None - or any(map(lambda x: x == 0, self.shape)) - ): - # iterative approach - for chunk_coords, chunk_selection, out_selection in indexer: - # extract data to store - if sel_shape == (): - chunk_value = value - elif is_scalar(value, self._dtype): - chunk_value = value - else: - chunk_value = value[out_selection] - # handle missing singleton dimensions - if indexer.drop_axes: - item = [slice(None)] * self.ndim - for a in indexer.drop_axes: - item[a] = np.newaxis - item = tuple(item) - chunk_value = chunk_value[item] - - # put data - self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) - else: - lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) - chunk_values = [] - for out_selection in lout_selection: - if sel_shape == (): - chunk_values.append(value) - elif is_scalar(value, self._dtype): - chunk_values.append(value) - else: - cv = value[out_selection] - # handle missing singleton dimensions - if indexer.drop_axes: # pragma: no cover - item = [slice(None)] * self.ndim - for a in indexer.drop_axes: - item[a] = np.newaxis - item = tuple(item) - cv = chunk_value[item] - chunk_values.append(cv) - - self._chunk_setitems(lchunk_coords, lchunk_selection, chunk_values, fields=fields) - - def _process_chunk( - self, - out, - cdata, - chunk_selection, - drop_axes, - out_is_ndarray, - fields, - out_selection, - partial_read_decode=False, - ): - """Take binary data from storage and fill output array""" - if ( - out_is_ndarray - and not fields - and is_contiguous_selection(out_selection) - and is_total_slice(chunk_selection, self._chunks) - and not self._filters - and self._dtype != object - ): - dest = out[out_selection] - # Assume that array-like objects that doesn't have a - # `writeable` flag is writable. 
- dest_is_writable = getattr(dest, "writeable", True) - write_direct = dest_is_writable and ( - (self._order == "C" and dest.flags.c_contiguous) - or (self._order == "F" and dest.flags.f_contiguous) - ) - - if write_direct: - # optimization: we want the whole chunk, and the destination is - # contiguous, so we can decompress directly from the chunk - # into the destination array - if self._compressor: - if isinstance(cdata, PartialReadBuffer): - cdata = cdata.read_full() - self._compressor.decode(cdata, dest) - else: - chunk = ensure_ndarray_like(cdata).view(self._dtype) - chunk = chunk.reshape(self._chunks, order=self._order) - np.copyto(dest, chunk) - return - - # decode chunk - try: - if partial_read_decode: - cdata.prepare_chunk() - # size of chunk - tmp = np.empty_like(self._meta_array, shape=self._chunks, dtype=self.dtype) - index_selection = PartialChunkIterator(chunk_selection, self.chunks) - for start, nitems, partial_out_selection in index_selection: - expected_shape = [ - len(range(*partial_out_selection[i].indices(self.chunks[0] + 1))) - if i < len(partial_out_selection) - else dim - for i, dim in enumerate(self.chunks) - ] - cdata.read_part(start, nitems) - chunk_partial = self._decode_chunk( - cdata.buff, - start=start, - nitems=nitems, - expected_shape=expected_shape, - ) - tmp[partial_out_selection] = chunk_partial - out[out_selection] = tmp[chunk_selection] - return - except ArrayIndexError: - cdata = cdata.read_full() - chunk = self._decode_chunk(cdata) - - # select data from chunk - if fields: - chunk = chunk[fields] - tmp = chunk[chunk_selection] - if drop_axes: - tmp = np.squeeze(tmp, axis=drop_axes) - - # store selected data in output - out[out_selection] = tmp - - def _chunk_getitems( - self, lchunk_coords, lchunk_selection, out, lout_selection, drop_axes=None, fields=None - ): - """Obtain part or whole of chunks. - - Parameters - ---------- - chunk_coords : list of tuple of ints - Indices of the chunks. - chunk_selection : list of selections - Location of region within the chunks to extract. - out : ndarray - Array to store result in. - out_selection : list of selections - Location of regions within output array to store results in. - drop_axes : tuple of ints - Axes to squeeze out of the chunk. 
- fields - TODO - """ - - out_is_ndarray = True - try: - out = ensure_ndarray_like(out) - except TypeError: # pragma: no cover - out_is_ndarray = False - - # Keys to retrieve - ckeys = [self._chunk_key(ch) for ch in lchunk_coords] - - # Check if we can do a partial read - if ( - self._partial_decompress - and self._compressor - and self._compressor.codec_id == "blosc" - and hasattr(self._compressor, "decode_partial") - and not fields - and self.dtype != object - and hasattr(self.chunk_store, "getitems") - ): - partial_read_decode = True - cdatas = { - ckey: PartialReadBuffer(ckey, self.chunk_store) - for ckey in ckeys - if ckey in self.chunk_store - } - elif ( - self._partial_decompress - and not self._compressor - and not fields - and self.dtype != object - and hasattr(self.chunk_store, "get_partial_values") - and self.chunk_store.supports_efficient_get_partial_values - ): - partial_read_decode = True - cdatas = { - ckey: UncompressedPartialReadBufferV3( - ckey, self.chunk_store, itemsize=self.itemsize - ) - for ckey in ckeys - if ckey in self.chunk_store - } - elif hasattr(self.chunk_store, "get_partial_values"): - partial_read_decode = False - values = self.chunk_store.get_partial_values([(ckey, (0, None)) for ckey in ckeys]) - cdatas = {key: value for key, value in zip(ckeys, values) if value is not None} - else: - partial_read_decode = False - contexts = {} - if not isinstance(self._meta_array, np.ndarray): - contexts = ConstantMap(ckeys, constant=Context(meta_array=self._meta_array)) - cdatas = self.chunk_store.getitems(ckeys, contexts=contexts) - - for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection): - if ckey in cdatas: - self._process_chunk( - out, - cdatas[ckey], - chunk_select, - drop_axes, - out_is_ndarray, - fields, - out_select, - partial_read_decode=partial_read_decode, - ) - else: - # check exception type - if self._fill_value is not None: - if fields: - fill_value = self._fill_value[fields] - else: - fill_value = self._fill_value - out[out_select] = fill_value - - def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): - ckeys = map(self._chunk_key, lchunk_coords) - cdatas = { - key: self._process_for_setitem(key, sel, val, fields=fields) - for key, sel, val in zip(ckeys, lchunk_selection, values) - } - to_store = {} - if not self.write_empty_chunks: - empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} - self._chunk_delitems(empty_chunks.keys()) - nonempty_keys = cdatas.keys() - empty_chunks.keys() - to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} - else: - to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} - self.chunk_store.setitems(to_store) - - def _chunk_delitems(self, ckeys): - if hasattr(self.store, "delitems"): - self.store.delitems(ckeys) - else: # pragma: no cover - # exempting this branch from coverage as there are no extant stores - # that will trigger this condition, but it's possible that they - # will be developed in the future. - tuple(map(self._chunk_delitem, ckeys)) - - def _chunk_delitem(self, ckey): - """ - Attempt to delete the value associated with ckey. - """ - try: - del self.chunk_store[ckey] - except KeyError: - pass - - def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): - """Replace part or whole of a chunk. - - Parameters - ---------- - chunk_coords : tuple of ints - Indices of the chunk. - chunk_selection : tuple of slices - Location of region within the chunk. 
- value : scalar or ndarray - Value to set. - - """ - - if self._synchronizer is None: - # no synchronization - lock = nolock - else: - # synchronize on the chunk - ckey = self._chunk_key(chunk_coords) - lock = self._synchronizer[ckey] - - with lock: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) - - def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): - ckey = self._chunk_key(chunk_coords) - cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) - - # attempt to delete chunk if it only contains the fill value - if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): - self._chunk_delitem(ckey) - else: - self.chunk_store[ckey] = self._encode_chunk(cdata) - - def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): - if is_total_slice(chunk_selection, self._chunks) and not fields: - # totally replace chunk - - # optimization: we are completely replacing the chunk, so no need - # to access the existing chunk data - - if is_scalar(value, self._dtype): - # setup array filled with value - chunk = np.empty_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order - ) - chunk.fill(value) - - else: - # ensure array is contiguous - chunk = value.astype(self._dtype, order=self._order, copy=False) - - else: - # partially replace the contents of this chunk - - try: - # obtain compressed data for chunk - cdata = self.chunk_store[ckey] - - except KeyError: - # chunk not initialized - if self._fill_value is not None: - chunk = np.empty_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order - ) - chunk.fill(self._fill_value) - elif self._dtype == object: - chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) - else: - # N.B., use zeros here so any region beyond the array has consistent - # and compressible data - chunk = np.zeros_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order - ) - - else: - # decode chunk - chunk = self._decode_chunk(cdata) - if not chunk.flags.writeable: - chunk = chunk.copy(order="K") - - # modify - if fields: - # N.B., currently multi-field assignment is not supported in numpy, so - # this only works for a single field - chunk[fields][chunk_selection] = value - else: - chunk[chunk_selection] = value - - return chunk - - def _chunk_key(self, chunk_coords): - return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) - - def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): - # decompress - if self._compressor: - # only decode requested items - if ( - all(x is not None for x in [start, nitems]) and self._compressor.codec_id == "blosc" - ) and hasattr(self._compressor, "decode_partial"): - chunk = self._compressor.decode_partial(cdata, start, nitems) - else: - chunk = self._compressor.decode(cdata) - else: - chunk = cdata - - # apply filters - if self._filters: - for f in reversed(self._filters): - chunk = f.decode(chunk) - - # view as numpy array with correct dtype - chunk = ensure_ndarray_like(chunk) - # special case object dtype, because incorrect handling can lead to - # segfaults and other bad things happening - if self._dtype != object: - chunk = chunk.view(self._dtype) - elif chunk.dtype != object: - # If we end up here, someone must have hacked around with the filters. 
- # We cannot deal with object arrays unless there is an object - # codec in the filter chain, i.e., a filter that converts from object - # array to something else during encoding, and converts back to object - # array during decoding. - raise RuntimeError("cannot read object array without object codec") - - # ensure correct chunk shape - chunk = chunk.reshape(-1, order="A") - chunk = chunk.reshape(expected_shape or self._chunks, order=self._order) - - return chunk - - def _encode_chunk(self, chunk): - # apply filters - if self._filters: - for f in self._filters: - chunk = f.encode(chunk) - - # check object encoding - if ensure_ndarray_like(chunk).dtype == object: - raise RuntimeError("cannot write object array without object codec") - - # compress - if self._compressor: - cdata = self._compressor.encode(chunk) - else: - cdata = chunk - - # ensure in-memory data is immutable and easy to compare - if isinstance(self.chunk_store, KVStore) or isinstance(self._chunk_store, KVStore): - cdata = ensure_bytes(cdata) - - return cdata - - def __repr__(self): - t = type(self) - r = "<{}.{}".format(t.__module__, t.__name__) - if self.name: - r += " %r" % self.name - r += " %s" % str(self.shape) - r += " %s" % self.dtype - if self._read_only: - r += " read-only" - r += ">" - return r - - @property - def info(self): - """Report some diagnostic information about the array. - - Examples - -------- - >>> import zarr - >>> z = zarr.v2.zeros(1000000, chunks=100000, dtype='i4') - >>> z.info - Type : zarr.v2.core.Array - Data type : int32 - Shape : (1000000,) - Chunk shape : (100000,) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.v2.storage.KVStore - No. bytes : 4000000 (3.8M) - No. bytes stored : 320 - Storage ratio : 12500.0 - Chunks initialized : 0/10 - - """ - return self._info_reporter - - def info_items(self): - return self._synchronized_op(self._info_items_nosync) - - def _info_items_nosync(self): - def typestr(o): - return "{}.{}".format(type(o).__module__, type(o).__name__) - - def bytestr(n): - if n > 2**10: - return "{} ({})".format(n, human_readable_size(n)) - else: - return str(n) - - items = [] - - # basic info - if self.name is not None: - items += [("Name", self.name)] - items += [ - ("Type", typestr(self)), - ("Data type", "%s" % self.dtype), - ("Shape", str(self.shape)), - ("Chunk shape", str(self.chunks)), - ("Order", self.order), - ("Read-only", str(self.read_only)), - ] - - # filters - if self.filters: - for i, f in enumerate(self.filters): - items += [("Filter [%s]" % i, repr(f))] - - # compressor - items += [("Compressor", repr(self.compressor))] - - # synchronizer - if self._synchronizer is not None: - items += [("Synchronizer type", typestr(self._synchronizer))] - - # storage info - items += [("Store type", typestr(self._store))] - if self._chunk_store is not None: - items += [("Chunk store type", typestr(self._chunk_store))] - items += [("No. bytes", bytestr(self.nbytes))] - if self.nbytes_stored > 0: - items += [ - ("No. bytes stored", bytestr(self.nbytes_stored)), - ("Storage ratio", "%.1f" % (self.nbytes / self.nbytes_stored)), - ] - items += [("Chunks initialized", "{}/{}".format(self.nchunks_initialized, self.nchunks))] - - return items - - def digest(self, hashname="sha1"): - """ - Compute a checksum for the data. Default uses sha1 for speed. 
- - Examples - -------- - >>> import binascii - >>> import zarr - >>> z = zarr.v2.empty(shape=(10000, 10000), chunks=(1000, 1000)) - >>> binascii.hexlify(z.digest()) - b'041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' - >>> z = zarr.v2.zeros(shape=(10000, 10000), chunks=(1000, 1000)) - >>> binascii.hexlify(z.digest()) - b'7162d416d26a68063b66ed1f30e0a866e4abed60' - >>> z = zarr.v2.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) - >>> binascii.hexlify(z.digest()) - b'cb387af37410ae5a3222e893cf3373e4e4f22816' - """ - - h = hashlib.new(hashname) - - for i in itertools.product(*[range(s) for s in self.cdata_shape]): - h.update(self.chunk_store.get(self._chunk_key(i), b"")) - - mkey = _prefix_to_array_key(self._store, self._key_prefix) - h.update(self.store.get(mkey, b"")) - - h.update(self.store.get(self.attrs.key, b"")) - - checksum = h.digest() - - return checksum - - def hexdigest(self, hashname="sha1"): - """ - Compute a checksum for the data. Default uses sha1 for speed. - - Examples - -------- - >>> import zarr - >>> z = zarr.v2.empty(shape=(10000, 10000), chunks=(1000, 1000)) - >>> z.hexdigest() - '041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' - >>> z = zarr.v2.zeros(shape=(10000, 10000), chunks=(1000, 1000)) - >>> z.hexdigest() - '7162d416d26a68063b66ed1f30e0a866e4abed60' - >>> z = zarr.v2.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) - >>> z.hexdigest() - 'cb387af37410ae5a3222e893cf3373e4e4f22816' - """ - - checksum = binascii.hexlify(self.digest(hashname=hashname)) - - # This is a bytes object on Python 3 and we want a str. - if not isinstance(checksum, str): - checksum = checksum.decode("utf8") - - return checksum - - def __getstate__(self): - return { - "store": self._store, - "path": self._path, - "read_only": self._read_only, - "chunk_store": self._chunk_store, - "synchronizer": self._synchronizer, - "cache_metadata": self._cache_metadata, - "cache_attrs": self._attrs.cache, - "partial_decompress": self._partial_decompress, - "write_empty_chunks": self._write_empty_chunks, - "meta_array": self._meta_array, - } - - def __setstate__(self, state): - self.__init__(**state) - - def _synchronized_op(self, f, *args, **kwargs): - if self._synchronizer is None: - # no synchronization - lock = nolock - - else: - # synchronize on the array - mkey = _prefix_to_array_key(self._store, self._key_prefix) - lock = self._synchronizer[mkey] - - with lock: - self._refresh_metadata_nosync() - result = f(*args, **kwargs) - - return result - - def _write_op(self, f, *args, **kwargs): - # guard condition - if self._read_only: - raise ReadOnlyError - - return self._synchronized_op(f, *args, **kwargs) - - def resize(self, *args): - """Change the shape of the array by growing or shrinking one or more - dimensions. - - Examples - -------- - >>> import zarr - >>> z = zarr.v2.zeros(shape=(10000, 10000), chunks=(1000, 1000)) - >>> z.shape - (10000, 10000) - >>> z.resize(20000, 10000) - >>> z.shape - (20000, 10000) - >>> z.resize(30000, 1000) - >>> z.shape - (30000, 1000) - - Notes - ----- - When resizing an array, the data are not rearranged in any way. - - If one or more dimensions are shrunk, any chunks falling outside the - new array shape will be deleted from the underlying store. - However, it is noteworthy that the chunks partially falling inside the new array - (i.e. boundary chunks) will remain intact, and therefore, - the data falling outside the new array but inside the boundary chunks - would be restored by a subsequent resize operation that grows the array size. 
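[Editor's note, not part of the diff: the boundary-chunk behaviour described in the Notes above is easy to miss. A minimal sketch, assuming a classic zarr 2.x install where this module lived at the top level (`zarr.zeros`, `Array.resize`) rather than under the `zarr.v2` namespace being deleted here:

```python
# Sketch of the resize semantics described above (zarr 2.x API assumed).
import numpy as np
import zarr

z = zarr.zeros((4, 4), chunks=(2, 2), dtype="i4")
z[:] = np.arange(16).reshape(4, 4)

z.resize(3, 4)   # shrink: row 3 is now out of range, but the boundary
                 # chunks covering rows 2-3 stay intact in the store
z.resize(4, 4)   # grow back: row 3 "reappears" from the retained chunks
print(z[3, :])   # [12 13 14 15], not the fill value
```

Chunks that fall entirely outside the new shape are deleted by `_resize_nosync` below, so they would come back as `fill_value` instead.]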
- - """ - - return self._write_op(self._resize_nosync, *args) - - def _resize_nosync(self, *args): - # normalize new shape argument - old_shape = self._shape - new_shape = normalize_resize_args(old_shape, *args) - old_cdata_shape = self._cdata_shape - - # update metadata - self._shape = new_shape - self._flush_metadata_nosync() - - # determine the new number and arrangement of chunks - chunks = self._chunks - new_cdata_shape = tuple(math.ceil(s / c) for s, c in zip(new_shape, chunks)) - - # remove any chunks not within range - # The idea is that, along each dimension, - # only find and remove the chunk slices that exist in 'old' but not 'new' data. - # Note that a mutable list ('old_cdata_shape_working_list') is introduced here - # to dynamically adjust the number of chunks along the already-processed dimensions - # in order to avoid duplicate chunk removal. - chunk_store = self.chunk_store - old_cdata_shape_working_list = list(old_cdata_shape) - for idx_cdata, (val_old_cdata, val_new_cdata) in enumerate( - zip(old_cdata_shape_working_list, new_cdata_shape) - ): - for cidx in itertools.product( - *[ - range(n_new, n_old) if (idx == idx_cdata) else range(n_old) - for idx, (n_old, n_new) in enumerate( - zip(old_cdata_shape_working_list, new_cdata_shape) - ) - ] - ): - key = self._chunk_key(cidx) - try: - del chunk_store[key] - except KeyError: - # chunk not initialized - pass - old_cdata_shape_working_list[idx_cdata] = min(val_old_cdata, val_new_cdata) - - def append(self, data, axis=0): - """Append `data` to `axis`. - - Parameters - ---------- - data : array-like - Data to be appended. - axis : int - Axis along which to append. - - Returns - ------- - new_shape : tuple - - Notes - ----- - The size of all dimensions other than `axis` must match between this - array and `data`. - - Examples - -------- - >>> import numpy as np - >>> import zarr - >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) - >>> z = zarr.v2.array(a, chunks=(1000, 100)) - >>> z.shape - (10000, 1000) - >>> z.append(a) - (20000, 1000) - >>> z.append(np.vstack([a, a]), axis=1) - (20000, 2000) - >>> z.shape - (20000, 2000) - - """ - return self._write_op(self._append_nosync, data, axis=axis) - - def _append_nosync(self, data, axis=0): - # ensure data is array-like - if not hasattr(data, "shape"): - data = np.asanyarray(data, like=self._meta_array) - - # ensure shapes are compatible for non-append dimensions - self_shape_preserved = tuple(s for i, s in enumerate(self._shape) if i != axis) - data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) - if self_shape_preserved != data_shape_preserved: - raise ValueError( - "shape of data to append is not compatible with the array; " - "all dimensions must match except for the dimension being " - "appended" - ) - - # remember old shape - old_shape = self._shape - - # determine new shape - new_shape = tuple( - self._shape[i] if i != axis else self._shape[i] + data.shape[i] - for i in range(len(self._shape)) - ) - - # resize - self._resize_nosync(new_shape) - - # store data - # noinspection PyTypeChecker - append_selection = tuple( - slice(None) if i != axis else slice(old_shape[i], new_shape[i]) - for i in range(len(self._shape)) - ) - self[append_selection] = data - - return new_shape - - def view( - self, - shape=None, - chunks=None, - dtype=None, - fill_value=None, - filters=None, - read_only=None, - synchronizer=None, - ): - """Return an array sharing the same data. - - Parameters - ---------- - shape : int or tuple of ints - Array shape. 
- chunks : int or tuple of ints, optional
- Chunk shape.
- dtype : string or dtype, optional
- NumPy dtype.
- fill_value : object
- Default value to use for uninitialized portions of the array.
- filters : sequence, optional
- Sequence of filters to use to encode chunk data prior to
- compression.
- read_only : bool, optional
- True if array should be protected against modification.
- synchronizer : object, optional
- Array synchronizer.
-
- Notes
- -----
- WARNING: This is an experimental feature and should be used with care.
- There are plenty of ways to generate errors and/or cause data
- corruption.
-
- Examples
- --------
-
- Bypass filters:
-
- >>> import zarr
- >>> import numpy as np
- >>> np.random.seed(42)
- >>> labels = ['female', 'male']
- >>> data = np.random.choice(labels, size=10000)
- >>> filters = [zarr.v2.Categorize(labels=labels,
- ... dtype=data.dtype,
- ... astype='u1')]
- >>> a = zarr.v2.array(data, chunks=1000, filters=filters)
- >>> a[:]
- array(['female', 'male', 'female', ..., 'male', 'male', 'female'],
- dtype='<U6')
- >>> v = a.view(dtype='u1', filters=[])
- >>> v.is_view
- True
- >>> v[:]
- array([1, 2, 1, ..., 2, 2, 1], dtype=uint8)
-
- Views can be used to modify data:
-
- >>> x = v[:]
- >>> x.sort()
- >>> v[:] = x
- >>> v[:]
- array([1, 1, 1, ..., 2, 2, 2], dtype=uint8)
- >>> a[:]
- array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
- dtype='<U6')
-
- View as a different dtype with the same item size:
-
- >>> data = np.random.randint(0, 2, size=10000, dtype='u1')
- >>> a = zarr.v2.array(data, chunks=1000)
- >>> a[:]
- array([0, 0, 1, ..., 1, 0, 0], dtype=uint8)
- >>> v = a.view(dtype=bool)
- >>> v[:]
- array([False, False, True, ..., True, False, False])
- >>> np.all(a[:].view(dtype=bool) == v[:])
- True
-
- An array can be viewed with a dtype with a different item size, however
- some care is needed to adjust the shape and chunk shape so that chunk
- data is interpreted correctly:
-
- >>> data = np.arange(10000, dtype='u2')
- >>> a = zarr.v2.array(data, chunks=1000)
- >>> a[:10]
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint16)
- >>> v = a.view(dtype='u1', shape=20000, chunks=2000)
- >>> v[:10]
- array([0, 0, 1, 0, 2, 0, 3, 0, 4, 0], dtype=uint8)
- >>> np.all(a[:].view('u1') == v[:])
- True
-
- Change fill value for uninitialized chunks:
-
- >>> a = zarr.v2.full(10000, chunks=1000, fill_value=-1, dtype='i1')
- >>> a[:]
- array([-1, -1, -1, ..., -1, -1, -1], dtype=int8)
- >>> v = a.view(fill_value=42)
- >>> v[:]
- array([42, 42, 42, ..., 42, 42, 42], dtype=int8)
-
- Note that resizing or appending to views is not permitted:
-
- >>> a = zarr.v2.empty(10000)
- >>> v = a.view()
- >>> try:
- ... v.resize(20000)
- ... except PermissionError as e:
- ... 
print(e) - operation not permitted for views - - """ - - store = self._store - chunk_store = self._chunk_store - path = self._path - if read_only is None: - read_only = self._read_only - if synchronizer is None: - synchronizer = self._synchronizer - a = Array( - store=store, - path=path, - chunk_store=chunk_store, - read_only=read_only, - synchronizer=synchronizer, - cache_metadata=True, - ) - a._is_view = True - - # allow override of some properties - if dtype is None: - dtype = self._dtype - else: - dtype = np.dtype(dtype) - a._dtype = dtype - if shape is None: - shape = self._shape - else: - shape = normalize_shape(shape) - a._shape = shape - if chunks is not None: - chunks = normalize_chunks(chunks, shape, dtype.itemsize) - a._chunks = chunks - if fill_value is not None: - a._fill_value = fill_value - if filters is not None: - a._filters = filters - - return a - - def astype(self, dtype): - """Returns a view that does on the fly type conversion of the underlying data. - - Parameters - ---------- - dtype : string or dtype - NumPy dtype. - - Notes - ----- - This method returns a new Array object which is a view on the same - underlying chunk data. Modifying any data via the view is currently - not permitted and will result in an error. This is an experimental - feature and its behavior is subject to change in the future. - - See Also - -------- - Array.view - - Examples - -------- - - >>> import zarr - >>> import numpy as np - >>> data = np.arange(100, dtype=np.uint8) - >>> a = zarr.v2.array(data, chunks=10) - >>> a[:] - array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99], dtype=uint8) - >>> v = a.astype(np.float32) - >>> v.is_view - True - >>> v[:] - array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., - 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., - 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., - 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., - 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., - 60., 61., 62., 63., 64., 65., 66., 67., 68., 69., - 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., - 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., - 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.], - dtype=float32) - """ - - dtype = np.dtype(dtype) - - filters = [] - if self._filters: - filters.extend(self._filters) - filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype)) - - return self.view(filters=filters, dtype=dtype, read_only=True) diff --git a/src/zarr/v2/creation.py b/src/zarr/v2/creation.py deleted file mode 100644 index d0ba00603d..0000000000 --- a/src/zarr/v2/creation.py +++ /dev/null @@ -1,707 +0,0 @@ -from typing import Optional -from warnings import warn - -import numpy as np -from numcodecs.registry import codec_registry - -from zarr.v2.core import Array -from zarr.v2.errors import ( - ArrayNotFoundError, - ContainsArrayError, - ContainsGroupError, -) -from zarr.v2.storage import ( - contains_array, - contains_group, - default_compressor, - init_array, - normalize_storage_path, - normalize_store_arg, -) -from zarr.v2.util import normalize_dimension_separator - - -def create( - shape, - chunks=True, - dtype=None, - compressor="default", - 
fill_value: Optional[int] = 0,
- order="C",
- store=None,
- synchronizer=None,
- overwrite=False,
- path=None,
- chunk_store=None,
- filters=None,
- cache_metadata=True,
- cache_attrs=True,
- read_only=False,
- object_codec=None,
- dimension_separator=None,
- write_empty_chunks=True,
- *,
- meta_array=None,
- **kwargs,
-):
- """Create an array.
-
- Parameters
- ----------
- shape : int or tuple of ints
- Array shape.
- chunks : int or tuple of ints, optional
- Chunk shape. If True, will be guessed from `shape` and `dtype`. If
- False, will be set to `shape`, i.e., single chunk for the whole array.
- If an int, the chunk size in each dimension will be given by the value
- of `chunks`. Default is True.
- dtype : string or dtype, optional
- NumPy dtype.
- compressor : Codec, optional
- Primary compressor.
- fill_value : object
- Default value to use for uninitialized portions of the array.
- order : {'C', 'F'}, optional
- Memory layout to be used within each chunk.
- store : MutableMapping or string
- Store or path to directory in file system or name of zip file.
- synchronizer : object, optional
- Array synchronizer.
- overwrite : bool, optional
- If True, delete all pre-existing data in `store` at `path` before
- creating the array.
- path : string, optional
- Path under which array is stored.
- chunk_store : MutableMapping, optional
- Separate storage for chunks. If not provided, `store` will be used
- for storage of both chunks and metadata.
- filters : sequence of Codecs, optional
- Sequence of filters to use to encode chunk data prior to compression.
- cache_metadata : bool, optional
- If True, array configuration metadata will be cached for the
- lifetime of the object. If False, array metadata will be reloaded
- prior to all data access and modification operations (may incur
- overhead depending on storage and data access pattern).
- cache_attrs : bool, optional
- If True (default), user attributes will be cached for attribute read
- operations. If False, user attributes are reloaded from the store prior
- to all attribute read operations.
- read_only : bool, optional
- True if array should be protected against modification.
- object_codec : Codec, optional
- A codec to encode object arrays, only needed if dtype=object.
- dimension_separator : {'.', '/'}, optional
- Separator placed between the dimensions of a chunk.
-
- .. versionadded:: 2.8
-
- write_empty_chunks : bool, optional
- If True (default), all chunks will be stored regardless of their
- contents. If False, each chunk is compared to the array's fill value
- prior to storing. If a chunk is uniformly equal to the fill value, then
- that chunk is not stored, and the store entry for that chunk's key
- is deleted. This setting enables sparser storage, as only chunks with
- non-fill-value data are stored, at the expense of overhead associated
- with checking the data of each chunk.
-
- .. versionadded:: 2.11
-
- meta_array : array-like, optional
- An array instance to use for determining arrays to create and return
- to users. Uses `numpy.empty(())` by default.
-
- .. 
versionadded:: 2.13
-
- Returns
- -------
- z : zarr.v2.core.Array
-
- Examples
- --------
-
- Create an array with default settings::
-
- >>> import zarr
- >>> z = zarr.v2.create((10000, 10000), chunks=(1000, 1000))
- >>> z
- <zarr.v2.core.Array (10000, 10000) float64>
-
- Create an array with some different configuration options::
-
- >>> from numcodecs import Blosc
- >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.BITSHUFFLE)
- >>> z = zarr.v2.create((10000, 10000), chunks=(1000, 1000), dtype='i1', order='F',
- ... compressor=compressor)
- >>> z
- <zarr.v2.core.Array (10000, 10000) int8>
-
- Creating an array with object dtype requires a filter that can handle Python object
- encoding, e.g., `MsgPack` or `Pickle` from `numcodecs`::
-
- >>> from numcodecs import MsgPack
- >>> z = zarr.v2.create((10000, 10000), chunks=(1000, 1000), dtype=object,
- ... object_codec=MsgPack())
- >>> z
- <zarr.v2.core.Array (10000, 10000) object>
-
- Example with some filters, and also storing chunks separately from metadata::
-
- >>> from numcodecs import Quantize, Adler32
- >>> store, chunk_store = dict(), dict()
- >>> z = zarr.v2.create((10000, 10000), chunks=(1000, 1000), dtype='f8',
- ... filters=[Quantize(digits=2, dtype='f8'), Adler32()],
- ... store=store, chunk_store=chunk_store)
- >>> z
- <zarr.v2.core.Array (10000, 10000) float64>
-
- """
-
- # handle polymorphic store arg
- store = normalize_store_arg(store, mode="w")
-
- # API compatibility with h5py
- compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
-
- # optional array metadata
- if dimension_separator is None:
- dimension_separator = getattr(store, "_dimension_separator", None)
- else:
- store_separator = getattr(store, "_dimension_separator", None)
- if store_separator not in (None, dimension_separator):
- raise ValueError(
- f"Specified dimension_separator: {dimension_separator} "
- f"conflicts with store's separator: "
- f"{store_separator}"
- )
- dimension_separator = normalize_dimension_separator(dimension_separator)
-
- # initialize array metadata
- init_array(
- store,
- shape=shape,
- chunks=chunks,
- dtype=dtype,
- compressor=compressor,
- fill_value=fill_value,
- order=order,
- overwrite=overwrite,
- path=path,
- chunk_store=chunk_store,
- filters=filters,
- object_codec=object_codec,
- dimension_separator=dimension_separator,
- )
-
- # instantiate array
- z = Array(
- store,
- path=path,
- chunk_store=chunk_store,
- synchronizer=synchronizer,
- cache_metadata=cache_metadata,
- cache_attrs=cache_attrs,
- read_only=read_only,
- write_empty_chunks=write_empty_chunks,
- meta_array=meta_array,
- )
-
- return z
-
-
-def _kwargs_compat(compressor, fill_value, kwargs):
- # to be compatible with h5py, as well as backwards-compatible with Zarr
- # 1.x, accept 'compression' and 'compression_opts' keyword arguments
-
- if compressor != "default":
- # 'compressor' overrides 'compression'
- if "compression" in kwargs:
- warn(
- "'compression' keyword argument overridden by 'compressor'",
- stacklevel=3,
- )
- del kwargs["compression"]
- if "compression_opts" in kwargs:
- warn(
- "'compression_opts' keyword argument overridden by 'compressor'",
- stacklevel=3,
- )
- del kwargs["compression_opts"]
-
- elif "compression" in kwargs:
- compression = kwargs.pop("compression")
- compression_opts = kwargs.pop("compression_opts", None)
-
- if compression is None or compression == "none":
- compressor = None
-
- elif compression == "default":
- compressor = default_compressor
-
- elif isinstance(compression, str):
- codec_cls = codec_registry[compression]
-
- # handle compression_opts
- if isinstance(compression_opts, dict):
- compressor = codec_cls(**compression_opts)
- elif 
isinstance(compression_opts, (list, tuple)):
- compressor = codec_cls(*compression_opts)
- elif compression_opts is None:
- compressor = codec_cls()
- else:
- # assume single argument, e.g., int
- compressor = codec_cls(compression_opts)
-
- # be lenient here if user gives compressor as 'compression'
- elif hasattr(compression, "get_config"):
- compressor = compression
-
- else:
- raise ValueError("bad value for compression: %r" % compression)
-
- # handle 'fillvalue'
- if "fillvalue" in kwargs:
- # to be compatible with h5py, accept 'fillvalue' instead of
- # 'fill_value'
- fill_value = kwargs.pop("fillvalue")
-
- # ignore other keyword arguments
- for k in kwargs:
- warn("ignoring keyword argument %r" % k)
-
- return compressor, fill_value
-
-
-def empty(shape, **kwargs):
- """Create an empty array.
-
- For parameter definitions see :func:`zarr.v2.creation.create`.
-
- Notes
- -----
- The contents of an empty Zarr array are not defined. On attempting to
- retrieve data from an empty Zarr array, any values may be returned,
- and these are not guaranteed to be stable from one access to the next.
-
- """
- return create(shape=shape, fill_value=None, **kwargs)
-
-
-def zeros(shape, **kwargs):
- """Create an array, with zero being used as the default value for
- uninitialized portions of the array.
-
- For parameter definitions see :func:`zarr.v2.creation.create`.
-
- Examples
- --------
- >>> import zarr
- >>> z = zarr.v2.zeros((10000, 10000), chunks=(1000, 1000))
- >>> z
- <zarr.v2.core.Array (10000, 10000) float64>
- >>> z[:2, :2]
- array([[0., 0.],
- [0., 0.]])
-
- """
-
- return create(shape=shape, fill_value=0, **kwargs)
-
-
-def ones(shape, **kwargs):
- """Create an array, with one being used as the default value for
- uninitialized portions of the array.
-
- For parameter definitions see :func:`zarr.v2.creation.create`.
-
- Examples
- --------
- >>> import zarr
- >>> z = zarr.v2.ones((10000, 10000), chunks=(1000, 1000))
- >>> z
- <zarr.v2.core.Array (10000, 10000) float64>
- >>> z[:2, :2]
- array([[1., 1.],
- [1., 1.]])
-
- """
-
- return create(shape=shape, fill_value=1, **kwargs)
-
-
-def full(shape, fill_value, **kwargs):
- """Create an array, with `fill_value` being used as the default value for
- uninitialized portions of the array.
-
- For parameter definitions see :func:`zarr.v2.creation.create`.
-
- Examples
- --------
- >>> import zarr
- >>> z = zarr.v2.full((10000, 10000), chunks=(1000, 1000), fill_value=42)
- >>> z
- <zarr.v2.core.Array (10000, 10000) float64>
- >>> z[:2, :2]
- array([[42., 42.],
- [42., 42.]])
-
- """
-
- return create(shape=shape, fill_value=fill_value, **kwargs)
-
-
-def _get_shape_chunks(a):
- shape = None
- chunks = None
-
- if hasattr(a, "shape") and isinstance(a.shape, tuple):
- shape = a.shape
-
- if hasattr(a, "chunks") and isinstance(a.chunks, tuple) and (len(a.chunks) == len(a.shape)):
- chunks = a.chunks
-
- elif hasattr(a, "chunklen"):
- # bcolz carray
- chunks = (a.chunklen,) + a.shape[1:]
-
- return shape, chunks
-
-
-def array(data, **kwargs):
- """Create an array filled with `data`.
-
- The `data` argument should be a NumPy array or array-like object. For
- other parameter definitions see :func:`zarr.v2.creation.create`. 
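[Editor's note, not part of the diff: the `_kwargs_compat` shim above is easiest to follow with a concrete call. The sketch below assumes a classic zarr 2.x install (where this module lived at the top level rather than under the `zarr.v2` namespace being deleted here) and numcodecs; the equality check relies on numcodecs codecs comparing by configuration.

```python
# Sketch: h5py-style 'compression'/'compression_opts' map onto 'compressor'.
import zarr
from numcodecs import Zlib

# h5py-style keywords are translated by the _kwargs_compat shim...
a = zarr.create((100,), chunks=10, compression="zlib", compression_opts=5)
# ...into the equivalent explicit 'compressor' argument:
b = zarr.create((100,), chunks=10, compressor=Zlib(level=5))
assert a.compressor == b.compressor  # both Zlib(level=5)
```
]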
-
- Examples
- --------
- >>> import numpy as np
- >>> import zarr
- >>> a = np.arange(100000000).reshape(10000, 10000)
- >>> z = zarr.v2.array(a, chunks=(1000, 1000))
- >>> z
- <zarr.v2.core.Array (10000, 10000) int64>
-
- """
-
- # ensure data is array-like
- if not hasattr(data, "shape") or not hasattr(data, "dtype"):
- data = np.asanyarray(data)
-
- # setup dtype
- kw_dtype = kwargs.get("dtype")
- if kw_dtype is None:
- kwargs["dtype"] = data.dtype
- else:
- kwargs["dtype"] = kw_dtype
-
- # setup shape and chunks
- data_shape, data_chunks = _get_shape_chunks(data)
- kwargs["shape"] = data_shape
- kw_chunks = kwargs.get("chunks")
- if kw_chunks is None:
- kwargs["chunks"] = data_chunks
- else:
- kwargs["chunks"] = kw_chunks
-
- # pop read-only to apply after storing the data
- read_only = kwargs.pop("read_only", False)
-
- # instantiate array
- z = create(**kwargs)
-
- # fill with data
- z[...] = data
-
- # set read_only property afterwards
- z.read_only = read_only
-
- return z
-
-
-def open_array(
- store=None,
- mode="a",
- shape=None,
- chunks=True,
- dtype=None,
- compressor="default",
- fill_value=0,
- order="C",
- synchronizer=None,
- filters=None,
- cache_metadata=True,
- cache_attrs=True,
- path=None,
- object_codec=None,
- chunk_store=None,
- storage_options=None,
- partial_decompress=False,
- write_empty_chunks=True,
- *,
- dimension_separator=None,
- meta_array=None,
- **kwargs,
-):
- """Open an array using file-mode-like semantics.
-
- Parameters
- ----------
- store : MutableMapping or string, optional
- Store or path to directory in file system or name of zip file.
- mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
- Persistence mode: 'r' means read only (must exist); 'r+' means
- read/write (must exist); 'a' means read/write (create if doesn't
- exist); 'w' means create (overwrite if exists); 'w-' means create
- (fail if exists).
- shape : int or tuple of ints, optional
- Array shape.
- chunks : int or tuple of ints, optional
- Chunk shape. If True, will be guessed from `shape` and `dtype`. If
- False, will be set to `shape`, i.e., single chunk for the whole array.
- If an int, the chunk size in each dimension will be given by the value
- of `chunks`. Default is True.
- dtype : string or dtype, optional
- NumPy dtype.
- compressor : Codec, optional
- Primary compressor.
- fill_value : object, optional
- Default value to use for uninitialized portions of the array.
- order : {'C', 'F'}, optional
- Memory layout to be used within each chunk.
- synchronizer : object, optional
- Array synchronizer.
- filters : sequence, optional
- Sequence of filters to use to encode chunk data prior to compression.
- cache_metadata : bool, optional
- If True, array configuration metadata will be cached for the
- lifetime of the object. If False, array metadata will be reloaded
- prior to all data access and modification operations (may incur
- overhead depending on storage and data access pattern).
- cache_attrs : bool, optional
- If True (default), user attributes will be cached for attribute read
- operations. If False, user attributes are reloaded from the store prior
- to all attribute read operations.
- path : string, optional
- Array path within store.
- object_codec : Codec, optional
- A codec to encode object arrays, only needed if dtype=object.
- chunk_store : MutableMapping or string, optional
- Store or path to directory in file system or name of zip file.
- storage_options : dict
- If using an fsspec URL to create the store, these will be passed to
- the backend implementation. Ignored otherwise. 
- partial_decompress : bool, optional
- If True, and the chunk_store is an FSStore with Blosc as the
- compressor, chunks will be partially read and decompressed where
- possible when getting data from the array.
- write_empty_chunks : bool, optional
- If True (default), all chunks will be stored regardless of their
- contents. If False, each chunk is compared to the array's fill value
- prior to storing. If a chunk is uniformly equal to the fill value, then
- that chunk is not stored, and the store entry for that chunk's key
- is deleted. This setting enables sparser storage, as only chunks with
- non-fill-value data are stored, at the expense of overhead associated
- with checking the data of each chunk.
-
- .. versionadded:: 2.11
-
- dimension_separator : {None, '.', '/'}, optional
- Can be used to specify whether the array is in a flat ('.') or nested
- ('/') format. If None, the appropriate value will be read from `store`
- when present. Otherwise, defaults to '.'.
- meta_array : array-like, optional
- An array instance to use for determining arrays to create and return
- to users. Uses `numpy.empty(())` by default.
-
- .. versionadded:: 2.15
-
- Returns
- -------
- z : zarr.v2.core.Array
-
- Examples
- --------
- >>> import numpy as np
- >>> import zarr
- >>> z1 = zarr.v2.open_array('data/example.zarr', mode='w', shape=(10000, 10000),
- ... chunks=(1000, 1000), fill_value=0)
- >>> z1[:] = np.arange(100000000).reshape(10000, 10000)
- >>> z1
- <zarr.v2.core.Array (10000, 10000) float64>
- >>> z2 = zarr.v2.open_array('data/example.zarr', mode='r')
- >>> z2
- <zarr.v2.core.Array (10000, 10000) float64> read-only
- >>> np.all(z1[:] == z2[:])
- True
-
- Notes
- -----
- There is no need to close an array. Data are automatically flushed to the
- file system.
-
- """
-
- # use same mode semantics as h5py
- # r : read only, must exist
- # r+ : read/write, must exist
- # w : create, delete if exists
- # w- or x : create, fail if exists
- # a : read/write if exists, create otherwise (default)
-
- # handle polymorphic store arg
- store = normalize_store_arg(store, storage_options=storage_options, mode=mode)
-
- if chunk_store is not None:
- chunk_store = normalize_store_arg(chunk_store, storage_options=storage_options, mode=mode)
-
- # respect the dimension separator specified in a store, if present
- if dimension_separator is None:
- if hasattr(store, "_dimension_separator"):
- dimension_separator = store._dimension_separator
- else:
- dimension_separator = "." 
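[Editor's note, not part of the diff: the mode handling implemented in the branches that follow mirrors h5py and the builtin `open()`. A minimal sketch of the user-visible semantics, assuming a zarr 2.x install with the module at the top level:

```python
# Sketch of open_array's file-mode-like semantics (zarr 2.x API assumed).
import zarr
from zarr.errors import ContainsArrayError

z = zarr.open_array("data/demo.zarr", mode="w", shape=(100,), chunks=10)
z[:] = 42

r = zarr.open_array("data/demo.zarr", mode="r")   # must exist; read-only
assert r.read_only

try:
    zarr.open_array("data/demo.zarr", mode="w-")  # create, fail if exists
except ContainsArrayError:
    pass  # raised before any metadata is (re)initialized
```
]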
- - path = normalize_storage_path(path) - - # API compatibility with h5py - compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) - - # ensure fill_value of correct type - if fill_value is not None: - fill_value = np.array(fill_value, dtype=dtype)[()] - - # ensure store is initialized - - if mode in ["r", "r+"]: - if not contains_array(store, path=path): - if contains_group(store, path=path): - raise ContainsGroupError(path) - raise ArrayNotFoundError(path) - - elif mode == "w": - init_array( - store, - shape=shape, - chunks=chunks, - dtype=dtype, - compressor=compressor, - fill_value=fill_value, - order=order, - filters=filters, - overwrite=True, - path=path, - object_codec=object_codec, - chunk_store=chunk_store, - dimension_separator=dimension_separator, - ) - - elif mode == "a": - if not contains_array(store, path=path): - if contains_group(store, path=path): - raise ContainsGroupError(path) - init_array( - store, - shape=shape, - chunks=chunks, - dtype=dtype, - compressor=compressor, - fill_value=fill_value, - order=order, - filters=filters, - path=path, - object_codec=object_codec, - chunk_store=chunk_store, - dimension_separator=dimension_separator, - ) - - elif mode in ["w-", "x"]: - if contains_group(store, path=path): - raise ContainsGroupError(path) - elif contains_array(store, path=path): - raise ContainsArrayError(path) - else: - init_array( - store, - shape=shape, - chunks=chunks, - dtype=dtype, - compressor=compressor, - fill_value=fill_value, - order=order, - filters=filters, - path=path, - object_codec=object_codec, - chunk_store=chunk_store, - dimension_separator=dimension_separator, - ) - - # determine read only status - read_only = mode == "r" - - # instantiate array - z = Array( - store, - read_only=read_only, - synchronizer=synchronizer, - cache_metadata=cache_metadata, - cache_attrs=cache_attrs, - path=path, - chunk_store=chunk_store, - write_empty_chunks=write_empty_chunks, - meta_array=meta_array, - ) - - return z - - -def _like_args(a, kwargs): - shape, chunks = _get_shape_chunks(a) - if shape is not None: - kwargs.setdefault("shape", shape) - if chunks is not None: - kwargs.setdefault("chunks", chunks) - - if hasattr(a, "dtype"): - kwargs.setdefault("dtype", a.dtype) - - if isinstance(a, Array): - kwargs.setdefault("compressor", a.compressor) - kwargs.setdefault("order", a.order) - kwargs.setdefault("filters", a.filters) - else: - kwargs.setdefault("compressor", "default") - kwargs.setdefault("order", "C") - - -def empty_like(a, **kwargs): - """Create an empty array like `a`.""" - _like_args(a, kwargs) - return empty(**kwargs) - - -def zeros_like(a, **kwargs): - """Create an array of zeros like `a`.""" - _like_args(a, kwargs) - return zeros(**kwargs) - - -def ones_like(a, **kwargs): - """Create an array of ones like `a`.""" - _like_args(a, kwargs) - return ones(**kwargs) - - -def full_like(a, **kwargs): - """Create a filled array like `a`.""" - _like_args(a, kwargs) - if isinstance(a, Array): - kwargs.setdefault("fill_value", a.fill_value) - return full(**kwargs) - - -def open_like(a, path, **kwargs): - """Open a persistent array like `a`.""" - _like_args(a, kwargs) - if isinstance(a, Array): - kwargs.setdefault("fill_value", a.fill_value) - return open_array(path, **kwargs) diff --git a/src/zarr/v2/errors.py b/src/zarr/v2/errors.py deleted file mode 100644 index 30c9b13d39..0000000000 --- a/src/zarr/v2/errors.py +++ /dev/null @@ -1,80 +0,0 @@ -class MetadataError(Exception): - pass - - -class CopyError(RuntimeError): - pass - - -class 
_BaseZarrError(ValueError):
- _msg = ""
-
- def __init__(self, *args):
- super().__init__(self._msg.format(*args))
-
-
-class ArrayIndexError(IndexError):
- pass
-
-
-class _BaseZarrIndexError(IndexError):
- _msg = ""
-
- def __init__(self, *args):
- super().__init__(self._msg.format(*args))
-
-
-class ContainsGroupError(_BaseZarrError):
- _msg = "path {0!r} contains a group"
-
-
-class ContainsArrayError(_BaseZarrError):
- _msg = "path {0!r} contains an array"
-
-
-class ArrayNotFoundError(_BaseZarrError):
- _msg = "array not found at path {0!r}"
-
-
-class GroupNotFoundError(_BaseZarrError):
- _msg = "group not found at path {0!r}"
-
-
-class PathNotFoundError(_BaseZarrError):
- _msg = "nothing found at path {0!r}"
-
-
-class BadCompressorError(_BaseZarrError):
- _msg = "bad compressor; expected Codec object, found {0!r}"
-
-
-class FSPathExistNotDir(GroupNotFoundError):
- _msg = "path exists but is not a directory: {0!r}"
-
-
-class ReadOnlyError(PermissionError):
- def __init__(self):
- super().__init__("object is read-only")
-
-
-class BoundsCheckError(_BaseZarrIndexError):
- _msg = "index out of bounds for dimension with length {0}"
-
-
-class NegativeStepError(IndexError):
- def __init__(self):
- super().__init__("only slices with step >= 1 are supported")
-
-
-def err_too_many_indices(selection, shape):
- raise IndexError(
- "too many indices for array; expected {}, got {}".format(len(shape), len(selection))
- )
-
-
-class VindexInvalidSelectionError(_BaseZarrIndexError):
- _msg = (
- "unsupported selection type for vectorized indexing; only "
- "coordinate selection (tuple of integer arrays) and mask selection "
- "(single Boolean array) are supported; got {0!r}"
- )
diff --git a/src/zarr/v2/hierarchy.py b/src/zarr/v2/hierarchy.py deleted file mode 100644 index 25e47311b6..0000000000 --- a/src/zarr/v2/hierarchy.py +++ /dev/null @@ -1,1401 +0,0 @@
-from collections.abc import MutableMapping
-from itertools import islice
-from typing import Any
-
-import numpy as np
-
-from zarr.v2.attrs import Attributes
-from zarr.v2.core import Array
-from zarr.v2.creation import (
- array,
- create,
- empty,
- empty_like,
- full,
- full_like,
- ones,
- ones_like,
- zeros,
- zeros_like,
-)
-from zarr.v2.errors import (
- ContainsArrayError,
- ContainsGroupError,
- GroupNotFoundError,
- ReadOnlyError,
-)
-from zarr.v2.storage import (
- _prefix_to_group_key,
- BaseStore,
- MemoryStore,
- group_meta_key,
- attrs_key,
- contains_array,
- contains_group,
- init_group,
- listdir,
- normalize_store_arg,
- rename,
- rmdir,
-)
-
-from zarr.v2.util import (
- InfoReporter,
- TreeViewer,
- is_valid_python_name,
- nolock,
- normalize_shape,
- normalize_storage_path,
-)
-
-
-class Group(MutableMapping[str, Any]):
- """Instantiate a group from an initialized store.
-
- Parameters
- ----------
- store : MutableMapping
- Group store, already initialized.
- If the Group is used in a context manager, and the store has a ``close`` method,
- it will be called on exit.
- path : string, optional
- Group path.
- read_only : bool, optional
- True if group should be protected against modification.
- chunk_store : MutableMapping, optional
- Separate storage for chunks. If not provided, `store` will be used
- for storage of both chunks and metadata.
- cache_attrs : bool, optional
- If True (default), user attributes will be cached for attribute read
- operations. If False, user attributes are reloaded from the store prior
- to all attribute read operations.
- synchronizer : object, optional
- Array synchronizer. 
- - meta_array : array-like, optional - An array instance to use for determining arrays to create and return - to users. Use `numpy.empty(())` by default. - - .. versionadded:: 2.13 - - Attributes - ---------- - store - path - name - read_only - chunk_store - synchronizer - attrs - info - meta_array - - Methods - ------- - __len__ - __iter__ - __contains__ - __getitem__ - __enter__ - __exit__ - group_keys - groups - array_keys - arrays - visit - visitkeys - visitvalues - visititems - tree - create_group - require_group - create_groups - require_groups - create_dataset - require_dataset - create - empty - zeros - ones - full - array - empty_like - zeros_like - ones_like - full_like - info - move - - """ - - def __init__( - self, - store, - path=None, - read_only=False, - chunk_store=None, - cache_attrs=True, - synchronizer=None, - *, - meta_array=None, - ): - store: BaseStore = _normalize_store_arg(store) - if chunk_store is not None: - chunk_store: BaseStore = _normalize_store_arg(chunk_store) - self._store = store - self._chunk_store = chunk_store - self._path = normalize_storage_path(path) - if self._path: - self._key_prefix = self._path + "/" - else: - self._key_prefix = "" - self._read_only = read_only - self._synchronizer = synchronizer - if meta_array is not None: - self._meta_array = np.empty_like(meta_array, shape=()) - else: - self._meta_array = np.empty(()) - - # guard conditions - if contains_array(store, path=self._path): - raise ContainsArrayError(path) - - # initialize metadata - mkey = None - try: - mkey = _prefix_to_group_key(self._store, self._key_prefix) - assert not mkey.endswith("root/.group") - meta_bytes = store[mkey] - except KeyError: - raise GroupNotFoundError(path) - else: - self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) - - # setup attributes - akey = self._key_prefix + attrs_key - - self._attrs = Attributes( - store, key=akey, read_only=read_only, cache=cache_attrs, synchronizer=synchronizer - ) - - # setup info - self._info = InfoReporter(self) - - @property - def store(self): - """A MutableMapping providing the underlying storage for the group.""" - return self._store - - @property - def path(self): - """Storage path.""" - return self._path - - @property - def name(self): - """Group name following h5py convention.""" - if self._path: - # follow h5py convention: add leading slash - name = self._path - if name[0] != "/": - name = "/" + name - return name - return "/" - - @property - def basename(self): - """Final component of name.""" - return self.name.split("/")[-1] - - @property - def read_only(self): - """A boolean, True if modification operations are not permitted.""" - return self._read_only - - @property - def chunk_store(self): - """A MutableMapping providing the underlying storage for array chunks.""" - if self._chunk_store is None: - return self._store - else: - return self._chunk_store - - @property - def synchronizer(self): - """Object used to synchronize write access to groups and arrays.""" - return self._synchronizer - - @property - def attrs(self): - """A MutableMapping containing user-defined attributes. Note that - attribute values must be JSON serializable.""" - return self._attrs - - @property - def info(self): - """Return diagnostic information about the group.""" - return self._info - - @property - def meta_array(self): - """An array-like instance to use for determining arrays to create and return - to users. 
- """ - return self._meta_array - - def __eq__(self, other): - return ( - isinstance(other, Group) - and self._store == other.store - and self._read_only == other.read_only - and self._path == other.path - # N.B., no need to compare attributes, should be covered by - # store comparison - ) - - def __iter__(self): - """Return an iterator over group member names. - - Examples - -------- - >>> import zarr - >>> g1 = zarr.v2.group() - >>> g2 = g1.create_group('foo') - >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) - >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) - >>> for name in g1: - ... print(name) - bar - baz - foo - quux - - """ - - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_array(self._store, path) or contains_group(self._store, path): - yield key - - def __len__(self): - """Number of members.""" - return sum(1 for _ in self) - - def __repr__(self): - t = type(self) - r = "<{}.{}".format(t.__module__, t.__name__) - if self.name: - r += " %r" % self.name - if self._read_only: - r += " read-only" - r += ">" - return r - - def __enter__(self): - """Return the Group for use as a context manager.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Call the close method of the underlying Store.""" - self.store.close() - - def info_items(self): - def typestr(o): - return "{}.{}".format(type(o).__module__, type(o).__name__) - - items = [] - - # basic info - if self.name is not None: - items += [("Name", self.name)] - items += [ - ("Type", typestr(self)), - ("Read-only", str(self.read_only)), - ] - - # synchronizer - if self._synchronizer is not None: - items += [("Synchronizer type", typestr(self._synchronizer))] - - # storage info - items += [("Store type", typestr(self._store))] - if self._chunk_store is not None: - items += [("Chunk store type", typestr(self._chunk_store))] - - # members - items += [("No. members", len(self))] - array_keys = sorted(self.array_keys()) - group_keys = sorted(self.group_keys()) - items += [("No. arrays", len(array_keys))] - items += [("No. groups", len(group_keys))] - if array_keys: - items += [("Arrays", ", ".join(array_keys))] - if group_keys: - items += [("Groups", ", ".join(group_keys))] - - return items - - def __getstate__(self): - return { - "store": self._store, - "path": self._path, - "read_only": self._read_only, - "chunk_store": self._chunk_store, - "cache_attrs": self._attrs.cache, - "synchronizer": self._synchronizer, - "meta_array": self._meta_array, - } - - def __setstate__(self, state): - self.__init__(**state) - - def _item_path(self, item): - absolute = isinstance(item, str) and item and item[0] == "/" - path = normalize_storage_path(item) - if not absolute and self._path: - path = self._key_prefix + path - return path - - def __contains__(self, item): - """Test for group membership. - - Examples - -------- - >>> import zarr - >>> g1 = zarr.v2.group() - >>> g2 = g1.create_group('foo') - >>> d1 = g1.create_dataset('bar', shape=100, chunks=10) - >>> 'foo' in g1 - True - >>> 'bar' in g1 - True - >>> 'baz' in g1 - False - - """ - path = self._item_path(item) - return contains_array(self._store, path) or contains_group( - self._store, path, explicit_only=False - ) - - def __getitem__(self, item): - """Obtain a group member. - - Parameters - ---------- - item : string - Member name or path. 
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> d1 = g1.create_dataset('foo/bar/baz', shape=100, chunks=10)
- >>> g1['foo']
- <zarr.v2.hierarchy.Group '/foo'>
- >>> g1['foo/bar']
- <zarr.v2.hierarchy.Group '/foo/bar'>
- >>> g1['foo/bar/baz']
- <zarr.v2.core.Array '/foo/bar/baz' (100,) float64>
-
- """
- path = self._item_path(item)
- if contains_array(self._store, path):
- return Array(
- self._store,
- read_only=self._read_only,
- path=path,
- chunk_store=self._chunk_store,
- synchronizer=self._synchronizer,
- cache_attrs=self.attrs.cache,
- meta_array=self._meta_array,
- )
- elif contains_group(self._store, path, explicit_only=True):
- return Group(
- self._store,
- read_only=self._read_only,
- path=path,
- chunk_store=self._chunk_store,
- cache_attrs=self.attrs.cache,
- synchronizer=self._synchronizer,
- meta_array=self._meta_array,
- )
- else:
- raise KeyError(item)
-
- def __setitem__(self, item, value):
- self.array(item, value, overwrite=True)
-
- def __delitem__(self, item):
- return self._write_op(self._delitem_nosync, item)
-
- def _delitem_nosync(self, item):
- path = self._item_path(item)
- if contains_array(self._store, path) or contains_group(
- self._store, path, explicit_only=False
- ):
- rmdir(self._store, path)
- else:
- raise KeyError(item)
-
- def __getattr__(self, item):
- # allow access to group members via dot notation
- try:
- return self.__getitem__(item)
- except KeyError:
- raise AttributeError
-
- def __dir__(self):
- # noinspection PyUnresolvedReferences
- base = super().__dir__()
- keys = sorted(set(base + list(self)))
- keys = [k for k in keys if is_valid_python_name(k)]
- return keys
-
- def _ipython_key_completions_(self):
- return sorted(self)
-
- def group_keys(self):
- """Return an iterator over member names for groups only.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> d1 = g1.create_dataset('baz', shape=100, chunks=10)
- >>> d2 = g1.create_dataset('quux', shape=200, chunks=20)
- >>> sorted(g1.group_keys())
- ['bar', 'foo']
-
- """
-
- for key in sorted(listdir(self._store, self._path)):
- path = self._key_prefix + key
- if contains_group(self._store, path):
- yield key
-
- def groups(self):
- """Return an iterator over (name, value) pairs for groups only.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> d1 = g1.create_dataset('baz', shape=100, chunks=10)
- >>> d2 = g1.create_dataset('quux', shape=200, chunks=20)
- >>> for n, v in g1.groups():
- ... print(n, type(v))
- bar <class 'zarr.v2.hierarchy.Group'>
- foo <class 'zarr.v2.hierarchy.Group'>
-
- """
-
- for key in sorted(listdir(self._store, self._path)):
- path = self._key_prefix + key
- if contains_group(self._store, path, explicit_only=False):
- yield (
- key,
- Group(
- self._store,
- path=path,
- read_only=self._read_only,
- chunk_store=self._chunk_store,
- cache_attrs=self.attrs.cache,
- synchronizer=self._synchronizer,
- ),
- )
-
- def array_keys(self, recurse=False):
- """Return an iterator over member names for arrays only.
-
- Parameters
- ----------
- recurse : bool, optional
- Option to return member names for all arrays, even from groups
- below the current one. If False, only member names for arrays in
- the current group will be returned. Default value is False. 
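[Editor's note, not part of the diff: a quick sketch of the `recurse` option just described, assuming zarr 2.x. Note that `_array_iter` (below) yields member names, not full paths, when recursing:

```python
# Sketch of array_keys(recurse=...) semantics (zarr 2.x API assumed).
import zarr

root = zarr.group()
root.create_dataset("a", shape=(10,), chunks=5)
root.create_group("g").create_dataset("b", shape=(10,), chunks=5)

sorted(root.array_keys())              # ['a']
sorted(root.array_keys(recurse=True))  # ['a', 'b']  (names, not paths)
```
]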
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> d1 = g1.create_dataset('baz', shape=100, chunks=10)
- >>> d2 = g1.create_dataset('quux', shape=200, chunks=20)
- >>> sorted(g1.array_keys())
- ['baz', 'quux']
-
- """
- return self._array_iter(keys_only=True, method="array_keys", recurse=recurse)
-
- def arrays(self, recurse=False):
- """Return an iterator over (name, value) pairs for arrays only.
-
- Parameters
- ----------
- recurse : bool, optional
- Option to return (name, value) pairs for all arrays, even from groups
- below the current one. If False, only (name, value) pairs for arrays in
- the current group will be returned. Default value is False.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> d1 = g1.create_dataset('baz', shape=100, chunks=10)
- >>> d2 = g1.create_dataset('quux', shape=200, chunks=20)
- >>> for n, v in g1.arrays():
- ... print(n, type(v))
- baz <class 'zarr.v2.core.Array'>
- quux <class 'zarr.v2.core.Array'>
-
- """
- return self._array_iter(keys_only=False, method="arrays", recurse=recurse)
-
- def _array_iter(self, keys_only, method, recurse):
- for key in sorted(listdir(self._store, self._path)):
- path = self._key_prefix + key
- if contains_array(self._store, path):
- _key = key.rstrip("/")
- yield _key if keys_only else (_key, self[key])
- elif recurse and contains_group(self._store, path):
- group = self[key]
- yield from getattr(group, method)(recurse=recurse)
-
- def visitvalues(self, func):
- """Run ``func`` on each object.
-
- Note: If ``func`` returns ``None`` (or doesn't return),
- iteration continues. However, if ``func`` returns
- anything else, it ceases and returns that value.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> g4 = g3.create_group('baz')
- >>> g5 = g3.create_group('quux')
- >>> def print_visitor(obj):
- ... print(obj)
- >>> g1.visitvalues(print_visitor)
- <zarr.v2.hierarchy.Group '/bar'>
- <zarr.v2.hierarchy.Group '/bar/baz'>
- <zarr.v2.hierarchy.Group '/bar/quux'>
- <zarr.v2.hierarchy.Group '/foo'>
- >>> g3.visitvalues(print_visitor)
- <zarr.v2.hierarchy.Group '/bar/baz'>
- <zarr.v2.hierarchy.Group '/bar/quux'>
-
- """
-
- def _visit(obj):
- yield obj
- keys = sorted(getattr(obj, "keys", lambda: [])())
- for k in keys:
- yield from _visit(obj[k])
-
- for each_obj in islice(_visit(self), 1, None):
- value = func(each_obj)
- if value is not None:
- return value
-
- def visit(self, func):
- """Run ``func`` on each object's path.
-
- Note: If ``func`` returns ``None`` (or doesn't return),
- iteration continues. However, if ``func`` returns
- anything else, it ceases and returns that value.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> g4 = g3.create_group('baz')
- >>> g5 = g3.create_group('quux')
- >>> def print_visitor(name):
- ... print(name)
- >>> g1.visit(print_visitor)
- bar
- bar/baz
- bar/quux
- foo
- >>> g3.visit(print_visitor)
- baz
- quux
-
- A search for members matching a name query can be implemented using
- ``visit``, as in the ``find`` and ``findall`` examples below. Consider
- the following tree::
-
- /
- ├── aaa
- │ └── bbb
- │ └── ccc
- │ └── aaa
- ├── bar
- └── foo
-
- It is created as follows:
-
- >>> root = zarr.v2.group()
- >>> foo = root.create_group("foo")
- >>> bar = root.create_group("bar")
- >>> root.create_group("aaa").create_group("bbb").create_group("ccc").create_group("aaa")
- <zarr.v2.hierarchy.Group '/aaa/bbb/ccc/aaa'>
-
- For ``find``, the first path that matches a given pattern (for example
- "aaa") is returned. 
Note that a non-None value is returned in the visit
- function to stop further iteration.
-
- >>> import re
- >>> pattern = re.compile("aaa")
- >>> found = None
- >>> def find(path):
- ... global found
- ... if pattern.search(path) is not None:
- ... found = path
- ... return True
- ...
- >>> root.visit(find)
- True
- >>> print(found)
- aaa
-
- For ``findall``, all the results are gathered into a list
-
- >>> pattern = re.compile("aaa")
- >>> found = []
- >>> def findall(path):
- ... if pattern.search(path) is not None:
- ... found.append(path)
- ...
- >>> root.visit(findall)
- >>> print(found)
- ['aaa', 'aaa/bbb', 'aaa/bbb/ccc', 'aaa/bbb/ccc/aaa']
-
- To match only on the last part of the path, use a greedy regex to filter
- out the prefix:
-
- >>> prefix_pattern = re.compile(r".*/")
- >>> pattern = re.compile("aaa")
- >>> found = []
- >>> def findall(path):
- ... match = prefix_pattern.match(path)
- ... if match is None:
- ... name = path
- ... else:
- ... _, end = match.span()
- ... name = path[end:]
- ... if pattern.search(name) is not None:
- ... found.append(path)
- ... return None
- ...
- >>> root.visit(findall)
- >>> print(found)
- ['aaa', 'aaa/bbb/ccc/aaa']
- """
-
- base_len = len(self.name)
- return self.visitvalues(lambda o: func(o.name[base_len:].lstrip("/")))
-
- def visitkeys(self, func):
- """An alias for :py:meth:`~Group.visit`."""
-
- return self.visit(func)
-
- def visititems(self, func):
- """Run ``func`` on each object's path and the object itself.
-
- Note: If ``func`` returns ``None`` (or doesn't return),
- iteration continues. However, if ``func`` returns
- anything else, it ceases and returns that value.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> g4 = g3.create_group('baz')
- >>> g5 = g3.create_group('quux')
- >>> def print_visitor(name, obj):
- ... print((name, obj))
- >>> g1.visititems(print_visitor)
- ('bar', <zarr.v2.hierarchy.Group '/bar'>)
- ('bar/baz', <zarr.v2.hierarchy.Group '/bar/baz'>)
- ('bar/quux', <zarr.v2.hierarchy.Group '/bar/quux'>)
- ('foo', <zarr.v2.hierarchy.Group '/foo'>)
- >>> g3.visititems(print_visitor)
- ('baz', <zarr.v2.hierarchy.Group '/bar/baz'>)
- ('quux', <zarr.v2.hierarchy.Group '/bar/quux'>)
-
- """
-
- base_len = len(self.name)
- return self.visitvalues(lambda o: func(o.name[base_len:].lstrip("/"), o))
-
- def tree(self, expand=False, level=None):
- """Provide a ``print``-able display of the hierarchy.
-
- Parameters
- ----------
- expand : bool, optional
- Only relevant for HTML representation. If True, tree will be fully expanded.
- level : int, optional
- Maximum depth to descend into hierarchy.
-
- Examples
- --------
- >>> import zarr
- >>> g1 = zarr.v2.group()
- >>> g2 = g1.create_group('foo')
- >>> g3 = g1.create_group('bar')
- >>> g4 = g3.create_group('baz')
- >>> g5 = g3.create_group('quux')
- >>> d1 = g5.create_dataset('baz', shape=100, chunks=10)
- >>> g1.tree()
- /
- ├── bar
- │ ├── baz
- │ └── quux
- │ └── baz (100,) float64
- └── foo
- >>> g1.tree(level=2)
- /
- ├── bar
- │ ├── baz
- │ └── quux
- └── foo
- >>> g3.tree()
- bar
- ├── baz
- └── quux
- └── baz (100,) float64
-
- Notes
- -----
- Please note that this is an experimental feature. The behaviour of this
- function is still evolving and the default output and/or parameters may change
- in future versions. 
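[Editor's note, not part of the diff: the mutating group methods that follow all funnel through `_write_op`, which takes a lock from the synchronizer (when one is configured) before touching the store. A sketch, assuming zarr 2.x:

```python
# Sketch of write synchronization on a group (zarr 2.x API assumed).
import zarr

sync = zarr.ThreadSynchronizer()
g = zarr.group(synchronizer=sync)

g.create_group("foo")  # _write_op locks on the group metadata key
z = g.zeros("counts", shape=(100,), chunks=10, dtype="i8")
z[:10] = 1             # array writes lock per chunk key via the synchronizer
```
]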
- - """ - - return TreeViewer(self, expand=expand, level=level) - - def _write_op(self, f, *args, **kwargs): - # guard condition - if self._read_only: - raise ReadOnlyError - - if self._synchronizer is None: - # no synchronization - lock = nolock - else: - # synchronize on the root group - lock = self._synchronizer[group_meta_key] - - with lock: - return f(*args, **kwargs) - - def create_group(self, name, overwrite=False): - """Create a sub-group. - - Parameters - ---------- - name : string - Group name. - overwrite : bool, optional - If True, overwrite any existing array with the given name. - - Returns - ------- - g : zarr.v2.hierarchy.Group - - Examples - -------- - >>> import zarr - >>> g1 = zarr.v2.group() - >>> g2 = g1.create_group('foo') - >>> g3 = g1.create_group('bar') - >>> g4 = g1.create_group('baz/quux') - - """ - - return self._write_op(self._create_group_nosync, name, overwrite=overwrite) - - def _create_group_nosync(self, name, overwrite=False): - path = self._item_path(name) - - # create terminal group - init_group(self._store, path=path, chunk_store=self._chunk_store, overwrite=overwrite) - - return Group( - self._store, - path=path, - read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer, - ) - - def create_groups(self, *names, **kwargs): - """Convenience method to create multiple groups in a single call.""" - return tuple(self.create_group(name, **kwargs) for name in names) - - def require_group(self, name, overwrite=False): - """Obtain a sub-group, creating one if it doesn't exist. - - Parameters - ---------- - name : string - Group name. - overwrite : bool, optional - Overwrite any existing array with given `name` if present. - - Returns - ------- - g : zarr.v2.hierarchy.Group - - Examples - -------- - >>> import zarr - >>> g1 = zarr.v2.group() - >>> g2 = g1.require_group('foo') - >>> g3 = g1.require_group('foo') - >>> g2 == g3 - True - - """ - - return self._write_op(self._require_group_nosync, name, overwrite=overwrite) - - def _require_group_nosync(self, name, overwrite=False): - path = self._item_path(name) - - # create terminal group if necessary - if not contains_group(self._store, path): - init_group( - store=self._store, path=path, chunk_store=self._chunk_store, overwrite=overwrite - ) - - return Group( - self._store, - path=path, - read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer, - ) - - def require_groups(self, *names): - """Convenience method to require multiple groups in a single call.""" - return tuple(self.require_group(name) for name in names) - - # noinspection PyIncorrectDocstring - def create_dataset(self, name, **kwargs): - """Create an array. - - Arrays are known as "datasets" in HDF5 terminology. For compatibility - with h5py, Zarr groups also implement the require_dataset() method. - - Parameters - ---------- - name : string - Array name. - data : array-like, optional - Initial data. - shape : int or tuple of ints - Array shape. - chunks : int or tuple of ints, optional - Chunk shape. If not provided, will be guessed from `shape` and - `dtype`. - dtype : string or dtype, optional - NumPy dtype. - compressor : Codec, optional - Primary compressor. - fill_value : object - Default value to use for uninitialized portions of the array. - order : {'C', 'F'}, optional - Memory layout to be used within each chunk. - synchronizer : zarr.v2.sync.ArraySynchronizer, optional - Array synchronizer. 
- filters : sequence of Codecs, optional - Sequence of filters to use to encode chunk data prior to - compression. - overwrite : bool, optional - If True, replace any existing array or group with the given name. - cache_metadata : bool, optional - If True, array configuration metadata will be cached for the - lifetime of the object. If False, array metadata will be reloaded - prior to all data access and modification operations (may incur - overhead depending on storage and data access pattern). - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - Returns - ------- - a : zarr.v2.core.Array - - Examples - -------- - >>> import zarr - >>> g1 = zarr.v2.group() - >>> d1 = g1.create_dataset('foo', shape=(10000, 10000), - ... chunks=(1000, 1000)) - >>> d1 - <zarr.v2.core.Array '/foo' (10000, 10000) float64> - >>> d2 = g1.create_dataset('bar/baz/qux', shape=(100, 100, 100), - ... chunks=(100, 10, 10)) - >>> d2 - <zarr.v2.core.Array '/bar/baz/qux' (100, 100, 100) float64> - - """ - assert "mode" not in kwargs - - return self._write_op(self._create_dataset_nosync, name, **kwargs) - - def _create_dataset_nosync(self, name, data=None, **kwargs): - assert "mode" not in kwargs - path = self._item_path(name) - - # determine synchronizer - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - - # create array - if data is None: - a = create(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - else: - a = array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - return a - - def require_dataset(self, name, shape, dtype=None, exact=False, **kwargs): - """Obtain an array, creating it if it doesn't exist. - - Arrays are known as "datasets" in HDF5 terminology. For compatibility - with h5py, Zarr groups also implement the create_dataset() method. - - Other `kwargs` are as per :func:`zarr.v2.hierarchy.Group.create_dataset`. - - Parameters - ---------- - name : string - Array name. - shape : int or tuple of ints - Array shape. - dtype : string or dtype, optional - NumPy dtype. - exact : bool, optional - If True, require `dtype` to match exactly. If False, require only - that `dtype` can be safely cast to the existing array's dtype. - - """ - - return self._write_op( - self._require_dataset_nosync, name, shape=shape, dtype=dtype, exact=exact, **kwargs - ) - - def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, **kwargs): - path = self._item_path(name) - - if contains_array(self._store, path): - # array already exists at path, validate that it is the right shape and type - - synchronizer = kwargs.get("synchronizer", self._synchronizer) - cache_metadata = kwargs.get("cache_metadata", True) - cache_attrs = kwargs.get("cache_attrs", self.attrs.cache) - a = Array( - self._store, - path=path, - read_only=self._read_only, - chunk_store=self._chunk_store, - synchronizer=synchronizer, - cache_metadata=cache_metadata, - cache_attrs=cache_attrs, - meta_array=self._meta_array, - ) - shape = normalize_shape(shape) - if shape != a.shape: - raise TypeError( - "shape does not match existing array; expected {}, got {}".format(a.shape, shape) - ) - dtype = np.dtype(dtype) - if exact: - if dtype != a.dtype: - raise TypeError( - "dtypes do not match exactly; expected {}, got {}".format(a.dtype, dtype) - ) - else: - if not np.can_cast(dtype, a.dtype): - raise TypeError("dtypes ({}, {}) cannot be safely cast".format(dtype, a.dtype)) - return a - - else: - return self._create_dataset_nosync(name, shape=shape, dtype=dtype, **kwargs) - - def create(self, name, **kwargs): - """Create an array.
Keyword arguments as per - :func:`zarr.v2.creation.create`.""" - return self._write_op(self._create_nosync, name, **kwargs) - - def _create_nosync(self, name, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return create(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - def empty(self, name, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.empty`.""" - return self._write_op(self._empty_nosync, name, **kwargs) - - def _empty_nosync(self, name, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return empty(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - def zeros(self, name, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.zeros`.""" - return self._write_op(self._zeros_nosync, name, **kwargs) - - def _zeros_nosync(self, name, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return zeros(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - def ones(self, name, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.ones`.""" - return self._write_op(self._ones_nosync, name, **kwargs) - - def _ones_nosync(self, name, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return ones(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - def full(self, name, fill_value, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.full`.""" - return self._write_op(self._full_nosync, name, fill_value, **kwargs) - - def _full_nosync(self, name, fill_value, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return full( - store=self._store, - path=path, - chunk_store=self._chunk_store, - fill_value=fill_value, - **kwargs, - ) - - def array(self, name, data, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.array`.""" - return self._write_op(self._array_nosync, name, data, **kwargs) - - def _array_nosync(self, name, data, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) - - def empty_like(self, name, data, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.empty_like`.""" - return self._write_op(self._empty_like_nosync, name, data, **kwargs) - - def _empty_like_nosync(self, name, data, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return empty_like( - data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs - ) - - def zeros_like(self, name, data, **kwargs): - """Create an array. 
Keyword arguments as per - :func:`zarr.v2.creation.zeros_like`.""" - return self._write_op(self._zeros_like_nosync, name, data, **kwargs) - - def _zeros_like_nosync(self, name, data, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return zeros_like( - data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs - ) - - def ones_like(self, name, data, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.ones_like`.""" - return self._write_op(self._ones_like_nosync, name, data, **kwargs) - - def _ones_like_nosync(self, name, data, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return ones_like( - data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs - ) - - def full_like(self, name, data, **kwargs): - """Create an array. Keyword arguments as per - :func:`zarr.v2.creation.full_like`.""" - return self._write_op(self._full_like_nosync, name, data, **kwargs) - - def _full_like_nosync(self, name, data, **kwargs): - path = self._item_path(name) - kwargs.setdefault("synchronizer", self._synchronizer) - kwargs.setdefault("cache_attrs", self.attrs.cache) - return full_like( - data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs - ) - - def _move_nosync(self, path, new_path): - rename(self._store, path, new_path) - if self._chunk_store is not None: - rename(self._chunk_store, path, new_path) - - def move(self, source, dest): - """Move contents from one path to another relative to the Group. - - Parameters - ---------- - source : string - Name or path to a Zarr object to move. - dest : string - New name or path of the Zarr object. - """ - - source = self._item_path(source) - dest = self._item_path(dest) - - # Check that source exists. - if not ( - contains_array(self._store, source) - or contains_group(self._store, source, explicit_only=False) - ): - raise ValueError(f'The source, "{source}", does not exist.') - if contains_array(self._store, dest) or contains_group( - self._store, dest, explicit_only=False - ): - raise ValueError(f'The dest, "{dest}", already exists.') - - # Ensure groups needed for `dest` exist. - if "/" in dest: - self.require_group("/" + dest.rsplit("/", 1)[0]) - - self._write_op(self._move_nosync, source, dest) - - -def _normalize_store_arg(store, *, storage_options=None, mode="r"): - if store is None: - return MemoryStore() - return normalize_store_arg(store, storage_options=storage_options, mode=mode) - - -def group( - store=None, - overwrite=False, - chunk_store=None, - cache_attrs=True, - synchronizer=None, - path=None, - *, - meta_array=None, -): - """Create a group. - - Parameters - ---------- - store : MutableMapping or string, optional - Store or path to directory in file system. - overwrite : bool, optional - If True, delete any pre-existing data in `store` at `path` before - creating the group. - chunk_store : MutableMapping, optional - Separate storage for chunks. If not provided, `store` will be used - for storage of both chunks and metadata. - cache_attrs : bool, optional - If True (default), user attributes will be cached for attribute read - operations. If False, user attributes are reloaded from the store prior - to all attribute read operations. - synchronizer : object, optional - Array synchronizer. - path : string, optional - Group path within store. 
- meta_array : array-like, optional - An array instance to use for determining arrays to create and return - to users. Uses `numpy.empty(())` by default. - - .. versionadded:: 2.16.1 - - Returns - ------- - g : zarr.v2.hierarchy.Group - - Examples - -------- - Create a group in memory:: - - >>> import zarr - >>> g = zarr.v2.group() - >>> g - <zarr.v2.hierarchy.Group '/'> - - Create a group with a different store:: - - >>> store = zarr.v2.DirectoryStore('data/example.zarr') - >>> g = zarr.v2.group(store=store, overwrite=True) - >>> g - <zarr.v2.hierarchy.Group '/'> - - """ - - # handle polymorphic store arg - store = _normalize_store_arg(store, mode="w") - - path = normalize_storage_path(path) - - requires_init = overwrite or not contains_group(store) - - if requires_init: - init_group(store, overwrite=overwrite, chunk_store=chunk_store, path=path) - - return Group( - store, - read_only=False, - chunk_store=chunk_store, - cache_attrs=cache_attrs, - synchronizer=synchronizer, - path=path, - meta_array=meta_array, - ) - - -def open_group( - store=None, - mode="a", - cache_attrs=True, - synchronizer=None, - path=None, - chunk_store=None, - storage_options=None, - *, - meta_array=None, -): - """Open a group using file-mode-like semantics. - - Parameters - ---------- - store : MutableMapping or string, optional - Store or path to directory in file system or name of zip file. - mode : {'r', 'r+', 'a', 'w', 'w-'}, optional - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist); 'a' means read/write (create if doesn't - exist); 'w' means create (overwrite if exists); 'w-' means create - (fail if exists). - cache_attrs : bool, optional - If True (default), user attributes will be cached for attribute read - operations. If False, user attributes are reloaded from the store prior - to all attribute read operations. - synchronizer : object, optional - Array synchronizer. - path : string, optional - Group path within store. - chunk_store : MutableMapping or string, optional - Store or path to directory in file system or name of zip file. - storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. - meta_array : array-like, optional - An array instance to use for determining arrays to create and return - to users. Uses `numpy.empty(())` by default. - - ..
versionadded:: 2.13 - - Returns - ------- - g : zarr.v2.hierarchy.Group - - Examples - -------- - >>> import zarr - >>> root = zarr.v2.open_group('data/example.zarr', mode='w') - >>> foo = root.create_group('foo') - >>> bar = root.create_group('bar') - >>> root - <zarr.v2.hierarchy.Group '/'> - >>> root2 = zarr.v2.open_group('data/example.zarr', mode='a') - >>> root2 - <zarr.v2.hierarchy.Group '/'> - >>> root == root2 - True - - """ - - # handle polymorphic store arg - store = _normalize_store_arg(store, storage_options=storage_options, mode=mode) - - if chunk_store is not None: - chunk_store = _normalize_store_arg(chunk_store, storage_options=storage_options, mode=mode) - - path = normalize_storage_path(path) - - # ensure store is initialized - - if mode in ["r", "r+"]: - if not contains_group(store, path=path): - if contains_array(store, path=path): - raise ContainsArrayError(path) - raise GroupNotFoundError(path) - - elif mode == "w": - init_group(store, overwrite=True, path=path, chunk_store=chunk_store) - - elif mode == "a": - if not contains_group(store, path=path): - if contains_array(store, path=path): - raise ContainsArrayError(path) - init_group(store, path=path, chunk_store=chunk_store) - - elif mode in ["w-", "x"]: - if contains_array(store, path=path): - raise ContainsArrayError(path) - elif contains_group(store, path=path): - raise ContainsGroupError(path) - else: - init_group(store, path=path, chunk_store=chunk_store) - - # determine read only status - read_only = mode == "r" - - return Group( - store, - read_only=read_only, - cache_attrs=cache_attrs, - synchronizer=synchronizer, - path=path, - chunk_store=chunk_store, - meta_array=meta_array, - ) diff --git a/src/zarr/v2/indexing.py b/src/zarr/v2/indexing.py deleted file mode 100644 index 880baf3f72..0000000000 --- a/src/zarr/v2/indexing.py +++ /dev/null @@ -1,1074 +0,0 @@ -import collections -import itertools -import math -import numbers - -import numpy as np - - -from zarr.v2.errors import ( - ArrayIndexError, - NegativeStepError, - err_too_many_indices, - VindexInvalidSelectionError, - BoundsCheckError, -) - - -def is_integer(x): - """True if x is an integer (either pure Python or NumPy). - - Note that Python's bool is considered an integer too. - """ - return isinstance(x, numbers.Integral) - - -def is_integer_list(x): - """True if x is a list of integers. - - This function assumes, i.e. *does not check*, that all elements of the list - have the same type. Mixed type lists will result in other errors that will - bubble up anyway. - """ - return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) - - -def is_integer_array(x, ndim=None): - t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" - if ndim is not None: - t = t and len(x.shape) == ndim - return t - - -def is_bool_array(x, ndim=None): - t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool - if ndim is not None: - t = t and len(x.shape) == ndim - return t - - -def is_scalar(value, dtype): - if np.isscalar(value): - return True - if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): - return True - return False - - -def is_pure_fancy_indexing(selection, ndim): - """Check whether a selection contains only scalars or integer array-likes. - - Parameters - ---------- - selection : tuple, slice, or scalar - A valid selection value for indexing into arrays. - - Returns - ------- - is_pure : bool - True if the selection is a pure fancy indexing expression (i.e. not mixed - with booleans or slices).
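A rough illustration of this predicate (a sketch assuming the ``zarr.v2.indexing`` module from this diff is importable):

```python
import numpy as np
from zarr.v2.indexing import is_pure_fancy_indexing

# Pure fancy indexing: integers and integer array-likes only, with at
# least one array-like and no slices or Ellipsis mixed in.
print(is_pure_fancy_indexing(np.array([0, 2, 4]), ndim=1))    # True
print(is_pure_fancy_indexing((0, [1, 2]), ndim=2))            # True
print(is_pure_fancy_indexing((slice(None), [1, 2]), ndim=2))  # False: slice present
```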
- """ - if ndim == 1: - if is_integer_list(selection) or is_integer_array(selection): - return True - # if not, we go through the normal path below, because a 1-tuple - # of integers is also allowed. - no_slicing = ( - isinstance(selection, tuple) - and len(selection) == ndim - and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) - ) - return ( - no_slicing - and all( - is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) - for elem in selection - ) - and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) - ) - - -def is_pure_orthogonal_indexing(selection, ndim): - if not ndim: - return False - - # Case 1: Selection is a single iterable of integers - if is_integer_list(selection) or is_integer_array(selection, ndim=1): - return True - - # Case two: selection contains either zero or one integer iterables. - # All other selection elements are slices or integers - return ( - isinstance(selection, tuple) - and len(selection) == ndim - and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 - and all( - is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, (int, slice)) - for elem in selection - ) - ) - - -def normalize_integer_selection(dim_sel, dim_len): - # normalize type to int - dim_sel = int(dim_sel) - - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel - - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise BoundsCheckError(dim_len) - - return dim_sel - - -ChunkDimProjection = collections.namedtuple( - "ChunkDimProjection", ("dim_chunk_ix", "dim_chunk_sel", "dim_out_sel") -) -"""A mapping from chunk to output array for a single dimension. - -Parameters ----------- -dim_chunk_ix - Index of chunk. -dim_chunk_sel - Selection of items from chunk array. -dim_out_sel - Selection of items in target (output) array. 
- -""" - - -class IntDimIndexer: - def __init__(self, dim_sel, dim_len, dim_chunk_len): - # normalize - dim_sel = normalize_integer_selection(dim_sel, dim_len) - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = 1 - - def __iter__(self): - dim_chunk_ix = self.dim_sel // self.dim_chunk_len - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel - dim_offset - dim_out_sel = None - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def ceildiv(a, b): - return math.ceil(a / b) - - -class SliceDimIndexer: - def __init__(self, dim_sel, dim_len, dim_chunk_len): - # normalize - self.start, self.stop, self.step = dim_sel.indices(dim_len) - if self.step < 1: - raise NegativeStepError - - # store attributes - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = max(0, ceildiv((self.stop - self.start), self.step)) - self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) - - def __iter__(self): - # figure out the range of chunks we need to visit - dim_chunk_ix_from = self.start // self.dim_chunk_len - dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) - - # iterate over chunks in range - for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - # compute offsets for chunk within overall array - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) - - # determine chunk length, accounting for trailing chunk - dim_chunk_len = dim_limit - dim_offset - - if self.start < dim_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - remainder = (dim_offset - self.start) % self.step - if remainder: - dim_chunk_sel_start += self.step - remainder - # compute number of previous items, provides offset into output array - dim_out_offset = ceildiv((dim_offset - self.start), self.step) - - else: - # selection starts within current chunk - dim_chunk_sel_start = self.start - dim_offset - dim_out_offset = 0 - - if self.stop > dim_limit: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = self.stop - dim_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) - - # If there are no elements on the selection within this chunk, then skip - if dim_chunk_nitems == 0: - continue - - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def check_selection_length(selection, shape): - if len(selection) > len(shape): - err_too_many_indices(selection, shape) - - -def replace_ellipsis(selection, shape): - selection = ensure_tuple(selection) - - # count number of ellipsis present - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - - if n_ellipsis > 1: - # more than 1 is an error - raise IndexError("an index can only have a single ellipsis ('...')") - - elif n_ellipsis == 1: - # locate the ellipsis, count how many items to left and right - n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - - if n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - - else: - # replace 
ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += (slice(None),) * (len(shape) - len(selection)) - - # check selection not too long - check_selection_length(selection, shape) - - return selection - - -def replace_lists(selection): - return tuple( - np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection - ) - - -def ensure_tuple(v): - if not isinstance(v, tuple): - v = (v,) - return v - - -ChunkProjection = collections.namedtuple( - "ChunkProjection", ("chunk_coords", "chunk_selection", "out_selection") -) -"""A mapping of items from chunk to output array. Can be used to extract items from the -chunk array for loading into an output array. Can also be used to extract items from a -value array for setting/updating in a chunk array. - -Parameters ----------- -chunk_coords - Indices of chunk. -chunk_selection - Selection of items from chunk array. -out_selection - Selection of items in target (output) array. - -""" - - -def is_slice(s): - return isinstance(s, slice) - - -def is_contiguous_slice(s): - return is_slice(s) and (s.step is None or s.step == 1) - - -def is_positive_slice(s): - return is_slice(s) and (s.step is None or s.step >= 1) - - -def is_contiguous_selection(selection): - selection = ensure_tuple(selection) - return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) - - -def is_basic_selection(selection): - selection = ensure_tuple(selection) - return all(is_integer(s) or is_positive_slice(s) for s in selection) - - -# noinspection PyProtectedMember -class BasicIndexer: - def __init__(self, selection, array): - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_slice(dim_sel): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError( - "unsupported selection item for basic indexing; " - "expected integer or slice, got {!r}".format(type(dim_sel)) - ) - - dim_indexers.append(dim_indexer) - - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.drop_axes = () - - def __iter__(self): - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -class BoolArrayDimIndexer: - def __init__(self, dim_sel, dim_len, dim_chunk_len): - # check number of dimensions - if not is_bool_array(dim_sel, 1): - raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") - - # check shape - if dim_sel.shape[0] != dim_len: - raise IndexError( - "Boolean array has the wrong length for dimension; expected {}, got {}".format( - dim_len, dim_sel.shape[0] - ) - ) - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len 
- self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) - - # precompute number of selected items for each chunk - self.chunk_nitems = np.zeros(self.nchunks, dtype="i8") - for dim_chunk_ix in range(self.nchunks): - dim_offset = dim_chunk_ix * self.dim_chunk_len - self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( - self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] - ) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = self.chunk_nitems_cumsum[-1] - self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - - def __iter__(self): - # iterate over chunks with at least one item - for dim_chunk_ix in self.dim_chunk_ixs: - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] - - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) - tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel - dim_chunk_sel = tmp - - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - dim_out_sel = slice(start, stop) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -class Order: - UNKNOWN = 0 - INCREASING = 1 - DECREASING = 2 - UNORDERED = 3 - - @staticmethod - def check(a): - diff = np.diff(a) - diff_positive = diff >= 0 - n_diff_positive = np.count_nonzero(diff_positive) - all_increasing = n_diff_positive == len(diff_positive) - any_increasing = n_diff_positive > 0 - if all_increasing: - order = Order.INCREASING - elif any_increasing: - order = Order.UNORDERED - else: - order = Order.DECREASING - return order - - -def wraparound_indices(x, dim_len): - loc_neg = x < 0 - if np.any(loc_neg): - x[loc_neg] = x[loc_neg] + dim_len - - -def boundscheck_indices(x, dim_len): - if np.any(x < 0) or np.any(x >= dim_len): - raise BoundsCheckError(dim_len) - - -class IntArrayDimIndexer: - """Integer array selection against a single dimension.""" - - def __init__( - self, - dim_sel, - dim_len, - dim_chunk_len, - wraparound=True, - boundscheck=True, - order=Order.UNKNOWN, - ): - # ensure 1d array - dim_sel = np.asanyarray(dim_sel) - if not is_integer_array(dim_sel, 1): - raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") - - # handle wraparound - if wraparound: - wraparound_indices(dim_sel, dim_len) - - # handle out of bounds - if boundscheck: - boundscheck_indices(dim_sel, dim_len) - - # store attributes - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) - self.nitems = len(dim_sel) - - # determine which chunk is needed for each selection item - # note: for dense integer selections, the division operation here is the - # bottleneck - dim_sel_chunk = dim_sel // dim_chunk_len - - # determine order of indices - if order == Order.UNKNOWN: - order = Order.check(dim_sel) - self.order = order - - if self.order == Order.INCREASING: - self.dim_sel = dim_sel - self.dim_out_sel = None - elif self.order == Order.DECREASING: - self.dim_sel = dim_sel[::-1] - # TODO should be possible to do this without creating an arange - self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) - else: - # sort indices to group by chunk - self.dim_out_sel = np.argsort(dim_sel_chunk) - self.dim_sel = np.take(dim_sel, self.dim_out_sel) - - # precompute number of selected items for each chunk - self.chunk_nitems = 
np.bincount(dim_sel_chunk, minlength=self.nchunks) - - # find chunks that we need to visit - self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - - # compute offsets into the output array - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - - def __iter__(self): - for dim_chunk_ix in self.dim_chunk_ixs: - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - if self.order == Order.INCREASING: - dim_out_sel = slice(start, stop) - else: - dim_out_sel = self.dim_out_sel[start:stop] - - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[start:stop] - dim_offset - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def slice_to_range(s: slice, l: int): # noqa: E741 - return range(*s.indices(l)) - - -def ix_(selection, shape): - """Convert an orthogonal selection to a numpy advanced (fancy) selection, like ``numpy.ix_`` - but with support for slices and single ints.""" - - # normalisation - selection = replace_ellipsis(selection, shape) - - # replace slice and int as these are not supported by numpy.ix_ - selection = [ - slice_to_range(dim_sel, dim_len) - if isinstance(dim_sel, slice) - else [dim_sel] - if is_integer(dim_sel) - else dim_sel - for dim_sel, dim_len in zip(selection, shape) - ] - - # now get numpy to convert to a coordinate selection - selection = np.ix_(*selection) - - return selection - - -def oindex(a, selection): - """Implementation of orthogonal indexing with slices and ints.""" - selection = replace_ellipsis(selection, a.shape) - drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) - selection = ix_(selection, a.shape) - result = a[selection] - if drop_axes: - result = result.squeeze(axis=drop_axes) - return result - - -def oindex_set(a, selection, value): - selection = replace_ellipsis(selection, a.shape) - drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) - selection = ix_(selection, a.shape) - if not np.isscalar(value) and drop_axes: - value = np.asanyarray(value) - value_selection = [slice(None)] * len(a.shape) - for i in drop_axes: - value_selection[i] = np.newaxis - value_selection = tuple(value_selection) - value = value[value_selection] - a[selection] = value - - -# noinspection PyProtectedMember -class OrthogonalIndexer: - def __init__(self, selection, array): - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # normalize list to array - selection = replace_lists(selection) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError( - "unsupported selection item for orthogonal indexing; " - "expected integer, slice, integer array or Boolean " - "array, got {!r}".format(type(dim_sel)) - ) - - dim_indexers.append(dim_indexer) - - self.array = array - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.is_advanced = not 
is_basic_selection(selection) - if self.is_advanced: - self.drop_axes = tuple( - i - for i, dim_indexer in enumerate(self.dim_indexers) - if isinstance(dim_indexer, IntDimIndexer) - ) - else: - self.drop_axes = () - - def __iter__(self): - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - # handle advanced indexing arrays orthogonally - if self.is_advanced: - # N.B., numpy doesn't support orthogonal indexing directly as yet, - # so need to work around via np.ix_. Also np.ix_ does not support a - # mixture of arrays and slices or integers, so need to convert slices - # and integers into ranges. - chunk_selection = ix_(chunk_selection, self.array._chunks) - - # special case for non-monotonic indices - if not is_basic_selection(out_selection): - out_selection = ix_(out_selection, self.shape) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -class OIndex: - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_orthogonal_selection(selection, fields=fields) - - def __setitem__(self, selection, value): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_orthogonal_selection(selection, value, fields=fields) - - -# noinspection PyProtectedMember -class BlockIndexer: - def __init__(self, selection, array): - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # normalize list to array - selection = replace_lists(selection) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_size in zip(selection, array._shape, array._chunks): - dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) - - if is_integer(dim_sel): - if dim_sel < 0: - dim_sel = dim_numchunks + dim_sel - - start = dim_sel * dim_chunk_size - stop = start + dim_chunk_size - slice_ = slice(start, stop) - - elif is_slice(dim_sel): - start = dim_sel.start if dim_sel.start is not None else 0 - stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks - - if dim_sel.step not in {1, None}: - raise IndexError( - "unsupported selection item for block indexing; " - "expected integer or slice with step=1, got {!r}".format(type(dim_sel)) - ) - - # Can't reuse wraparound_indices because it expects a numpy array - # We have integers here. 
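# e.g. with dim_numchunks = 10, dim_sel = slice(-2, None) yields start = 8
# and stop = 10, i.e. the last two chunks along this dimension, mirroring
# Python's negative indexing on the chunk grid.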
- if start < 0: - start = dim_numchunks + start - if stop < 0: - stop = dim_numchunks + stop - - start = start * dim_chunk_size - stop = stop * dim_chunk_size - slice_ = slice(start, stop) - - else: - raise IndexError( - "unsupported selection item for block indexing; " - "expected integer or slice, got {!r}".format(type(dim_sel)) - ) - - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) - dim_indexers.append(dim_indexer) - - if start >= dim_len or start < 0: - raise BoundsCheckError(dim_len) - - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers) - self.drop_axes = () - - def __iter__(self): - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -class BlockIndex: - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_block_selection(selection, fields=fields) - - def __setitem__(self, selection, value): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_block_selection(selection, value, fields=fields) - - -# noinspection PyProtectedMember -def is_coordinate_selection(selection, array): - return (len(selection) == len(array._shape)) and all( - is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection - ) - - -# noinspection PyProtectedMember -def is_mask_selection(selection, array): - return ( - len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == array._shape - ) - - -# noinspection PyProtectedMember -class CoordinateIndexer: - def __init__(self, selection, array): - # some initial normalization - selection = ensure_tuple(selection) - selection = tuple([i] if is_integer(i) else i for i in selection) - selection = replace_lists(selection) - - # validation - if not is_coordinate_selection(selection, array): - raise IndexError( - "invalid coordinate selection; expected one integer " - "(coordinate) array per dimension of the target array, " - "got {!r}".format(selection) - ) - - # handle wraparound, boundscheck - for dim_sel, dim_len in zip(selection, array.shape): - # handle wraparound - wraparound_indices(dim_sel, dim_len) - - # handle out of bounds - boundscheck_indices(dim_sel, dim_len) - - # compute chunk index for each point in the selection - chunks_multi_index = tuple( - dim_sel // dim_chunk_len for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) - ) - - # broadcast selection - this will raise error if array dimensions don't match - selection = np.broadcast_arrays(*selection) - chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) - - # remember shape of selection, because we will flatten indices for processing - self.sel_shape = selection[0].shape if selection[0].shape else (1,) - - # flatten selection - selection = [dim_sel.reshape(-1) for dim_sel in selection] - chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] - - # ravel chunk indices - chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) - - # group points by chunk - if 
np.any(np.diff(chunks_raveled_indices) < 0): - # optimisation, only sort if needed - sel_sort = np.argsort(chunks_raveled_indices) - selection = tuple(dim_sel[sel_sort] for dim_sel in selection) - else: - sel_sort = None - - # store attributes - self.selection = selection - self.sel_sort = sel_sort - self.shape = selection[0].shape if selection[0].shape else (1,) - self.drop_axes = () - self.array = array - - # precompute number of selected items for each chunk - self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - # locate the chunks we need to process - self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] - - # unravel chunk indices - self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) - - def __iter__(self): - # iterate over chunks - for i, chunk_rix in enumerate(self.chunk_rixs): - chunk_coords = tuple(m[i] for m in self.chunk_mixs) - if chunk_rix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[chunk_rix - 1] - stop = self.chunk_nitems_cumsum[chunk_rix] - if self.sel_sort is None: - out_selection = slice(start, stop) - else: - out_selection = self.sel_sort[start:stop] - - chunk_offsets = tuple( - dim_chunk_ix * dim_chunk_len - for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) - ) - chunk_selection = tuple( - dim_sel[start:stop] - dim_chunk_offset - for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -# noinspection PyProtectedMember -class MaskIndexer(CoordinateIndexer): - def __init__(self, selection, array): - # some initial normalization - selection = ensure_tuple(selection) - selection = replace_lists(selection) - - # validation - if not is_mask_selection(selection, array): - raise IndexError( - "invalid mask selection; expected one Boolean (mask)" - "array with the same shape as the target array, got {!r}".format(selection) - ) - - # convert to indices - selection = np.nonzero(selection[0]) - - # delegate the rest to superclass - super().__init__(selection, array) - - -class VIndex: - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array): - return self.array.get_coordinate_selection(selection, fields=fields) - elif is_mask_selection(selection, self.array): - return self.array.get_mask_selection(selection, fields=fields) - else: - raise VindexInvalidSelectionError(selection) - - def __setitem__(self, selection, value): - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array): - self.array.set_coordinate_selection(selection, value, fields=fields) - elif is_mask_selection(selection, self.array): - self.array.set_mask_selection(selection, value, fields=fields) - else: - raise VindexInvalidSelectionError(selection) - - -def check_fields(fields, dtype): - # early out - if fields is None: - return dtype - # check type - if not isinstance(fields, (str, list, tuple)): - raise IndexError( - "'fields' argument must be a string or list of strings; found {!r}".format(type(fields)) - ) - if fields: - if dtype.names is None: - raise IndexError("invalid 'fields' argument, array does not have any fields") - try: - if isinstance(fields, str): - # 
single field selection - out_dtype = dtype[fields] - else: - # multiple field selection - out_dtype = np.dtype([(f, dtype[f]) for f in fields]) - except KeyError as e: - raise IndexError("invalid 'fields' argument, field not found: {!r}".format(e)) - else: - return out_dtype - else: - return dtype - - -def check_no_multi_fields(fields): - if isinstance(fields, list): - if len(fields) == 1: - return fields[0] - elif len(fields) > 1: - raise IndexError("multiple fields are not supported for this operation") - return fields - - -def pop_fields(selection): - if isinstance(selection, str): - # single field selection - fields = selection - selection = () - elif not isinstance(selection, tuple): - # single selection item, no fields - fields = None - # leave selection as-is - else: - # multiple items, split fields from selection items - fields = [f for f in selection if isinstance(f, str)] - fields = fields[0] if len(fields) == 1 else fields - selection = tuple(s for s in selection if not isinstance(s, str)) - selection = selection[0] if len(selection) == 1 else selection - return fields, selection - - -def make_slice_selection(selection): - ls = [] - for dim_selection in selection: - if is_integer(dim_selection): - ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) - elif isinstance(dim_selection, np.ndarray): - if len(dim_selection) == 1: - ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) - else: - raise ArrayIndexError - else: - ls.append(dim_selection) - return ls - - -class PartialChunkIterator: - """Iterator to retrieve the specific coordinates of requested data - from within a compressed chunk. - - Parameters - ---------- - selection : tuple - tuple of slice objects to take from the chunk - arr_shape : shape of chunk to select data from - - Attributes - ---------- - arr_shape - selection - - Returns - ------- - Tuple with 3 elements: - - start: int - elements offset in the chunk to read from - nitems: int - number of elements to read in the chunk from start - partial_out_selection: list of slices - indices of a temporary empty array of size `Array._chunks` to assign - the decompressed data to after the partial read. - - Notes - ----- - An array is flattened when compressed with blosc, so this iterator takes - the wanted selection of an array and determines the wanted coordinates - of the flattened, compressed data to be read and then decompressed. The - decompressed data is then placed in a temporary empty array of size - `Array._chunks` at the indices yielded as partial_out_selection. - Once all the slices yielded by this iterator have been read, decompressed - and written to the temporary array, the wanted slice of the chunk can be - indexed from the temporary array and written to the out_selection slice - of the out array. 
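As a sketch of what the iterator yields (assuming ``zarr.v2.indexing`` is importable): reading row 1 of a C-ordered ``(2, 4)`` chunk corresponds to flat offset 4 and 4 contiguous items.

```python
from zarr.v2.indexing import PartialChunkIterator

pci = PartialChunkIterator((slice(1, 2), slice(0, 4)), (2, 4))
for start, nitems, partial_out_selection in pci:
    print(start, nitems, partial_out_selection)
# 4 4 (slice(1, 2, 1),)
```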
- - """ - - def __init__(self, selection, arr_shape): - selection = make_slice_selection(selection) - self.arr_shape = arr_shape - - # number of selection dimensions can't be greater than the number of chunk dimensions - if len(selection) > len(self.arr_shape): - raise ValueError( - "Selection has more dimensions then the array:\n" - f"selection dimensions = {len(selection)}\n" - f"array dimensions = {len(self.arr_shape)}" - ) - - # any selection can not be out of the range of the chunk - selection_shape = np.empty(self.arr_shape)[tuple(selection)].shape - if any( - selection_dim < 0 or selection_dim > arr_dim - for selection_dim, arr_dim in zip(selection_shape, self.arr_shape) - ): - raise IndexError( - "a selection index is out of range for the dimension" - ) # pragma: no cover - - for i, dim_size in enumerate(self.arr_shape[::-1]): - index = len(self.arr_shape) - (i + 1) - if index <= len(selection) - 1: - slice_size = selection_shape[index] - if slice_size == dim_size and index > 0: - selection.pop() - else: - break - - chunk_loc_slices = [] - last_dim_slice = None if selection[-1].step > 1 else selection.pop() - for arr_shape_i, sl in zip(arr_shape, selection): - dim_chunk_loc_slices = [] - assert isinstance(sl, slice) - for x in slice_to_range(sl, arr_shape_i): - dim_chunk_loc_slices.append(slice(x, x + 1, 1)) - chunk_loc_slices.append(dim_chunk_loc_slices) - if last_dim_slice: - chunk_loc_slices.append([last_dim_slice]) - self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) - - def __iter__(self): - chunk1 = self.chunk_loc_slices[0] - nitems = (chunk1[-1].stop - chunk1[-1].start) * np.prod( - self.arr_shape[len(chunk1) :], dtype=int - ) - for partial_out_selection in self.chunk_loc_slices: - start = 0 - for i, sl in enumerate(partial_out_selection): - start += sl.start * np.prod(self.arr_shape[i + 1 :], dtype=int) - yield start, nitems, partial_out_selection diff --git a/src/zarr/v2/meta.py b/src/zarr/v2/meta.py deleted file mode 100644 index 2f7ce1242e..0000000000 --- a/src/zarr/v2/meta.py +++ /dev/null @@ -1,302 +0,0 @@ -import base64 -import itertools -from collections.abc import Mapping - -import numpy as np - -from zarr.v2.errors import MetadataError -from zarr.v2.util import json_dumps, json_loads - -from typing import cast, Union, Any, List, Mapping as MappingType, TYPE_CHECKING - -if TYPE_CHECKING: # pragma: no cover - pass - - -ZARR_FORMAT = 2 - -# FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} - -_v3_core_types = {"bool", "i1", "u1"} | set("".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8"))) - -# The set of complex types allowed ({"c8", ">c16"}) -_v3_complex_types = set(f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("8", "16"))) - -# All dtype.str values corresponding to datetime64 and timedelta64 -# see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units -_date_units = ["Y", "M", "W", "D"] -_time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] -_v3_datetime_types = set( - f"{end}{kind}8[{unit}]" - for end, unit, kind in itertools.product("<>", _date_units + _time_units, ("m", "M")) -) - - -def get_extended_dtype_info(dtype) -> dict: - if dtype.str in _v3_complex_types: - return dict( - extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/complex-dtypes/v1.0.html", - type=dtype.str, - fallback=None, - ) - elif dtype.str == "|O": - return dict( - extension="TODO: object array protocol URL", - type=dtype.str, - 
fallback=None, - ) - elif dtype.str.startswith("|S"): - return dict( - extension="TODO: bytestring array protocol URL", - type=dtype.str, - fallback=None, - ) - elif dtype.str.startswith("U"): - return dict( - extension="TODO: unicode array protocol URL", - type=dtype.str, - fallback=None, - ) - elif dtype.str.startswith("|V"): - return dict( - extension="TODO: structured array protocol URL", - type=dtype.descr, - fallback=None, - ) - elif dtype.str in _v3_datetime_types: - return dict( - extension="https://zarr-specs.readthedocs.io/en/latest/extensions/data-types/datetime/v1.0.html", - type=dtype.str, - fallback=None, - ) - else: - raise ValueError(f"Unsupported dtype: {dtype}") - - -class Metadata2: - ZARR_FORMAT = ZARR_FORMAT - - @classmethod - def parse_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - # Here we allow that a store may return an already-parsed metadata object, - # or a string of JSON that we will parse here. We allow for an already-parsed - # object to accommodate a consolidated metadata store, where all the metadata for - # all groups and arrays will already have been parsed from JSON. - - if isinstance(s, Mapping): - # assume metadata has already been parsed into a mapping object - meta = s - - else: - # assume metadata needs to be parsed as JSON - meta = json_loads(s) - - return meta - - @classmethod - def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - meta = cls.parse_metadata(s) - - # check metadata format - zarr_format = meta.get("zarr_format", None) - if zarr_format != cls.ZARR_FORMAT: - raise MetadataError("unsupported zarr format: %s" % zarr_format) - - # extract array metadata fields - try: - dtype = cls.decode_dtype(meta["dtype"]) - if dtype.hasobject: - import numcodecs - - object_codec = numcodecs.get_codec(meta["filters"][0]) - else: - object_codec = None - - dimension_separator = meta.get("dimension_separator", None) - fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) - meta = dict( - zarr_format=meta["zarr_format"], - shape=tuple(meta["shape"]), - chunks=tuple(meta["chunks"]), - dtype=dtype, - compressor=meta["compressor"], - fill_value=fill_value, - order=meta["order"], - filters=meta["filters"], - ) - if dimension_separator: - meta["dimension_separator"] = dimension_separator - except Exception as e: - raise MetadataError("error decoding metadata") from e - else: - return meta - - @classmethod - def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: - dtype = meta["dtype"] - sdshape = () - if dtype.subdtype is not None: - dtype, sdshape = dtype.subdtype - - dimension_separator = meta.get("dimension_separator") - if dtype.hasobject: - import numcodecs - - object_codec = numcodecs.get_codec(meta["filters"][0]) - else: - object_codec = None - - meta = dict( - zarr_format=cls.ZARR_FORMAT, - shape=meta["shape"] + sdshape, - chunks=meta["chunks"], - dtype=cls.encode_dtype(dtype), - compressor=meta["compressor"], - fill_value=cls.encode_fill_value(meta["fill_value"], dtype, object_codec), - order=meta["order"], - filters=meta["filters"], - ) - if dimension_separator: - meta["dimension_separator"] = dimension_separator - - return json_dumps(meta) - - @classmethod - def encode_dtype(cls, d: np.dtype): - if d.fields is None: - return d.str - else: - return d.descr - - @classmethod - def _decode_dtype_descr(cls, d) -> List[Any]: - # need to convert list of lists to list of tuples - if isinstance(d, list): - # recurse to handle nested structures - d = 
[(k[0], cls._decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] - return d - - @classmethod - def decode_dtype(cls, d) -> np.dtype: - d = cls._decode_dtype_descr(d) - return np.dtype(d) - - @classmethod - def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - meta = cls.parse_metadata(s) - - # check metadata format version - zarr_format = meta.get("zarr_format", None) - if zarr_format != cls.ZARR_FORMAT: - raise MetadataError("unsupported zarr format: %s" % zarr_format) - - meta = dict(zarr_format=zarr_format) - return meta - - # N.B., keep `meta` parameter as a placeholder for future - # noinspection PyUnusedLocal - @classmethod - def encode_group_metadata(cls, meta=None) -> bytes: - meta = dict(zarr_format=cls.ZARR_FORMAT) - return json_dumps(meta) - - @classmethod - def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: - # early out - if v is None: - return v - if dtype.kind == "V" and dtype.hasobject: - if object_codec is None: - raise ValueError("missing object_codec for object array") - v = base64.standard_b64decode(v) - v = object_codec.decode(v) - v = np.array(v, dtype=dtype)[()] - return v - if dtype.kind == "f": - if v == "NaN": - return np.nan - elif v == "Infinity": - return np.inf - elif v == "-Infinity": - return -np.inf - else: - return np.array(v, dtype=dtype)[()] - elif dtype.kind == "c": - v = ( - cls.decode_fill_value(v[0], dtype.type().real.dtype), - cls.decode_fill_value(v[1], dtype.type().imag.dtype), - ) - v = v[0] + 1j * v[1] - return np.array(v, dtype=dtype)[()] - elif dtype.kind == "S": - # noinspection PyBroadException - try: - v = base64.standard_b64decode(v) - except Exception: - # be lenient, allow for other values that may have been used before base64 - # encoding and may work as fill values, e.g., the number 0 - pass - v = np.array(v, dtype=dtype)[()] - return v - elif dtype.kind == "V": - v = base64.standard_b64decode(v) - v = np.array(v, dtype=dtype.str).view(dtype)[()] - return v - elif dtype.kind == "U": - # leave as-is - return v - else: - return np.array(v, dtype=dtype)[()] - - @classmethod - def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: - # early out - if v is None: - return v - if dtype.kind == "V" and dtype.hasobject: - if object_codec is None: - raise ValueError("missing object_codec for object array") - v = object_codec.encode(v) - v = str(base64.standard_b64encode(v), "ascii") - return v - if dtype.kind == "f": - if np.isnan(v): - return "NaN" - elif np.isposinf(v): - return "Infinity" - elif np.isneginf(v): - return "-Infinity" - else: - return float(v) - elif dtype.kind in ("u", "i"): - return int(v) - elif dtype.kind == "b": - return bool(v) - elif dtype.kind == "c": - c = cast(np.complex128, np.dtype(complex).type()) - v = ( - cls.encode_fill_value(v.real, c.real.dtype, object_codec), - cls.encode_fill_value(v.imag, c.imag.dtype, object_codec), - ) - return v - elif dtype.kind in ("S", "V"): - v = str(base64.standard_b64encode(v), "ascii") - return v - elif dtype.kind == "U": - return v - elif dtype.kind in ("m", "M"): - return int(v.view("i8")) - else: - return v - - -parse_metadata = Metadata2.parse_metadata -decode_array_metadata = Metadata2.decode_array_metadata -encode_array_metadata = Metadata2.encode_array_metadata -encode_dtype = Metadata2.encode_dtype -_decode_dtype_descr = Metadata2._decode_dtype_descr -decode_dtype = Metadata2.decode_dtype -decode_group_metadata = Metadata2.decode_group_metadata -encode_group_metadata = 
Metadata2.encode_group_metadata -decode_fill_value = Metadata2.decode_fill_value -encode_fill_value = Metadata2.encode_fill_value diff --git a/src/zarr/v2/meta_v1.py b/src/zarr/v2/meta_v1.py deleted file mode 100644 index 881b9191eb..0000000000 --- a/src/zarr/v2/meta_v1.py +++ /dev/null @@ -1,64 +0,0 @@ -import json - -import numpy as np - -from zarr.v2.errors import MetadataError - - -def decode_metadata(b): - s = str(b, "ascii") - meta = json.loads(s) - zarr_format = meta.get("zarr_format", None) - if zarr_format != 1: - raise MetadataError("unsupported zarr format: %s" % zarr_format) - try: - meta = dict( - zarr_format=meta["zarr_format"], - shape=tuple(meta["shape"]), - chunks=tuple(meta["chunks"]), - dtype=decode_dtype(meta["dtype"]), - compression=meta["compression"], - compression_opts=meta["compression_opts"], - fill_value=meta["fill_value"], - order=meta["order"], - ) - except Exception as e: - raise MetadataError("error decoding metadata: %s" % e) - else: - return meta - - -def encode_metadata(meta): - meta = dict( - zarr_format=1, - shape=meta["shape"], - chunks=meta["chunks"], - dtype=encode_dtype(meta["dtype"]), - compression=meta["compression"], - compression_opts=meta["compression_opts"], - fill_value=meta["fill_value"], - order=meta["order"], - ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) - b = s.encode("ascii") - return b - - -def encode_dtype(d): - if d.fields is None: - return d.str - else: - return d.descr - - -def _decode_dtype_descr(d): - # need to convert list of lists to list of tuples - if isinstance(d, list): - # recurse to handle nested structures - d = [(f, _decode_dtype_descr(v)) for f, v in d] - return d - - -def decode_dtype(d): - d = _decode_dtype_descr(d) - return np.dtype(d) diff --git a/src/zarr/v2/n5.py b/src/zarr/v2/n5.py deleted file mode 100644 index ece110f49d..0000000000 --- a/src/zarr/v2/n5.py +++ /dev/null @@ -1,897 +0,0 @@ -"""This module contains a storage class and codec to support the N5 format.""" - -import os -import struct -import sys -from typing import Any, Dict, Optional, cast -import warnings - -import numpy as np -from numcodecs.abc import Codec -from numcodecs.compat import ndarray_copy -from numcodecs.registry import get_codec, register_codec - -from zarr.v2.meta import ZARR_FORMAT, json_dumps, json_loads -from zarr.v2.storage import FSStore -from zarr.v2.storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path -from zarr.v2.storage import array_meta_key as zarr_array_meta_key -from zarr.v2.storage import attrs_key as zarr_attrs_key -from zarr.v2.storage import group_meta_key as zarr_group_meta_key - -N5_FORMAT = "2.0.0" - -zarr_to_n5_keys = [ - ("chunks", "blockSize"), - ("dtype", "dataType"), - ("compressor", "compression"), - ("shape", "dimensions"), -] -n5_attrs_key = "attributes.json" -n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] - - -class N5Store(NestedDirectoryStore): - """Storage class using directories and files on a standard file system, - following the N5 format (https://github.com/saalfeldlab/n5). - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. 
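The heart of the store is metadata translation. A simplified sketch of the zarr-to-N5 renaming (field names from ``zarr_to_n5_keys`` above; the real conversion in ``array_metadata_to_n5`` also handles compressors and, since N5 is column-major, reverses dimension order):

```python
zarr_to_n5 = {"shape": "dimensions", "chunks": "blockSize",
              "dtype": "dataType", "compressor": "compression"}
zarr_meta = {"shape": [10, 20], "chunks": [5, 5], "dtype": "<f8"}

# rename fields and reverse dimension order for N5's column-major layout
n5_attrs = {zarr_to_n5.get(k, k): (v[::-1] if k in ("shape", "chunks") else v)
            for k, v in zarr_meta.items()}
print(n5_attrs)  # {'dimensions': [20, 10], 'blockSize': [5, 5], 'dataType': '<f8'}
```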
- - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5Store('data/array.n5') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5Store('data/group.n5') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - - This is an experimental feature. - - Safe to write in multiple threads or processes. - - """ - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs: - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # remove previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - super().__delitem__(key_new) - - def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - # array if attributes contain 'dimensions' - return "dimensions" in 
self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - return super().__contains__(key_new) - - def __eq__(self, other): - return isinstance(other, N5Store) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - path = cast(str, path) - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. - children = super().listdir(path=path) - - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and os.path.isdir(entry_path): - for dir_path, _, file_names in os.walk(entry_path): - for file_name in file_names: - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path + os.path.sep)[1] - new_child = rel_path.replace(os.path.sep, ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - return sorted(children) - - else: - return children - - def _load_n5_attrs(self, path: str) -> Dict[str, Any]: - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - if not path.endswith(n5_attrs_key): - attrs_key = os.path.join(path, n5_attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -class N5FSStore(FSStore): - """Implementation of the N5 format (https://github.com/saalfeldlab/n5) - using `fsspec`, which allows storage on a variety of filesystems. Based - on `zarr.N5Store`. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. 
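Both N5 stores implement the same key translation: every zarr metadata document (``.zgroup``, ``.zarray``, ``.zattrs``) is folded into N5's single ``attributes.json``, and arrays are distinguished from groups purely by the presence of a ``"dimensions"`` attribute. A reduced sketch of that mapping (the helper names are illustrative)::

    N5_ATTRS_KEY = "attributes.json"
    ZARR_META_KEYS = (".zgroup", ".zarray", ".zattrs")

    def to_n5_key(key: str) -> str:
        # All three zarr metadata keys collapse onto one N5 attributes file.
        for meta_key in ZARR_META_KEYS:
            if key.endswith(meta_key):
                return key[: -len(meta_key)] + N5_ATTRS_KEY
        return key

    def is_n5_array(attrs: dict) -> bool:
        # N5 has no explicit array/group marker: attributes carrying
        # "dimensions" describe a dataset, anything else is a group.
        return "dimensions" in attrs

    assert to_n5_key("foo/.zarray") == "foo/attributes.json"
    assert is_n5_array({"dimensions": [10, 10], "dataType": "uint8"})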
- - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - This is an experimental feature. - Safe to write in multiple threads or processes. - - Be advised that the `_dimension_separator` property of this store - (and arrays it creates) is ".", but chunks saved by this store will - in fact be "/" separated, as prescribed by the N5 format. - - This is counter-intuitive (to say the least), but not arbitrary. - Chunks in N5 format are stored with reversed dimension order - relative to Zarr chunks: a chunk of a 3D Zarr array would be stored - on a file system as `/0/1/2`, but in N5 the same chunk would be - stored as `/2/1/0`. Therefore, stores targeting N5 must intercept - chunk keys and flip the order of the dimensions before writing to - storage, and this procedure requires chunk keys with "." separated - dimensions, hence the Zarr arrays targeting N5 have the deceptive - "." dimension separator. - """ - - _array_meta_key = "attributes.json" - _group_meta_key = "attributes.json" - _attrs_key = "attributes.json" - - def __init__(self, *args, **kwargs): - if "dimension_separator" in kwargs: - warnings.warn("Keyword argument `dimension_separator` will be ignored") - kwargs["dimension_separator"] = "." - super().__init__(*args, **kwargs) - - @staticmethod - def _swap_separator(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - def _normalize_key(self, key: str): - if is_chunk_key(key): - key = invert_chunk_coords(key) - - key = normalize_storage_path(key).lstrip("/") - if key: - *bits, end = key.split("/") - - if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): - end = end.replace(".", "/") - key = "/".join(bits + [end]) - return key.lower() if self.normalize_keys else key - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value
= json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs.keys(): - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # replace previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - elif is_chunk_key(key): - key_new = self._swap_separator(key) - else: - key_new = key - super().__delitem__(key_new) - - def __contains__(self, key: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - return super().__contains__(key_new) - - def __eq__(self, other: Any): - return isinstance(other, N5FSStore) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. 
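The chunk-key handling that keeps recurring here (``is_chunk_key``/``invert_chunk_coords``, defined further down) is a pure string transform: if the last path segment looks like dot-separated chunk coordinates, reverse them and re-join with ``/``. A standalone sketch, assuming a ``_prog_ckey``-equivalent pattern of dot-separated integers::

    import re

    # Assumed equivalent of the module's _prog_ckey pattern.
    _ckey = re.compile(r"^(\d+)(\.\d+)+$")

    def invert(key: str) -> str:
        segments = key.split("/")
        last = segments[-1]
        if _ckey.match(last):
            # Zarr writes coordinates slowest-first with "." separators;
            # N5 wants them reversed, one path segment per dimension.
            last = "/".join(last.split(".")[::-1])
        return "/".join(segments[:-1] + [last])

    assert invert("foo/0.1.2") == "foo/2/1/0"
    assert invert("foo/.zattrs") == "foo/.zattrs"  # non-chunk keys untouched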
- children = super().listdir(path=path) - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._array_meta_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and self.fs.isdir(entry_path): - for file_name in self.fs.find(entry_path): - file_path = os.path.join(root_path, file_name) - rel_path = file_path.split(root_path)[1] - new_child = rel_path.lstrip("/").replace("/", ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._group_meta_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - return sorted(children) - else: - return children - - def _load_n5_attrs(self, path: str): - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - if not path.endswith(self._attrs_key): - attrs_key = os.path.join(path, self._attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -def is_chunk_key(key: str): - rv = False - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - rv = bool(_prog_ckey.match(last_segment)) - return rv - - -def invert_chunk_coords(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - -def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from zarr to N5 format.""" - del group_metadata["zarr_format"] - # TODO: This should only exist at the top-level - group_metadata["n5"] = N5_FORMAT - return group_metadata - - -def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from N5 to zarr format.""" - # This only exists at the top level - group_metadata.pop("n5", None) - group_metadata["zarr_format"] = ZARR_FORMAT - return group_metadata - - -def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: - """Convert array metadata from zarr to N5 format. 
If the `top_level` keyword argument is True, - then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" - - for f, t in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - del array_metadata["zarr_format"] - if top_level: - array_metadata["n5"] = N5_FORMAT - try: - dtype = np.dtype(array_metadata["dataType"]) - except TypeError: - raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") - - array_metadata["dataType"] = dtype.name - array_metadata["dimensions"] = array_metadata["dimensions"][::-1] - array_metadata["blockSize"] = array_metadata["blockSize"][::-1] - - if "fill_value" in array_metadata: - if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: - raise ValueError( - f"""Received fill_value = {array_metadata['fill_value']}, - but N5 only supports fill_value = 0""" - ) - del array_metadata["fill_value"] - - if "order" in array_metadata: - if array_metadata["order"] != "C": - raise ValueError( - f"Received order = {array_metadata['order']}, but N5 only supports order = C" - ) - del array_metadata["order"] - - if "filters" in array_metadata: - if array_metadata["filters"] != [] and array_metadata["filters"] is not None: - raise ValueError("Received filters, but N5 storage does not support zarr filters") - del array_metadata["filters"] - - assert "compression" in array_metadata - compressor_config = array_metadata["compression"] - compressor_config = compressor_config_to_n5(compressor_config) - array_metadata["compression"] = compressor_config - - if "dimension_separator" in array_metadata: - del array_metadata["dimension_separator"] - - return array_metadata - - -def array_metadata_to_zarr( - array_metadata: Dict[str, Any], top_level: bool = False -) -> Dict[str, Any]: - """Convert array metadata from N5 to zarr format. - If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" - for t, f in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - if top_level: - array_metadata.pop("n5") - array_metadata["zarr_format"] = ZARR_FORMAT - - array_metadata["shape"] = array_metadata["shape"][::-1] - array_metadata["chunks"] = array_metadata["chunks"][::-1] - array_metadata["fill_value"] = 0 # also if None was requested - array_metadata["order"] = "C" - array_metadata["filters"] = [] - array_metadata["dimension_separator"] = "." 
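To make the metadata conversion concrete, here is the effective before/after for a small zarr array, written out by hand (a sketch of the function's effect; the ``n5`` key appears only when ``top_level=True``)::

    zarr_meta = {
        "zarr_format": 2,
        "shape": [100, 200, 300],        # slowest-varying dimension first
        "chunks": [10, 20, 30],
        "dtype": "<u2",
        "compressor": {"id": "gzip", "level": 5},
        "fill_value": 0,
        "order": "C",
        "filters": None,
    }

    # array_metadata_to_n5(zarr_meta, top_level=True) effectively yields:
    n5_meta = {
        "dimensions": [300, 200, 100],   # reversed
        "blockSize": [30, 20, 10],       # reversed
        "dataType": "uint16",            # numpy name, byte order dropped
        "compression": {"type": "gzip", "level": 5, "useZlib": False},
        "n5": "2.0.0",
    }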
- array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str - - compressor_config = array_metadata["compressor"] - compressor_config = compressor_config_to_zarr(compressor_config) - array_metadata["compressor"] = { - "id": N5ChunkWrapper.codec_id, - "compressor_config": compressor_config, - "dtype": array_metadata["dtype"], - "chunk_shape": array_metadata["chunks"], - } - - return array_metadata - - -def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: - """Get all zarr attributes from an N5 attributes dictionary (i.e., - all non-keyword attributes).""" - - # remove all N5 keywords - for n5_key in n5_keywords: - if n5_key in attrs: - del attrs[n5_key] - - return attrs - - -def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: - return {"type": "raw"} - else: - _compressor_config = compressor_config - - # peel wrapper, if present - if _compressor_config["id"] == N5ChunkWrapper.codec_id: - _compressor_config = _compressor_config["compressor_config"] - - codec_id = _compressor_config["id"] - n5_config = {"type": codec_id} - - if codec_id == "bz2": - n5_config["type"] = "bzip2" - n5_config["blockSize"] = _compressor_config["level"] - - elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] - n5_config["clevel"] = _compressor_config["clevel"] - n5_config["shuffle"] = _compressor_config["shuffle"] - n5_config["blocksize"] = _compressor_config["blocksize"] - - elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. - # Note: 4 is the default, which is lzma.CHECK_CRC64. - if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: - n5_config["type"] = "xz" - else: - warnings.warn( - "Not all N5 implementations support lzma compression (yet). You " - "might not be able to open the dataset with another N5 library.", - RuntimeWarning, - ) - n5_config["format"] = _compressor_config["format"] - n5_config["check"] = _compressor_config["check"] - n5_config["filters"] = _compressor_config["filters"] - - # The default is lzma.PRESET_DEFAULT, which is 6. 
- if _compressor_config["preset"]: - n5_config["preset"] = _compressor_config["preset"] - else: - n5_config["preset"] = 6 - - elif codec_id == "zlib": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = True - - elif codec_id == "gzip": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = False - - else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) - - return n5_config - - -def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] - zarr_config = {"id": codec_id} - - if codec_id == "bzip2": - zarr_config["id"] = "bz2" - zarr_config["level"] = compressor_config["blockSize"] - - elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] - zarr_config["clevel"] = compressor_config["clevel"] - zarr_config["shuffle"] = compressor_config["shuffle"] - zarr_config["blocksize"] = compressor_config["blocksize"] - - elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] - zarr_config["check"] = compressor_config["check"] - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = compressor_config["filters"] - - elif codec_id == "xz": - zarr_config["id"] = "lzma" - zarr_config["format"] = 1 # lzma.FORMAT_XZ - zarr_config["check"] = -1 - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = None - - elif codec_id == "gzip": - if compressor_config.get("useZlib"): - zarr_config["id"] = "zlib" - zarr_config["level"] = compressor_config["level"] - else: - zarr_config["id"] = "gzip" - zarr_config["level"] = compressor_config["level"] - - elif codec_id == "raw": - return None - - else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) - - return zarr_config - - -class N5ChunkWrapper(Codec): # type: ignore[misc] - codec_id = "n5_wrapper" - - def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) - self.chunk_shape = tuple(chunk_shape) - # is the dtype a little endian format? 
- self._little_endian = self.dtype.byteorder == "<" or ( - self.dtype.byteorder == "=" and sys.byteorder == "little" - ) - - if compressor: - if compressor_config is not None: - raise ValueError("Only one of compressor_config or compressor should be given.") - compressor_config = compressor.get_config() - - if compressor_config is None and compressor is None or compressor_config["id"] == "raw": - self.compressor_config = None - self._compressor = None - else: - self._compressor = get_codec(compressor_config) - self.compressor_config = self._compressor.get_config() - - def get_config(self): - config = {"id": self.codec_id, "compressor_config": self.compressor_config} - return config - - def encode(self, chunk): - assert chunk.flags.c_contiguous - - header = self._create_header(chunk) - chunk = self._to_big_endian(chunk) - - if self._compressor: - return header + self._compressor.encode(chunk) - else: - return header + chunk.tobytes(order="A") - - def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) - chunk = chunk[len_header:] - - if out is not None: - # out should only be used if we read a complete chunk - assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( - self.chunk_shape, chunk_shape - ) - - if self._compressor: - self._compressor.decode(chunk, out) - else: - ndarray_copy(chunk, out) - - # we can byteswap in-place - if self._little_endian: - out.byteswap(True) - - return out - - else: - if self._compressor: - chunk = self._compressor.decode(chunk) - - # more expensive byteswap - chunk = self._from_big_endian(chunk) - - # read partial chunk - if chunk_shape != self.chunk_shape: - chunk = np.frombuffer(chunk, dtype=self.dtype) - chunk = chunk.reshape(chunk_shape) - complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) - target_slices = tuple(slice(0, s) for s in chunk_shape) - complete_chunk[target_slices] = chunk - chunk = complete_chunk - - return chunk - - @staticmethod - def _create_header(chunk): - mode = struct.pack(">H", 0) - num_dims = struct.pack(">H", len(chunk.shape)) - shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) - - return mode + num_dims + shape - - @staticmethod - def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] - shape = tuple( - struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) - )[::-1] - - len_header = 4 + num_dims * 4 - - return len_header, shape - - def _to_big_endian(self, data): - # assumes data is ndarray - - if self._little_endian: - return data.byteswap() - return data - - def _from_big_endian(self, data): - # assumes data is byte array in big endian - - if not self._little_endian: - return data - - a = np.frombuffer(data, self.dtype.newbyteorder(">")) - return a.astype(self.dtype) - - -register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/v2/storage.py b/src/zarr/v2/storage.py deleted file mode 100644 index 67240e520d..0000000000 --- a/src/zarr/v2/storage.py +++ /dev/null @@ -1,2822 +0,0 @@ -"""This module contains storage classes for use with Zarr arrays and groups. - -Note that any object implementing the :class:`MutableMapping` interface from the -:mod:`collections` module in the Python standard library can be used as a Zarr -array store, as long as it accepts string (str) keys and bytes values. 
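For reference, the header that ``N5ChunkWrapper._create_header``/``_read_header`` above exchange is a small big-endian struct: a uint16 mode (0 for the default mode), a uint16 dimension count, then one uint32 per dimension in N5's reversed order. A self-contained sketch (``pack_header``/``unpack_header`` are illustrative names)::

    import struct

    def pack_header(shape_zarr):
        # mode 0, ndim, then uint32 sizes in reversed (N5) order
        shape_n5 = shape_zarr[::-1]
        return struct.pack(">HH", 0, len(shape_n5)) + b"".join(
            struct.pack(">I", d) for d in shape_n5
        )

    def unpack_header(buf):
        ndim = struct.unpack(">H", buf[2:4])[0]
        shape_n5 = struct.unpack(">" + "I" * ndim, buf[4 : 4 + 4 * ndim])
        # return (header length, shape back in zarr order)
        return 4 + 4 * ndim, shape_n5[::-1]

    hdr = pack_header((10, 20, 30))
    assert unpack_header(hdr) == (16, (10, 20, 30))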
- -In addition to the :class:`MutableMapping` interface, store classes may also implement -optional methods `listdir` (list members of a "directory") and `rmdir` (remove all -members of a "directory"). These methods should be implemented if the store class is -aware of the hierarchical organisation of resources within the store and can provide -efficient implementations. If these methods are not available, Zarr will fall back to -slower implementations that work via the :class:`MutableMapping` interface. Store -classes may also optionally implement a `rename` method (rename all members under a given -path) and a `getsize` method (return the size in bytes of a given value). - -""" - -import atexit -import errno -import glob -import multiprocessing -import operator -import os -import re -import shutil -import sys -import tempfile -import warnings -import zipfile -from collections import OrderedDict -from collections.abc import MutableMapping -from os import scandir -from pickle import PicklingError -from threading import Lock, RLock -from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any -import uuid -import time - -from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like -from numcodecs.registry import codec_registry -from zarr.v2.context import Context - -from zarr.v2.errors import ( - MetadataError, - BadCompressorError, - ContainsArrayError, - ContainsGroupError, - FSPathExistNotDir, - ReadOnlyError, -) -from zarr.v2.meta import encode_array_metadata, encode_group_metadata -from zarr.v2.util import ( - buffer_size, - json_loads, - nolock, - normalize_chunks, - normalize_dimension_separator, - normalize_dtype, - normalize_fill_value, - normalize_order, - normalize_shape, - normalize_storage_path, - retry_call, - ensure_contiguous_ndarray_or_bytes, -) - -from zarr.v2._storage.absstore import ABSStore # noqa: F401 -from zarr.v2._storage.store import ( # noqa: F401 - _listdir_from_keys, - _rename_from_keys, - _rmdir_from_keys, - _path_to_prefix, - _prefix_to_array_key, - _prefix_to_group_key, - array_meta_key, - attrs_key, - group_meta_key, - DEFAULT_ZARR_VERSION, - BaseStore, - Store, -) - -__doctest_requires__ = { - ("RedisStore", "RedisStore.*"): ["redis"], - ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], - ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], -} - - -try: - # noinspection PyUnresolvedReferences - from zarr.v2.codecs import Blosc - - default_compressor = Blosc() -except ImportError: # pragma: no cover - from zarr.v2.codecs import Zlib - - default_compressor = Zlib() - - -Path = Union[str, bytes, None] -# allow MutableMapping for backwards compatibility -StoreLike = Union[BaseStore, MutableMapping[str, Any]] - - -def contains_array(store: StoreLike, path: Path = None) -> bool: - """Return True if the store contains an array at the given logical path.""" - path = normalize_storage_path(path) - prefix = _path_to_prefix(path) - key = _prefix_to_array_key(store, prefix) - return key in store - - -def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> bool: - """Return True if the store contains a group at the given logical path.""" - path = normalize_storage_path(path) - prefix = _path_to_prefix(path) - key = _prefix_to_group_key(store, prefix) - return key in store - - -def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore: - if store is None: - store = KVStore(dict()) - return store - if isinstance(store, os.PathLike): - store = os.fspath(store) - if 
FSStore._fsspec_installed(): - import fsspec - - if isinstance(store, fsspec.FSMap): - return FSStore( - store.root, - fs=store.fs, - mode=mode, - check=store.check, - create=store.create, - missing_exceptions=store.missing_exceptions, - **(storage_options or {}), - ) - if isinstance(store, str): - if "://" in store or "::" in store: - return FSStore(store, mode=mode, **(storage_options or {})) - elif storage_options: - raise ValueError("storage_options passed with non-fsspec path") - if store.endswith(".zip"): - return ZipStore(store, mode=mode) - elif store.endswith(".n5"): - from zarr.v2.n5 import N5Store - - return N5Store(store) - else: - return DirectoryStore(store) - else: - store = Store._ensure_store(store) - return store - - -def rmdir(store: StoreLike, path: Path = None): - """Remove all items under the given path. If `store` provides a `rmdir` method, - this will be called, otherwise will fall back to implementation via the - `Store` interface.""" - path = normalize_storage_path(path) - if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore - # pass through - store.rmdir(path) - else: - # slow version, delete one key at a time - _rmdir_from_keys(store, path) - - -def rename(store: Store, src_path: Path, dst_path: Path): - """Rename all items under the given path. If `store` provides a `rename` method, - this will be called, otherwise will fall back to implementation via the - `Store` interface.""" - src_path = normalize_storage_path(src_path) - dst_path = normalize_storage_path(dst_path) - if hasattr(store, "rename"): - # pass through - store.rename(src_path, dst_path) - else: - # slow version, delete one key at a time - _rename_from_keys(store, src_path, dst_path) - - -def listdir(store: BaseStore, path: Path = None): - """Obtain a directory listing for the given path. If `store` provides a `listdir` - method, this will be called, otherwise will fall back to implementation via the - `MutableMapping` interface.""" - path = normalize_storage_path(path) - if hasattr(store, "listdir"): - # pass through - return store.listdir(path) - else: - # slow version, iterate through all keys - warnings.warn( - f"Store {store} has no `listdir` method. From zarr 2.9 onwards " - "may want to inherit from `Store`.", - stacklevel=2, - ) - return _listdir_from_keys(store, path) - - -def _getsize(store: BaseStore, path: Path = None) -> int: - # compute from size of values - if isinstance(path, str) and path in store: - v = store[path] - size = buffer_size(v) - else: - path = "" if path is None else normalize_storage_path(path) - size = 0 - - members = listdir(store, path) - prefix = _path_to_prefix(path) - members = [prefix + k for k in members] - for k in members: - try: - v = store[k] - except KeyError: - pass - else: - try: - size += buffer_size(v) - except TypeError: - return -1 - return size - - -def getsize(store: BaseStore, path: Path = None) -> int: - """Compute size of stored items for a given path. 
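``rmdir``, ``rename``, ``listdir`` and ``getsize`` above all follow the same capability-dispatch pattern: use the store's own method when it exists, otherwise fall back to a slower generic implementation driven by plain key iteration. A sketch of the ``listdir`` case (``generic_listdir`` is illustrative, not the module's ``_listdir_from_keys``)::

    def generic_listdir(store, path=""):
        if hasattr(store, "listdir"):
            return store.listdir(path)    # store-aware fast path
        # slow path: derive directory members from the flat key space
        prefix = path + "/" if path else ""
        children = {
            key[len(prefix):].split("/")[0]
            for key in store
            if key.startswith(prefix)
        }
        return sorted(children)

    store = {"a/.zarray": b"{}", "a/0.0": b"\x00", ".zgroup": b"{}"}
    assert generic_listdir(store) == [".zgroup", "a"]
    assert generic_listdir(store, "a") == [".zarray", "0.0"]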
If `store` provides a `getsize` - method, this will be called, otherwise will return -1.""" - if hasattr(store, "getsize"): - # pass through - path = normalize_storage_path(path) - return store.getsize(path) - elif isinstance(store, MutableMapping): - return _getsize(store, path) - else: - return -1 - - -def _require_parent_group( - path: Optional[str], - store: StoreLike, - chunk_store: Optional[StoreLike], - overwrite: bool, -): - # assume path is normalized - if path: - segments = path.split("/") - for i in range(len(segments)): - p = "/".join(segments[:i]) - if contains_array(store, p): - _init_group_metadata(store, path=p, chunk_store=chunk_store, overwrite=overwrite) - elif not contains_group(store, p): - _init_group_metadata(store, path=p, chunk_store=chunk_store) - - -def init_array( - store: StoreLike, - shape: Union[int, Tuple[int, ...]], - chunks: Union[bool, int, Tuple[int, ...]] = True, - dtype=None, - compressor="default", - fill_value=None, - order: str = "C", - overwrite: bool = False, - path: Optional[Path] = None, - chunk_store: Optional[StoreLike] = None, - filters=None, - object_codec=None, - dimension_separator=None, - storage_transformers=(), -): - """Initialize an array store with the given configuration. Note that this is a low-level - function and there should be no need to call this directly from user code. - - Parameters - ---------- - store : Store - A mapping that supports string keys and bytes-like values. - shape : int or tuple of ints - Array shape. - chunks : bool, int or tuple of ints, optional - Chunk shape. If True, will be guessed from `shape` and `dtype`. If - False, will be set to `shape`, i.e., single chunk for the whole array. - dtype : string or dtype, optional - NumPy dtype. - compressor : Codec, optional - Primary compressor. - fill_value : object - Default value to use for uninitialized portions of the array. - order : {'C', 'F'}, optional - Memory layout to be used within each chunk. - overwrite : bool, optional - If True, erase all data in `store` prior to initialisation. - path : string, bytes, optional - Path under which array is stored. - chunk_store : Store, optional - Separate storage for chunks. If not provided, `store` will be used - for storage of both chunks and metadata. - filters : sequence, optional - Sequence of filters to use to encode chunk data prior to compression. - object_codec : Codec, optional - A codec to encode object arrays, only needed if dtype=object. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. 
- - Examples - -------- - Initialize an array store:: - - >>> from zarr.v2.storage import init_array, KVStore - >>> store = KVStore(dict()) - >>> init_array(store, shape=(10000, 10000), chunks=(1000, 1000)) - >>> sorted(store.keys()) - ['.zarray'] - - Array metadata is stored as JSON:: - - >>> print(store['.zarray'].decode()) - { - "chunks": [ - 1000, - 1000 - ], - "compressor": { - "blocksize": 0, - "clevel": 5, - "cname": "lz4", - "id": "blosc", - "shuffle": 1 - }, - "dtype": "<f8", - "fill_value": null, - "filters": null, - "order": "C", - "shape": [ - 10000, - 10000 - ], - "zarr_format": 2 - } - - Initialize an array using a storage path:: - - >>> store = KVStore(dict()) - >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', path='foo') - >>> sorted(store.keys()) - ['.zgroup', 'foo/.zarray'] - >>> print(store['foo/.zarray'].decode()) - { - "chunks": [ - 1000000 - ], - "compressor": { - "blocksize": 0, - "clevel": 5, - "cname": "lz4", - "id": "blosc", - "shuffle": 1 - }, - "dtype": "|i1", - "fill_value": null, - "filters": null, - "order": "C", - "shape": [ - 100000000 - ], - "zarr_format": 2 - } - - Notes - ----- - The initialisation process involves normalising all array metadata, encoding - as JSON and storing under the '.zarray' key. - - """ - - # normalize path - path = normalize_storage_path(path) - - # ensure parent group initialized - - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) - - if not compressor: - # compatibility with legacy tests using compressor=[] - compressor = None - _init_array_metadata( - store, - shape=shape, - chunks=chunks, - dtype=dtype, - compressor=compressor, - fill_value=fill_value, - order=order, - overwrite=overwrite, - path=path, - chunk_store=chunk_store, - filters=filters, - object_codec=object_codec, - dimension_separator=dimension_separator, - storage_transformers=storage_transformers, - ) - - -def _init_array_metadata( - store: StoreLike, - shape, - chunks=None, - dtype=None, - compressor="default", - fill_value=None, - order="C", - overwrite=False, - path: Optional[str] = None, - chunk_store: Optional[StoreLike] = None, - filters=None, - object_codec=None, - dimension_separator=None, - storage_transformers=(), -): - path = normalize_storage_path(path) - - # guard conditions - if overwrite: - # attempt to delete any pre-existing array in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - - if not overwrite: - if contains_array(store, path): - raise ContainsArrayError(path) - if contains_group(store, path, explicit_only=False): - raise ContainsGroupError(path) - - # normalize metadata - dtype, object_codec = normalize_dtype(dtype, object_codec) - shape = normalize_shape(shape) + dtype.shape - dtype = dtype.base - chunks = normalize_chunks(chunks, shape, dtype.itemsize) - order = normalize_order(order) - fill_value = normalize_fill_value(fill_value, dtype) - - # optional array metadata - if dimension_separator is None: - dimension_separator = getattr(store, "_dimension_separator", None) - dimension_separator = normalize_dimension_separator(dimension_separator) - - # compressor prep - if shape == (): - # no point in compressing a 0-dimensional array, only a single value - compressor = None - elif compressor == "none": - # compatibility - compressor = None - elif compressor == "default": - compressor = default_compressor - - # obtain compressor config - compressor_config = None - if compressor: - try: - compressor_config = compressor.get_config() - except AttributeError as e: - raise BadCompressorError(compressor) from e - - # obtain filters config - if filters: - # TODO: filters was removed from the metadata in v3 - # raise error
here if store_version > 2? - filters_config = [f.get_config() for f in filters] - else: - filters_config = [] - - # deal with object encoding - if dtype.hasobject: - if object_codec is None: - if not filters: - # there are no filters so we can be sure there is no object codec - raise ValueError("missing object_codec for object array") - else: - # one of the filters may be an object codec, issue a warning rather - # than raise an error to maintain backwards-compatibility - warnings.warn( - "missing object_codec for object array; this will raise a " - "ValueError in version 3.0", - FutureWarning, - ) - else: - filters_config.insert(0, object_codec.get_config()) - elif object_codec is not None: - warnings.warn("an object_codec is only needed for object arrays") - - # use null to indicate no filters - if not filters_config: - filters_config = None # type: ignore - - # initialize metadata - _compressor = compressor_config - meta = dict( - shape=shape, - compressor=_compressor, - fill_value=fill_value, - dimension_separator=dimension_separator, - ) - - meta.update(dict(chunks=chunks, dtype=dtype, order=order, filters=filters_config)) - assert not storage_transformers - - key = _prefix_to_array_key(store, _path_to_prefix(path)) - if hasattr(store, "_metadata_class"): - store[key] = store._metadata_class.encode_array_metadata(meta) - else: - store[key] = encode_array_metadata(meta) - - -# backwards compatibility -init_store = init_array - - -def init_group( - store: StoreLike, - overwrite: bool = False, - path: Path = None, - chunk_store: Optional[StoreLike] = None, -): - """Initialize a group store. Note that this is a low-level function and there should be no - need to call this directly from user code. - - Parameters - ---------- - store : Store - A mapping that supports string keys and byte sequence values. - overwrite : bool, optional - If True, erase all data in `store` prior to initialisation. - path : string, optional - Path under which array is stored. - chunk_store : Store, optional - Separate storage for chunks. If not provided, `store` will be used - for storage of both chunks and metadata. 
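One subtlety in ``_init_array_metadata`` above: when the dtype is ``object``, the ``object_codec`` is not stored separately but prepended to the filters pipeline, so it is the first filter applied on encode and the last undone on decode. A sketch of that assembly using a real numcodecs codec::

    import numcodecs

    object_codec = numcodecs.JSON()   # any object codec would do
    filters_config = []               # no user-supplied filters

    # same move as the code above: object codec goes first in the pipeline
    filters_config.insert(0, object_codec.get_config())

    assert filters_config[0]["id"] == "json2"   # numcodecs' JSON codec id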
- - """ - - # normalize path - path = normalize_storage_path(path) - - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) - - # initialise metadata - _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) - - -def _init_group_metadata( - store: StoreLike, - overwrite: Optional[bool] = False, - path: Optional[str] = None, - chunk_store: Optional[StoreLike] = None, -): - path = normalize_storage_path(path) - - # guard conditions - if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - - if not overwrite: - if contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) - - # initialize metadata - # N.B., currently no metadata properties are needed, however there may - # be in future - meta: dict[str, Any] = {} - key = _prefix_to_group_key(store, _path_to_prefix(path)) - if hasattr(store, "_metadata_class"): - store[key] = store._metadata_class.encode_group_metadata(meta) - else: - store[key] = encode_group_metadata(meta) - - -def _dict_store_keys(d: dict[str, Any], prefix="", cls=dict): - for k in d.keys(): - v = d[k] - if isinstance(v, cls): - yield from _dict_store_keys(v, prefix + k + "/", cls) - else: - yield prefix + k - - -class KVStore(Store): - """ - This provides a default implementation of a store interface around - a mutable mapping, to avoid having to test stores for presence of methods. - - This, for most methods should just be a pass-through to the underlying KV - store which is likely to expose a MuttableMapping interface, - """ - - def __init__(self, mutablemapping): - self._mutable_mapping = mutablemapping - - def __getitem__(self, key): - return self._mutable_mapping[key] - - def __setitem__(self, key, value): - self._mutable_mapping[key] = value - - def __delitem__(self, key): - del self._mutable_mapping[key] - - def __contains__(self, key): - return key in self._mutable_mapping - - def get(self, key, default=None): - return self._mutable_mapping.get(key, default) - - def values(self): - return self._mutable_mapping.values() - - def __iter__(self): - return iter(self._mutable_mapping) - - def __len__(self): - return len(self._mutable_mapping) - - def __repr__(self): - return f"<{self.__class__.__name__}: \n{self._mutable_mapping!r}\n at {hex(id(self))}>" - - def __eq__(self, other): - if isinstance(other, KVStore): - return self._mutable_mapping == other._mutable_mapping - else: - return NotImplemented - - -class MemoryStore(Store): - """Store class that uses a hierarchy of :class:`KVStore` objects, thus all data - will be held in main memory. - - Examples - -------- - This is the default class used when creating a group. E.g.:: - - >>> import zarr - >>> g = zarr.v2.group() - >>> type(g.store) - - - Note that the default class when creating an array is the built-in - :class:`KVStore` class, i.e.:: - - >>> z = zarr.v2.zeros(100) - >>> type(z.store) - - - Notes - ----- - Safe to write in multiple threads. 
- - """ - - def __init__(self, root=None, cls=dict, dimension_separator=None): - if root is None: - self.root = cls() - else: - self.root = root - self.cls = cls - self.write_mutex = Lock() - self._dimension_separator = dimension_separator - - def __getstate__(self): - return self.root, self.cls - - def __setstate__(self, state): - root, cls = state - self.__init__(root=root, cls=cls) - - def _get_parent(self, item: str): - parent = self.root - # split the item - segments = item.split("/") - # find the parent container - for k in segments[:-1]: - parent = parent[k] - if not isinstance(parent, self.cls): - raise KeyError(item) - return parent, segments[-1] - - def _require_parent(self, item): - parent = self.root - # split the item - segments = item.split("/") - # require the parent container - for k in segments[:-1]: - try: - parent = parent[k] - except KeyError: - parent[k] = self.cls() - parent = parent[k] - else: - if not isinstance(parent, self.cls): - raise KeyError(item) - return parent, segments[-1] - - def __getitem__(self, item: str): - parent, key = self._get_parent(item) - try: - value = parent[key] - except KeyError: - raise KeyError(item) - else: - if isinstance(value, self.cls): - raise KeyError(item) - else: - return value - - def __setitem__(self, item: str, value): - with self.write_mutex: - parent, key = self._require_parent(item) - value = ensure_bytes(value) - parent[key] = value - - def __delitem__(self, item: str): - with self.write_mutex: - parent, key = self._get_parent(item) - try: - del parent[key] - except KeyError: - raise KeyError(item) - - def __contains__(self, item: str): # type: ignore[override] - try: - parent, key = self._get_parent(item) - value = parent[key] - except KeyError: - return False - else: - return not isinstance(value, self.cls) - - def __eq__(self, other): - return isinstance(other, MemoryStore) and self.root == other.root and self.cls == other.cls - - def keys(self): - yield from _dict_store_keys(self.root, cls=self.cls) - - def __iter__(self): - return self.keys() - - def __len__(self) -> int: - return sum(1 for _ in self.keys()) - - def listdir(self, path: Path = None) -> List[str]: - path = normalize_storage_path(path) - if path: - try: - parent, key = self._get_parent(path) - value = parent[key] - except KeyError: - return [] - else: - value = self.root - if isinstance(value, self.cls): - return sorted(value.keys()) - else: - return [] - - def rename(self, src_path: Path, dst_path: Path): - src_path = normalize_storage_path(src_path) - dst_path = normalize_storage_path(dst_path) - - src_parent, src_key = self._get_parent(src_path) - dst_parent, dst_key = self._require_parent(dst_path) - - dst_parent[dst_key] = src_parent.pop(src_key) - - def rmdir(self, path: Path = None): - path = normalize_storage_path(path) - if path: - try: - parent, key = self._get_parent(path) - value = parent[key] - except KeyError: - return - else: - if isinstance(value, self.cls): - del parent[key] - else: - # clear out root - self.root = self.cls() - - def getsize(self, path: Path = None): - path = normalize_storage_path(path) - - # obtain value to return size of - value = None - if path: - try: - parent, key = self._get_parent(path) - value = parent[key] - except KeyError: - pass - else: - value = self.root - - # obtain size of value - if value is None: - return 0 - - elif isinstance(value, self.cls): - # total size for directory - size = 0 - for v in value.values(): - if not isinstance(v, self.cls): - size += buffer_size(v) - return size - - else: - return 
buffer_size(value) - - def clear(self): - with self.write_mutex: - self.root.clear() - - -class DictStore(MemoryStore): - def __init__(self, *args, **kwargs): - warnings.warn( - "DictStore has been renamed to MemoryStore in 2.4.0 and " - "will be removed in the future. Please use MemoryStore.", - DeprecationWarning, - stacklevel=2, - ) - super().__init__(*args, **kwargs) - - -class DirectoryStore(Store): - """Storage class using directories and files on a standard file system. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.v2.DirectoryStore('data/array.zarr') - >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Each chunk of the array is stored as a separate file on the file system, - i.e.:: - - >>> import os - >>> sorted(os.listdir('data/array.zarr')) - ['.zarray', '0.0', '0.1', '1.0', '1.1'] - - Store a group:: - - >>> store = zarr.v2.DirectoryStore('data/group.zarr') - >>> root = zarr.v2.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - When storing a group, levels in the group hierarchy will correspond to - directories on the file system, i.e.:: - - >>> sorted(os.listdir('data/group.zarr')) - ['.zgroup', 'foo'] - >>> sorted(os.listdir('data/group.zarr/foo')) - ['.zgroup', 'bar'] - >>> sorted(os.listdir('data/group.zarr/foo/bar')) - ['.zarray', '0.0', '0.1', '1.0', '1.1'] - - Notes - ----- - Atomic writes are used, which means that data are first written to a - temporary file, then moved into place when the write is successfully - completed. Files are only held open while they are being read or written and are - closed immediately afterwards, so there is no need to manually close any files. - - Safe to write in multiple threads or processes. - - """ - - def __init__(self, path, normalize_keys=False, dimension_separator=None): - # guard conditions - path = os.path.abspath(path) - if os.path.exists(path) and not os.path.isdir(path): - raise FSPathExistNotDir(path) - - self.path = path - self.normalize_keys = normalize_keys - self._dimension_separator = dimension_separator - - def _normalize_key(self, key): - return key.lower() if self.normalize_keys else key - - @staticmethod - def _fromfile(fn): - """Read data from a file - - Parameters - ---------- - fn : str - Filepath to open and read from. - - Notes - ----- - Subclasses should overload this method to specify any custom - file reading logic. - """ - with open(fn, "rb") as f: - return f.read() - - @staticmethod - def _tofile(a, fn): - """Write data to a file - - Parameters - ---------- - a : array-like - Data to write into the file. - fn : str - Filepath to open and write to. - - Notes - ----- - Subclasses should overload this method to specify any custom - file writing logic. 
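``DirectoryStore.__setitem__`` below builds on ``_tofile`` to get atomic updates: data is written to a uniquely named sibling file and then moved into place with ``os.replace``, which is atomic on a single filesystem, so concurrent readers never observe a half-written value. The essence, as a standalone sketch::

    import os
    import uuid

    def atomic_write(file_path: str, data: bytes):
        dir_path, file_name = os.path.split(file_path)
        temp_path = os.path.join(
            dir_path, file_name + "." + uuid.uuid4().hex + ".partial"
        )
        try:
            with open(temp_path, "wb") as f:
                f.write(data)
            os.replace(temp_path, file_path)  # atomic on the same filesystem
        finally:
            # clean up if the temp file survived (e.g. the write failed)
            if os.path.exists(temp_path):
                os.remove(temp_path)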
- """ - with open(fn, mode="wb") as f: - f.write(a) - - def __getitem__(self, key): - key = self._normalize_key(key) - filepath = os.path.join(self.path, key) - if os.path.isfile(filepath): - return self._fromfile(filepath) - else: - raise KeyError(key) - - def __setitem__(self, key, value): - key = self._normalize_key(key) - - # coerce to flat, contiguous array (ideally without copying) - value = ensure_contiguous_ndarray_like(value) - - # destination path for key - file_path = os.path.join(self.path, key) - - # ensure there is no directory in the way - if os.path.isdir(file_path): - shutil.rmtree(file_path) - - # ensure containing directory exists - dir_path, file_name = os.path.split(file_path) - if os.path.isfile(dir_path): - raise KeyError(key) - if not os.path.exists(dir_path): - try: - os.makedirs(dir_path) - except OSError as e: - if e.errno != errno.EEXIST: - raise KeyError(key) - - # write to temporary file - # note we're not using tempfile.NamedTemporaryFile to avoid restrictive file permissions - temp_name = file_name + "." + uuid.uuid4().hex + ".partial" - temp_path = os.path.join(dir_path, temp_name) - try: - self._tofile(value, temp_path) - - # move temporary file into place; - # make several attempts at writing the temporary file to get past - # potential antivirus file locking issues - retry_call(os.replace, (temp_path, file_path), exceptions=(PermissionError,)) - - finally: - # clean up if temp file still exists for whatever reason - if os.path.exists(temp_path): # pragma: no cover - os.remove(temp_path) - - def __delitem__(self, key): - key = self._normalize_key(key) - path = os.path.join(self.path, key) - if os.path.isfile(path): - os.remove(path) - elif os.path.isdir(path): - # include support for deleting directories, even though strictly - # speaking these do not exist as keys in the store - shutil.rmtree(path) - else: - raise KeyError(key) - - def __contains__(self, key): - key = self._normalize_key(key) - file_path = os.path.join(self.path, key) - return os.path.isfile(file_path) - - def __eq__(self, other): - return isinstance(other, DirectoryStore) and self.path == other.path - - def keys(self): - if os.path.exists(self.path): - yield from self._keys_fast(self.path) - - @staticmethod - def _keys_fast(path, walker=os.walk): - for dirpath, _, filenames in walker(path): - dirpath = os.path.relpath(dirpath, path) - if dirpath == os.curdir: - for f in filenames: - yield f - else: - dirpath = dirpath.replace("\\", "/") - for f in filenames: - yield "/".join((dirpath, f)) - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def dir_path(self, path=None): - store_path = normalize_storage_path(path) - dir_path = self.path - if store_path: - dir_path = os.path.join(dir_path, store_path) - return dir_path - - def listdir(self, path=None): - return ( - self._nested_listdir(path) - if self._dimension_separator == "/" - else self._flat_listdir(path) - ) - - def _flat_listdir(self, path=None): - dir_path = self.dir_path(path) - if os.path.isdir(dir_path): - return sorted(os.listdir(dir_path)) - else: - return [] - - def _nested_listdir(self, path=None): - children = self._flat_listdir(path=path) - if array_meta_key in children: - # special handling of directories containing an array to map nested chunk - # keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and os.path.isdir(entry_path): 
- for dir_path, _, file_names in os.walk(entry_path): - for file_name in file_names: - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path + os.path.sep)[1] - new_children.append( - rel_path.replace(os.path.sep, self._dimension_separator or ".") - ) - else: - new_children.append(entry) - return sorted(new_children) - else: - return children - - def rename(self, src_path, dst_path): - store_src_path = normalize_storage_path(src_path) - store_dst_path = normalize_storage_path(dst_path) - - dir_path = self.path - - src_path = os.path.join(dir_path, store_src_path) - dst_path = os.path.join(dir_path, store_dst_path) - - os.renames(src_path, dst_path) - - def rmdir(self, path=None): - store_path = normalize_storage_path(path) - dir_path = self.path - if store_path: - dir_path = os.path.join(dir_path, store_path) - if os.path.isdir(dir_path): - shutil.rmtree(dir_path) - - def getsize(self, path=None): - store_path = normalize_storage_path(path) - fs_path = self.path - if store_path: - fs_path = os.path.join(fs_path, store_path) - if os.path.isfile(fs_path): - return os.path.getsize(fs_path) - elif os.path.isdir(fs_path): - size = 0 - for child in scandir(fs_path): - if child.is_file(): - size += child.stat().st_size - return size - else: - return 0 - - def clear(self): - shutil.rmtree(self.path) - - -def atexit_rmtree(path, isdir=os.path.isdir, rmtree=shutil.rmtree): # pragma: no cover - """Ensure directory removal at interpreter exit.""" - if isdir(path): - rmtree(path) - - -# noinspection PyShadowingNames -def atexit_rmglob( - path, - glob=glob.glob, - isdir=os.path.isdir, - isfile=os.path.isfile, - remove=os.remove, - rmtree=shutil.rmtree, -): # pragma: no cover - """Ensure removal of multiple files at interpreter exit.""" - for p in glob(path): - if isfile(p): - remove(p) - elif isdir(p): - rmtree(p) - - -class FSStore(Store): - """Wraps an fsspec.FSMap to give access to arbitrary filesystems - - Requires that ``fsspec`` is installed, as well as any additional - requirements for the protocol chosen. - - Parameters - ---------- - url : str - The destination to map. If no fs is provided, should include protocol - and path, like "s3://bucket/root". If an fs is provided, can be a path - within that filesystem, like "bucket/root" - normalize_keys : bool - key_separator : str - public API for accessing dimension_separator. Never `None` - See dimension_separator for more information. - mode : str - "w" for writable, "r" for read-only - exceptions : list of Exception subclasses - When accessing data, any of these exceptions will be treated - as a missing key - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - fs : fsspec.spec.AbstractFileSystem, optional - An existing filesystem to use for the store. - check : bool, optional - If True, performs a touch at the root location, to check for write access. - Passed to `fsspec.mapping.FSMap` constructor. - create : bool, optional - If True, performs a mkdir at the root location. - Passed to `fsspec.mapping.FSMap` constructor. - missing_exceptions : sequence of Exceptions, optional - Exception classes to associate with missing files. - Passed to `fsspec.mapping.FSMap` constructor. - storage_options : passed to the fsspec implementation. Cannot be used - together with fs.
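``FSStore._normalize_key`` below rewrites only the final key segment: metadata keys pass through verbatim, while ``.`` in a chunk key is swapped for the store's ``key_separator``. A simplified sketch (omitting the lower-casing applied when ``normalize_keys`` is set)::

    META_KEYS = (".zarray", ".zgroup", ".zattrs")

    def normalize_key(key: str, key_separator: str = "/") -> str:
        key = key.strip("/")
        if not key:
            return key
        *bits, end = key.split("/")
        if end not in META_KEYS:
            end = end.replace(".", key_separator)
        return "/".join(bits + [end])

    assert normalize_key("foo/0.0") == "foo/0/0"          # chunk key rewritten
    assert normalize_key("foo/.zarray") == "foo/.zarray"  # metadata untouched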
- """ - - _array_meta_key = array_meta_key - _group_meta_key = group_meta_key - _attrs_key = attrs_key - - def __init__( - self, - url, - normalize_keys=False, - key_separator=None, - mode="w", - exceptions=(KeyError, PermissionError, IOError), - dimension_separator=None, - fs=None, - check=False, - create=False, - missing_exceptions=None, - **storage_options, - ): - if not self._fsspec_installed(): # pragma: no cover - raise ImportError("`fsspec` is required to use zarr's FSStore") - import fsspec - - mapper_options = {"check": check, "create": create} - # https://github.com/zarr-developers/zarr-python/pull/911#discussion_r841926292 - # Some fsspec implementations don't accept missing_exceptions. - # This is a workaround to avoid passing it in the most common scenarios. - # Remove this and add missing_exceptions to mapper_options when fsspec is released. - if missing_exceptions is not None: - mapper_options["missing_exceptions"] = missing_exceptions # pragma: no cover - - if fs is None: - protocol, _ = fsspec.core.split_protocol(url) - # set auto_mkdir to True for local file system - if protocol in (None, "file") and not storage_options.get("auto_mkdir"): - storage_options["auto_mkdir"] = True - self.map = fsspec.get_mapper(url, **{**mapper_options, **storage_options}) - self.fs = self.map.fs # for direct operations - self.path = self.fs._strip_protocol(url) - else: - if storage_options: - raise ValueError("Cannot specify both fs and storage_options") - self.fs = fs - self.path = self.fs._strip_protocol(url) - self.map = self.fs.get_mapper(self.path, **mapper_options) - - self.normalize_keys = normalize_keys - self.mode = mode - self.exceptions = exceptions - # For backwards compatibility. Guaranteed to be non-None - if key_separator is not None: - dimension_separator = key_separator - - self.key_separator = dimension_separator - self._default_key_separator() - - # Pass attributes to array creation - self._dimension_separator = dimension_separator - - def _default_key_separator(self): - if self.key_separator is None: - self.key_separator = "." - - def _normalize_key(self, key): - key = normalize_storage_path(key).lstrip("/") - if key: - *bits, end = key.split("/") - - if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): - end = end.replace(".", self.key_separator) - key = "/".join(bits + [end]) - - return key.lower() if self.normalize_keys else key - - def getitems( - self, keys: Sequence[str], *, contexts: Mapping[str, Context] - ) -> Mapping[str, Any]: - keys_transformed = [self._normalize_key(key) for key in keys] - results = self.map.getitems(keys_transformed, on_error="omit") - # The function calling this method may not recognize the transformed keys - # So we send the values returned by self.map.getitems back into the original key space. 
- return {keys[keys_transformed.index(rk)]: rv for rk, rv in results.items()} - - def __getitem__(self, key): - key = self._normalize_key(key) - try: - return self.map[key] - except self.exceptions as e: - raise KeyError(key) from e - - def setitems(self, values): - if self.mode == "r": - raise ReadOnlyError - - # Normalize keys and make sure the values are bytes - values = { - self._normalize_key(key): ensure_contiguous_ndarray_or_bytes(val) - for key, val in values.items() - } - self.map.setitems(values) - - def __setitem__(self, key, value): - if self.mode == "r": - raise ReadOnlyError - key = self._normalize_key(key) - value = ensure_contiguous_ndarray_or_bytes(value) - path = self.dir_path(key) - try: - if self.fs.isdir(path): - self.fs.rm(path, recursive=True) - self.map[key] = value - self.fs.invalidate_cache(self.fs._parent(path)) - except self.exceptions as e: - raise KeyError(key) from e - - def __delitem__(self, key): - if self.mode == "r": - raise ReadOnlyError - key = self._normalize_key(key) - path = self.dir_path(key) - if self.fs.isdir(path): - self.fs.rm(path, recursive=True) - else: - del self.map[key] - - def delitems(self, keys): - if self.mode == "r": - raise ReadOnlyError - # only remove the keys that exist in the store - nkeys = [self._normalize_key(key) for key in keys if key in self] - # rm errors if you pass an empty collection - if len(nkeys) > 0: - self.map.delitems(nkeys) - - def __contains__(self, key): - key = self._normalize_key(key) - return key in self.map - - def __eq__(self, other): - return type(self) is type(other) and self.map == other.map and self.mode == other.mode - - def keys(self): - return iter(self.map) - - def __iter__(self): - return self.keys() - - def __len__(self): - return len(list(self.keys())) - - def dir_path(self, path=None): - store_path = normalize_storage_path(path) - return self.map._key_to_str(store_path) - - def listdir(self, path=None): - dir_path = self.dir_path(path) - try: - children = sorted( - p.rstrip("/").rsplit("/", 1)[-1] for p in self.fs.ls(dir_path, detail=False) - ) - if self.key_separator != "/": - return children - else: - if self._array_meta_key in children: - # special handling of directories containing an array to map nested chunk - # keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and self.fs.isdir(entry_path): - for file_name in self.fs.find(entry_path): - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path)[1] - rel_path = rel_path.lstrip("/") - new_children.append(rel_path.replace("/", ".")) - else: - new_children.append(entry) - return sorted(new_children) - else: - return children - except OSError: - return [] - - def rmdir(self, path=None): - if self.mode == "r": - raise ReadOnlyError - store_path = self.dir_path(path) - if self.fs.isdir(store_path): - self.fs.rm(store_path, recursive=True) - - def getsize(self, path=None): - store_path = self.dir_path(path) - return self.fs.du(store_path, True, True) - - def clear(self): - if self.mode == "r": - raise ReadOnlyError - self.map.clear() - - @classmethod - def _fsspec_installed(cls): - """Returns true if fsspec is installed""" - import importlib.util - - return importlib.util.find_spec("fsspec") is not None - - -class TempStore(DirectoryStore): - """Directory store using a temporary directory for storage. 
- - Parameters - ---------- - suffix : string, optional - Suffix for the temporary directory name. - prefix : string, optional - Prefix for the temporary directory name. - dir : string, optional - Path to parent directory in which to create temporary directory. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - """ - - # noinspection PyShadowingBuiltins - def __init__( - self, suffix="", prefix="zarr", dir=None, normalize_keys=False, dimension_separator=None - ): - path = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) - atexit.register(atexit_rmtree, path) - super().__init__(path, normalize_keys=normalize_keys) - - -_prog_ckey = re.compile(r"^(\d+)(\.\d+)+$") -_prog_number = re.compile(r"^\d+$") - - -class NestedDirectoryStore(DirectoryStore): - """Storage class using directories and files on a standard file system, with - special handling for chunk keys so that chunk files for multidimensional - arrays are stored in a nested directory tree. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'/'}, optional - Separator placed between the dimensions of a chunk. - Only supports "/" unlike other implementations. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.v2.NestedDirectoryStore('data/array.zarr') - >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Each chunk of the array is stored as a separate file on the file system, - note the multiple directory levels used for the chunk files:: - - >>> import os - >>> sorted(os.listdir('data/array.zarr')) - ['.zarray', '0', '1'] - >>> sorted(os.listdir('data/array.zarr/0')) - ['0', '1'] - >>> sorted(os.listdir('data/array.zarr/1')) - ['0', '1'] - - Store a group:: - - >>> store = zarr.v2.NestedDirectoryStore('data/group.zarr') - >>> root = zarr.v2.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - When storing a group, levels in the group hierarchy will correspond to - directories on the file system, i.e.:: - - >>> sorted(os.listdir('data/group.zarr')) - ['.zgroup', 'foo'] - >>> sorted(os.listdir('data/group.zarr/foo')) - ['.zgroup', 'bar'] - >>> sorted(os.listdir('data/group.zarr/foo/bar')) - ['.zarray', '0', '1'] - >>> sorted(os.listdir('data/group.zarr/foo/bar/0')) - ['0', '1'] - >>> sorted(os.listdir('data/group.zarr/foo/bar/1')) - ['0', '1'] - - Notes - ----- - The :class:`DirectoryStore` class stores all chunk files for an array - together in a single directory. On some file systems, the potentially large - number of files in a single directory can cause performance issues. 
The - :class:`NestedDirectoryStore` class provides an alternative where chunk - files for multidimensional arrays will be organised into a directory - hierarchy, thus reducing the number of files in any one directory. - - Safe to write in multiple threads or processes. - - """ - - def __init__(self, path, normalize_keys=False, dimension_separator="/"): - super().__init__(path, normalize_keys=normalize_keys) - if dimension_separator is None: - dimension_separator = "/" - elif dimension_separator != "/": - raise ValueError("NestedDirectoryStore only supports '/' as dimension_separator") - self._dimension_separator = dimension_separator - - def __eq__(self, other): - return isinstance(other, NestedDirectoryStore) and self.path == other.path - - -# noinspection PyPep8Naming -class ZipStore(Store): - """Storage class using a Zip file. - - Parameters - ---------- - path : string - Location of file. - compression : integer, optional - Compression method to use when writing to the archive. - allowZip64 : bool, optional - If True (the default) will create ZIP files that use the ZIP64 - extensions when the zipfile is larger than 2 GiB. If False - will raise an exception when the ZIP file would require ZIP64 - extensions. - mode : string, optional - One of 'r' to read an existing file, 'w' to truncate and write a new - file, 'a' to append to an existing file, or 'x' to exclusively create - and write a new file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.v2.ZipStore('data/array.zip', mode='w') - >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.v2.ZipStore('data/group.zip', mode='w') - >>> root = zarr.v2.group(store=store) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a ZipStore, the ``close()`` method must be called, otherwise - essential data will not be written to the underlying Zip file. The ZipStore - class also supports the context manager protocol, which ensures the ``close()`` - method is called on leaving the context, e.g.:: - - >>> with zarr.v2.ZipStore('data/array.zip', mode='w') as store: - ... z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store) - ... z[...] = 42 - ... # no need to call store.close() - - Notes - ----- - Each chunk of an array is stored as a separate entry in the Zip file. Note - that Zip files do not provide any way to remove or replace existing entries. - If an attempt is made to replace an entry, then a warning is generated by - the Python standard library about a duplicate Zip file entry. This can be - triggered if you attempt to write data to a Zarr array more than once, - e.g.:: - - >>> store = zarr.v2.ZipStore('data/example.zip', mode='w') - >>> z = zarr.v2.zeros(100, chunks=10, store=store) - >>> # first write OK - ... z[...] = 42 - >>> # second write generates warnings - ... z[...] 
= 42 # doctest: +SKIP - >>> store.close() - - This can also happen in a more subtle situation, where data are written only - once to a Zarr array, but the write operations are not aligned with chunk - boundaries, e.g.:: - - >>> store = zarr.v2.ZipStore('data/example.zip', mode='w') - >>> z = zarr.v2.zeros(100, chunks=10, store=store) - >>> z[5:15] = 42 - >>> # write overlaps chunk previously written, generates warnings - ... z[15:25] = 42 # doctest: +SKIP - - To avoid creating duplicate entries, only write data once, and align writes - with chunk boundaries. This alignment is done automatically if you call - ``z[...] = ...`` or create an array from existing data via :func:`zarr.v2.array`. - - Alternatively, use a :class:`DirectoryStore` when writing the data, then - manually Zip the directory and use the Zip file for subsequent reads. - Take note that the files in the Zip file must be relative to the root of the - Zarr archive. You may find it easier to create such a Zip file with ``7z``, e.g.:: - - 7z a -tzip archive.zarr.v2.zip archive.zarr/. - - Safe to write in multiple threads but not in multiple processes. - - """ - - _erasable = False - - def __init__( - self, - path, - compression=zipfile.ZIP_STORED, - allowZip64=True, - mode="a", - dimension_separator=None, - ): - # store properties - path = os.path.abspath(path) - self.path = path - self.compression = compression - self.allowZip64 = allowZip64 - self.mode = mode - self._dimension_separator = dimension_separator - - # Current understanding is that zipfile module in stdlib is not thread-safe, - # and so locking is required for both read and write. However, this has not - # been investigated in detail, perhaps no lock is needed if mode='r'. - self.mutex = RLock() - - # open zip file - self.zf = zipfile.ZipFile(path, mode=mode, compression=compression, allowZip64=allowZip64) - - def __getstate__(self): - self.flush() - return self.path, self.compression, self.allowZip64, self.mode - - def __setstate__(self, state): - path, compression, allowZip64, mode = state - # if initially opened with mode 'w' or 'x', re-open in mode 'a' so file doesn't - # get clobbered - if mode in "wx": - mode = "a" - self.__init__(path=path, compression=compression, allowZip64=allowZip64, mode=mode) - - def close(self): - """Closes the underlying zip file, ensuring all records are written.""" - with self.mutex: - self.zf.close() - - def flush(self): - """Closes the underlying zip file, ensuring all records are written, - then re-opens the file for further modifications.""" - if self.mode != "r": - with self.mutex: - self.zf.close() - # N.B., re-open with mode 'a' regardless of initial mode so we don't wipe - # what's been written - self.zf = zipfile.ZipFile( - self.path, mode="a", compression=self.compression, allowZip64=self.allowZip64 - ) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - with self.mutex: - with self.zf.open(key) as f: # will raise KeyError - return f.read() - - def __setitem__(self, key, value): - if self.mode == "r": - raise ReadOnlyError - value = ensure_contiguous_ndarray_like(value).view("u1") - with self.mutex: - # writestr(key, value) writes with default permissions from - # zipfile (600) that are too restrictive, build ZipInfo for - # the key to work around limitation - keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) - keyinfo.compress_type = self.compression - if keyinfo.filename[-1] == os.sep: - keyinfo.external_attr = 0o40775 << 
16 # drwxrwxr-x - keyinfo.external_attr |= 0x10 # MS-DOS directory flag - else: - keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- - - self.zf.writestr(keyinfo, value) - - def __delitem__(self, key): - raise NotImplementedError - - def __eq__(self, other): - return ( - isinstance(other, ZipStore) - and self.path == other.path - and self.compression == other.compression - and self.allowZip64 == other.allowZip64 - ) - - def keylist(self): - with self.mutex: - return sorted(self.zf.namelist()) - - def keys(self): - yield from self.keylist() - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - try: - with self.mutex: - self.zf.getinfo(key) - except KeyError: - return False - else: - return True - - def listdir(self, path=None): - path = normalize_storage_path(path) - return _listdir_from_keys(self, path) - - def getsize(self, path=None): - path = normalize_storage_path(path) - with self.mutex: - children = self.listdir(path) - if children: - size = 0 - for child in children: - if path: - name = path + "/" + child - else: - name = child - try: - info = self.zf.getinfo(name) - except KeyError: - pass - else: - size += info.compress_size - return size - elif path: - try: - info = self.zf.getinfo(path) - return info.compress_size - except KeyError: - return 0 - else: - return 0 - - def clear(self): - if self.mode == "r": - raise ReadOnlyError - with self.mutex: - self.close() - os.remove(self.path) - self.zf = zipfile.ZipFile( - self.path, mode=self.mode, compression=self.compression, allowZip64=self.allowZip64 - ) - - -def migrate_1to2(store): - """Migrate array metadata in `store` from Zarr format version 1 to - version 2. - - Parameters - ---------- - store : Store - Store to be migrated. - - Notes - ----- - Version 1 did not support hierarchies, so this migration function will - look for a single array in `store` and migrate the array metadata to - version 2. - - """ - - # migrate metadata - from zarr.v2 import meta_v1 - - meta = meta_v1.decode_metadata(store["meta"]) - del store["meta"] - - # add empty filters - meta["filters"] = None - - # migration compression metadata - compression = meta["compression"] - if compression is None or compression == "none": - compressor_config = None - else: - compression_opts = meta["compression_opts"] - codec_cls = codec_registry[compression] - if isinstance(compression_opts, dict): - compressor = codec_cls(**compression_opts) - else: - compressor = codec_cls(compression_opts) - compressor_config = compressor.get_config() - meta["compressor"] = compressor_config - del meta["compression"] - del meta["compression_opts"] - - # store migrated metadata - if hasattr(store, "_metadata_class"): - store[array_meta_key] = store._metadata_class.encode_array_metadata(meta) - else: - store[array_meta_key] = encode_array_metadata(meta) - - # migrate user attributes - store[attrs_key] = store["attrs"] - del store["attrs"] - - -# noinspection PyShadowingBuiltins -class DBMStore(Store): - """Storage class using a DBM-style database. - - Parameters - ---------- - path : string - Location of database file. - flag : string, optional - Flags for opening the database file. - mode : int - File mode used if a new file is created. - open : function, optional - Function to open the database file. If not provided, :func:`dbm.open` will be - used on Python 3, and :func:`anydbm.open` will be used on Python 2. 
-    write_lock: bool, optional
-        Use a lock to prevent concurrent writes from multiple threads (True by default).
-    dimension_separator : {'.', '/'}, optional
-        Separator placed between the dimensions of a chunk.
-    **open_kwargs
-        Keyword arguments to pass to the `open` function.
-
-    Examples
-    --------
-    Store a single array::
-
-        >>> import zarr
-        >>> store = zarr.v2.DBMStore('data/array.db')
-        >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
-        >>> z[...] = 42
-        >>> store.close()  # don't forget to call this when you're done
-
-    Store a group::
-
-        >>> store = zarr.v2.DBMStore('data/group.db')
-        >>> root = zarr.v2.group(store=store, overwrite=True)
-        >>> foo = root.create_group('foo')
-        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
-        >>> bar[...] = 42
-        >>> store.close()  # don't forget to call this when you're done
-
-    After modifying a DBMStore, the ``close()`` method must be called, otherwise
-    essential data may not be written to the underlying database file. The
-    DBMStore class also supports the context manager protocol, which ensures the
-    ``close()`` method is called on leaving the context, e.g.::
-
-        >>> with zarr.v2.DBMStore('data/array.db') as store:
-        ...     z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
-        ...     z[...] = 42
-        ...     # no need to call store.close()
-
-    A different database library can be used by passing a different function to
-    the `open` parameter. For example, if the `bsddb3
-    <https://www.jcea.es/programacion/pybsddb.htm>`_ package is installed, a
-    Berkeley DB database can be used::
-
-        >>> import bsddb3
-        >>> store = zarr.v2.DBMStore('data/array.bdb', open=bsddb3.btopen)
-        >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
-        >>> z[...] = 42
-        >>> store.close()
-
-    Notes
-    -----
-    Please note that, by default, this class will use the Python standard
-    library `dbm.open` function to open the database file (or `anydbm.open` on
-    Python 2). There are up to three different implementations of DBM-style
-    databases available in any Python installation, and which one is used may
-    vary from one system to another. Database file formats are not compatible
-    between these different implementations. Also, some implementations are
-    more efficient than others. In particular, the "dumb" implementation will be
-    the fall-back on many systems, and has very poor performance for some usage
-    scenarios. If you want to ensure a specific implementation is used, pass the
-    corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM
-    library.
-
-    Safe to write in multiple threads. May be safe to write in multiple processes,
-    depending on which DBM implementation is being used, although this has not been
-    tested.
-
-    """
-
-    def __init__(
-        self,
-        path,
-        flag="c",
-        mode=0o666,
-        open=None,
-        write_lock=True,
-        dimension_separator=None,
-        **open_kwargs,
-    ):
-        if open is None:
-            import dbm
-
-            open = dbm.open
-        path = os.path.abspath(path)
-        # noinspection PyArgumentList
-        self.db = open(path, flag, mode, **open_kwargs)
-        self.path = path
-        self.flag = flag
-        self.mode = mode
-        self.open = open
-        self.write_lock = write_lock
-        if write_lock:
-            # This may not be required as some dbm implementations manage their own
-            # locks, but err on the side of caution.
- self.write_mutex = Lock() - else: - self.write_mutex = nolock - self.open_kwargs = open_kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # needed for ndbm - except Exception: - # flush may fail if db has already been closed - pass - return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) - - def __setstate__(self, state): - path, flag, mode, open, write_lock, open_kws = state - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) - - def close(self): - """Closes the underlying database file.""" - if hasattr(self.db, "close"): - with self.write_mutex: - self.db.close() - - def flush(self): - """Synchronizes data to the underlying database file.""" - if self.flag[0] != "r": - with self.write_mutex: - if hasattr(self.db, "sync"): - self.db.sync() - else: # pragma: no cover - # we don't cover this branch anymore as ndbm (oracle) is not packaged - # by conda-forge on non-mac OS: - # https://github.com/conda-forge/staged-recipes/issues/4476 - # fall-back, close and re-open, needed for ndbm - flag = self.flag - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.db.close() - # noinspection PyArgumentList - self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return self.db[key] - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - value = ensure_bytes(value) - with self.write_mutex: - self.db[key] = value - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.write_mutex: - del self.db[key] - - def __eq__(self, other): - return ( - isinstance(other, DBMStore) - and self.path == other.path - and - # allow flag and mode to differ - self.open == other.open - and self.open_kwargs == other.open_kwargs - ) - - def keys(self): - return (ensure_text(k, "ascii") for k in iter(self.db.keys())) - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return key in self.db - - def rmdir(self, path: str = "") -> None: - path = normalize_storage_path(path) - _rmdir_from_keys(self, path) - - -class LMDBStore(Store): - """Storage class using LMDB. Requires the `lmdb `_ - package to be installed. - - - Parameters - ---------- - path : string - Location of database file. - buffers : bool, optional - If True (default) use support for buffers, which should increase performance by - reducing memory copies. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `lmdb.open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.v2.LMDBStore('data/array.mdb') - >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] 
= 42
-        >>> store.close()  # don't forget to call this when you're done
-
-    Store a group::
-
-        >>> store = zarr.v2.LMDBStore('data/group.mdb')
-        >>> root = zarr.v2.group(store=store, overwrite=True)
-        >>> foo = root.create_group('foo')
-        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
-        >>> bar[...] = 42
-        >>> store.close()  # don't forget to call this when you're done
-
-    After modifying an LMDBStore, the ``close()`` method must be called, otherwise
-    essential data may not be written to the underlying database file. The
-    LMDBStore class also supports the context manager protocol, which ensures the
-    ``close()`` method is called on leaving the context, e.g.::
-
-        >>> with zarr.v2.LMDBStore('data/array.mdb') as store:
-        ...     z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
-        ...     z[...] = 42
-        ...     # no need to call store.close()
-
-    Notes
-    -----
-    By default writes are not immediately flushed to disk to increase performance. You
-    can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods.
-
-    Should be safe to write in multiple threads or processes due to the synchronization
-    support within LMDB, although writing from multiple processes has not been tested.
-
-    """
-
-    def __init__(self, path, buffers=True, dimension_separator=None, **kwargs):
-        import lmdb
-
-        # set default memory map size to something larger than the lmdb default, which is
-        # very likely to be too small for any moderate array (logic copied from zict)
-        map_size = 2**40 if sys.maxsize >= 2**32 else 2**28
-        kwargs.setdefault("map_size", map_size)
-
-        # don't initialize buffers to zero by default, shouldn't be necessary
-        kwargs.setdefault("meminit", False)
-
-        # decide whether to use the writemap option based on the operating system's
-        # support for sparse files - writemap requires sparse file support otherwise
-        # the whole `map_size` may be reserved up front on disk (logic copied from zict)
-        writemap = sys.platform.startswith("linux")
-        kwargs.setdefault("writemap", writemap)
-
-        # decide options for when data are flushed to disk - choose to delay syncing
-        # data to filesystem, otherwise pay a large performance penalty (zict also does
-        # this)
-        kwargs.setdefault("metasync", False)
-        kwargs.setdefault("sync", False)
-        kwargs.setdefault("map_async", False)
-
-        # set default option for number of cached transactions
-        max_spare_txns = multiprocessing.cpu_count()
-        kwargs.setdefault("max_spare_txns", max_spare_txns)
-
-        # normalize path
-        path = os.path.abspath(path)
-
-        # open database
-        self.db = lmdb.open(path, **kwargs)
-
-        # store properties
-        self.buffers = buffers
-        self.path = path
-        self.kwargs = kwargs
-        self._dimension_separator = dimension_separator
-
-    def __getstate__(self):
-        try:
-            self.flush()  # just in case
-        except Exception:
-            # flush may fail if db has already been closed
-            pass
-        return self.path, self.buffers, self.kwargs
-
-    def __setstate__(self, state):
-        path, buffers, kwargs = state
-        self.__init__(path=path, buffers=buffers, **kwargs)
-
-    def close(self):
-        """Closes the underlying database."""
-        self.db.close()
-
-    def flush(self):
-        """Synchronizes data to the file system."""
-        self.db.sync()
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *args):
-        self.close()
-
-    def __getitem__(self, key):
-        if isinstance(key, str):
-            key = key.encode("ascii")
-        # use the buffers option, should avoid a memory copy
-        with self.db.begin(buffers=self.buffers) as txn:
-            value = txn.get(key)
-            if value is None:
-                raise KeyError(key)
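            # (editorial note, hedged) per the lmdb documentation, a value fetched
            # with buffers=True is a buffer object whose validity is tied to the
            # transaction; a caller that keeps the result beyond this call may
            # want bytes(value) to take a copy.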
return value - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True, buffers=self.buffers) as txn: - txn.put(key, value) - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True) as txn: - if not txn.delete(key): - raise KeyError(key) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - return cursor.set_key(key) - - def items(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k, v in cursor.iternext(keys=True, values=True): - yield ensure_text(k, "ascii"), v - - def keys(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k in cursor.iternext(keys=True, values=False): - yield ensure_text(k, "ascii") - - def values(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - yield from cursor.iternext(keys=False, values=True) - - def __iter__(self): - return self.keys() - - def __len__(self): - return self.db.stat()["entries"] - - -class LRUStoreCache(Store): - """Storage class that implements a least-recently-used (LRU) cache layer over - some other store. Intended primarily for use with stores that can be slow to - access, e.g., remote stores that require network communication to store and - retrieve data. - - Parameters - ---------- - store : Store - The store containing the actual data to be cached. - max_size : int - The maximum size that the cache may grow to, in number of bytes. Provide `None` - if you would like the cache to have unlimited size. - - Examples - -------- - The example below wraps an S3 store with an LRU cache:: - - >>> import s3fs - >>> import zarr - >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) - >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) - >>> cache = zarr.v2.LRUStoreCache(store, max_size=2**28) - >>> root = zarr.v2.group(store=cache) # doctest: +REMOTE_DATA - >>> z = root['foo/bar/baz'] # doctest: +REMOTE_DATA - >>> from timeit import timeit - >>> # first data access is relatively slow, retrieved from store - ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP - b'Hello from the cloud!' - 0.1081731989979744 - >>> # second data access is faster, uses cache - ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP - b'Hello from the cloud!' 
- 0.0009490990014455747 - - """ - - def __init__(self, store: StoreLike, max_size: int): - self._store: BaseStore = BaseStore._ensure_store(store) - self._max_size = max_size - self._current_size = 0 - self._keys_cache = None - self._contains_cache: Dict[Any, Any] = {} - self._listdir_cache: Dict[Path, Any] = dict() - self._values_cache: Dict[Path, Any] = OrderedDict() - self._mutex = Lock() - self.hits = self.misses = 0 - - def __getstate__(self): - return ( - self._store, - self._max_size, - self._current_size, - self._keys_cache, - self._contains_cache, - self._listdir_cache, - self._values_cache, - self.hits, - self.misses, - ) - - def __setstate__(self, state): - ( - self._store, - self._max_size, - self._current_size, - self._keys_cache, - self._contains_cache, - self._listdir_cache, - self._values_cache, - self.hits, - self.misses, - ) = state - self._mutex = Lock() - - def __len__(self): - return len(self._keys()) - - def __iter__(self): - return self.keys() - - def __contains__(self, key): - with self._mutex: - if key not in self._contains_cache: - self._contains_cache[key] = key in self._store - return self._contains_cache[key] - - def clear(self): - self._store.clear() - self.invalidate() - - def keys(self): - with self._mutex: - return iter(self._keys()) - - def _keys(self): - if self._keys_cache is None: - self._keys_cache = list(self._store.keys()) - return self._keys_cache - - def listdir(self, path: Path = None): - with self._mutex: - try: - return self._listdir_cache[path] - except KeyError: - listing = listdir(self._store, path) - self._listdir_cache[path] = listing - return listing - - def getsize(self, path=None) -> int: - return getsize(self._store, path=path) - - def _pop_value(self): - # remove the first value from the cache, as this will be the least recently - # used value - _, v = self._values_cache.popitem(last=False) - return v - - def _accommodate_value(self, value_size): - if self._max_size is None: - return - # ensure there is enough space in the cache for a new value - while self._current_size + value_size > self._max_size: - v = self._pop_value() - self._current_size -= buffer_size(v) - - def _cache_value(self, key: Path, value): - # cache a value - value_size = buffer_size(value) - # check size of the value against max size, as if the value itself exceeds max - # size then we are never going to cache it - if self._max_size is None or value_size <= self._max_size: - self._accommodate_value(value_size) - self._values_cache[key] = value - self._current_size += value_size - - def invalidate(self): - """Completely clear the cache.""" - with self._mutex: - self._values_cache.clear() - self._invalidate_keys() - self._current_size = 0 - - def invalidate_values(self): - """Clear the values cache.""" - with self._mutex: - self._values_cache.clear() - - def invalidate_keys(self): - """Clear the keys cache.""" - with self._mutex: - self._invalidate_keys() - - def _invalidate_keys(self): - self._keys_cache = None - self._contains_cache.clear() - self._listdir_cache.clear() - - def _invalidate_value(self, key): - if key in self._values_cache: - value = self._values_cache.pop(key) - self._current_size -= buffer_size(value) - - def __getitem__(self, key): - try: - # first try to obtain the value from the cache - with self._mutex: - value = self._values_cache[key] - # cache hit if no KeyError is raised - self.hits += 1 - # treat the end as most recently used - self._values_cache.move_to_end(key) - - except KeyError: - # cache miss, retrieve value from the store - value = 
self._store[key] - with self._mutex: - self.misses += 1 - # need to check if key is not in the cache, as it may have been cached - # while we were retrieving the value from the store - if key not in self._values_cache: - self._cache_value(key, value) - - return value - - def __setitem__(self, key, value): - self._store[key] = value - with self._mutex: - self._invalidate_keys() - self._invalidate_value(key) - self._cache_value(key, value) - - def __delitem__(self, key): - del self._store[key] - with self._mutex: - self._invalidate_keys() - self._invalidate_value(key) - - -class SQLiteStore(Store): - """Storage class using SQLite. - - Parameters - ---------- - path : string - Location of database file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `sqlite3.connect` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.v2.SQLiteStore('data/array.sqldb') - >>> z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.v2.SQLiteStore('data/group.sqldb') - >>> root = zarr.v2.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - """ - - def __init__(self, path, dimension_separator=None, **kwargs): - import sqlite3 - - self._dimension_separator = dimension_separator - - # normalize path - if path != ":memory:": - path = os.path.abspath(path) - - # store properties - self.path = path - self.kwargs = kwargs - - # allow threading if SQLite connections are thread-safe - # - # ref: https://www.sqlite.org/releaselog/3_3_1.html - # ref: https://github.com/python/cpython/issues/71377 - check_same_thread = True - if sqlite3.sqlite_version_info >= (3, 3, 1): - check_same_thread = False - - # keep a lock for serializing mutable operations - self.lock = Lock() - - # open database - self.db = sqlite3.connect( - self.path, - detect_types=0, - isolation_level=None, - check_same_thread=check_same_thread, - **self.kwargs, - ) - - # handle keys as `str`s - self.db.text_factory = str - - # get a cursor to read/write to the database - self.cursor = self.db.cursor() - - # initialize database with our table if missing - with self.lock: - self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") - - def __getstate__(self): - if self.path == ":memory:": - raise PicklingError("Cannot pickle in-memory SQLite databases") - return self.path, self.kwargs - - def __setstate__(self, state): - path, kwargs = state - self.__init__(path=path, **kwargs) - - def close(self): - """Closes the underlying database.""" - - # close cursor and db objects - self.cursor.close() - self.db.close() - - def __getitem__(self, key): - value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) - for (v,) in value: - return v - raise KeyError(key) - - def __setitem__(self, key, value): - self.update({key: value}) - - def __delitem__(self, key): - with self.lock: - self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) - if self.cursor.rowcount < 1: - raise KeyError(key) - - def __contains__(self, key): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) - for (has,) in cs: - has = bool(has) - return has - - def items(self): - kvs = 
self.cursor.execute("SELECT k, v FROM zarr") - yield from kvs - - def keys(self): - ks = self.cursor.execute("SELECT k FROM zarr") - for (k,) in ks: - yield k - - def values(self): - vs = self.cursor.execute("SELECT v FROM zarr") - for (v,) in vs: - yield v - - def __iter__(self): - return self.keys() - - def __len__(self): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") - for (c,) in cs: - return c - - def update(self, *args, **kwargs): - args += (kwargs,) - - kv_list = [] - for dct in args: - for k, v in dct.items(): - v = ensure_contiguous_ndarray_like(v) - - # Accumulate key-value pairs for storage - kv_list.append((k, v)) - - with self.lock: - self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) - - def listdir(self, path=None): - path = normalize_storage_path(path) - sep = "_" if path == "" else "/" - keys = self.cursor.execute( - """ - SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( - SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m - FROM zarr WHERE k LIKE (? || "{sep}%") - ) ORDER BY l ASC - """.format(sep=sep), - (path, path), - ) - keys = list(map(operator.itemgetter(0), keys)) - return keys - - def getsize(self, path=None): - path = normalize_storage_path(path) - size = self.cursor.execute( - """ - SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") - """, - (path, path), - ) - for (s,) in size: - return s - - def rmdir(self, path=None): - path = normalize_storage_path(path) - if path: - with self.lock: - self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (path,)) - else: - self.clear() - - def clear(self): - with self.lock: - self.cursor.executescript( - """ - BEGIN TRANSACTION; - DROP TABLE zarr; - CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); - COMMIT TRANSACTION; - """ - ) - - -class MongoDBStore(Store): - """Storage class using MongoDB. - - .. note:: This is an experimental feature. - - Requires the `pymongo `_ - package to be installed. - - Parameters - ---------- - database : string - Name of database - collection : string - Name of collection - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `pymongo.MongoClient` function. - - Notes - ----- - The maximum chunksize in MongoDB documents is 16 MB. 
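For a sense of the document shape used by this store: one document per key, written with upsert semantics. Roughly (a pymongo-only sketch; it assumes a local ``mongod`` is reachable, and the database/collection names are just the defaults used below):

    import pymongo

    client = pymongo.MongoClient()
    coll = client.get_database("mongodb_zarr").get_collection("zarr_collection")

    # upsert one chunk, keyed the same way MongoDBStore keys it
    coll.replace_one({"key": "foo/0.0"}, {"key": "foo/0.0", "value": b"\x00" * 16}, upsert=True)
    assert coll.find_one({"key": "foo/0.0"})["value"] == b"\x00" * 16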
- - """ - - _key = "key" - _value = "value" - - def __init__( - self, - database="mongodb_zarr", - collection="zarr_collection", - dimension_separator=None, - **kwargs, - ): - import pymongo - - self._database = database - self._collection = collection - self._dimension_separator = dimension_separator - self._kwargs = kwargs - - self.client = pymongo.MongoClient(**self._kwargs) - self.db = self.client.get_database(self._database) - self.collection = self.db.get_collection(self._collection) - - def __getitem__(self, key): - doc = self.collection.find_one({self._key: key}) - - if doc is None: - raise KeyError(key) - else: - return doc[self._value] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.collection.replace_one( - {self._key: key}, {self._key: key, self._value: value}, upsert=True - ) - - def __delitem__(self, key): - result = self.collection.delete_many({self._key: key}) - if not result.deleted_count == 1: - raise KeyError(key) - - def __iter__(self): - for f in self.collection.find({}): - yield f[self._key] - - def __len__(self): - return self.collection.count_documents({}) - - def __getstate__(self): - return self._database, self._collection, self._kwargs - - def __setstate__(self, state): - database, collection, kwargs = state - self.__init__(database=database, collection=collection, **kwargs) - - def close(self): - """Cleanup client resources and disconnect from MongoDB.""" - self.client.close() - - def clear(self): - """Remove all items from store.""" - self.collection.delete_many({}) - - -class RedisStore(Store): - """Storage class using Redis. - - .. note:: This is an experimental feature. - - Requires the `redis `_ - package to be installed. - - Parameters - ---------- - prefix : string - Name of prefix for Redis keys - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `redis.Redis` function. - - """ - - def __init__(self, prefix="zarr", dimension_separator=None, **kwargs): - import redis - - self._prefix = prefix - self._kwargs = kwargs - self._dimension_separator = dimension_separator - - self.client = redis.Redis(**kwargs) - - def _key(self, key): - return "{prefix}:{key}".format(prefix=self._prefix, key=key) - - def __getitem__(self, key): - return self.client[self._key(key)] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.client[self._key(key)] = value - - def __delitem__(self, key): - count = self.client.delete(self._key(key)) - if not count: - raise KeyError(key) - - def keylist(self): - offset = len(self._key("")) # length of prefix - return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] - - def keys(self): - yield from self.keylist() - - def __iter__(self): - yield from self.keys() - - def __len__(self): - return len(self.keylist()) - - def __getstate__(self): - return self._prefix, self._kwargs - - def __setstate__(self, state): - prefix, kwargs = state - self.__init__(prefix=prefix, **kwargs) - - def clear(self): - for key in self.keys(): - del self[key] - - -class ConsolidatedMetadataStore(Store): - """A layer over other storage, where the metadata has been consolidated into - a single key. - - The purpose of this class, is to be able to get all of the metadata for - a given array in a single read operation from the underlying storage. - See :func:`zarr.v2.convenience.consolidate_metadata` for how to create this - single metadata key. 
- - This class loads from the one key, and stores the data in a dict, so that - accessing the keys no longer requires operations on the backend store. - - This class is read-only, and attempts to change the array metadata will - fail, but changing the data is possible. If the backend storage is changed - directly, then the metadata stored here could become obsolete, and - :func:`zarr.v2.convenience.consolidate_metadata` should be called again and the class - re-invoked. The use case is for write once, read many times. - - .. versionadded:: 2.3 - - .. note:: This is an experimental feature. - - Parameters - ---------- - store: Store - Containing the zarr array. - metadata_key: str - The target in the store where all of the metadata are stored. We - assume JSON encoding. - - See Also - -------- - zarr.v2.convenience.consolidate_metadata, zarr.v2.convenience.open_consolidated - - """ - - def __init__(self, store: StoreLike, metadata_key=".zmetadata"): - self.store = Store._ensure_store(store) - - # retrieve consolidated metadata - meta = json_loads(self.store[metadata_key]) - - # check format of consolidated metadata - consolidated_format = meta.get("zarr_consolidated_format", None) - if consolidated_format != 1: - raise MetadataError( - "unsupported zarr consolidated metadata format: %s" % consolidated_format - ) - - # decode metadata - self.meta_store: Store = KVStore(meta["metadata"]) - - def __getitem__(self, key): - return self.meta_store[key] - - def __contains__(self, item): - return item in self.meta_store - - def __iter__(self): - return iter(self.meta_store) - - def __len__(self): - return len(self.meta_store) - - def __delitem__(self, key): - raise ReadOnlyError - - def __setitem__(self, key, value): - raise ReadOnlyError - - def getsize(self, path): - return getsize(self.meta_store, path) - - def listdir(self, path): - return listdir(self.meta_store, path) diff --git a/src/zarr/v2/sync.py b/src/zarr/v2/sync.py deleted file mode 100644 index 49684a51ee..0000000000 --- a/src/zarr/v2/sync.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -from collections import defaultdict -from threading import Lock - -import fasteners - - -class ThreadSynchronizer: - """Provides synchronization using thread locks.""" - - def __init__(self): - self.mutex = Lock() - self.locks = defaultdict(Lock) - - def __getitem__(self, item): - with self.mutex: - return self.locks[item] - - def __getstate__(self): - return True - - def __setstate__(self, *args): - # reinitialize from scratch - self.__init__() - - -class ProcessSynchronizer: - """Provides synchronization using file locks via the - `fasteners `_ - package. - - Parameters - ---------- - path : string - Path to a directory on a file system that is shared by all processes. - N.B., this should be a *different* path to where you store the array. 
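To make the intended usage concrete: the synchronizer is passed to the array alongside the store, and its lock files live under their own path (a sketch; paths are illustrative):

    import zarr.v2 as zarr

    synchronizer = zarr.ProcessSynchronizer("data/example.sync")
    z = zarr.open_array(
        "data/example.zarr",
        mode="a",
        shape=(100, 100),
        chunks=(10, 10),
        synchronizer=synchronizer,  # lock files live here, apart from the array data
    )
    z[:, :] = 0  # chunk-level locks serialize concurrent writers across processes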
- - """ - - def __init__(self, path): - self.path = path - - def __getitem__(self, item): - path = os.path.join(self.path, item) - lock = fasteners.InterProcessLock(path) - return lock - - # pickling and unpickling should be handled automatically diff --git a/src/zarr/v2/util.py b/src/zarr/v2/util.py deleted file mode 100644 index 7e3bd788ec..0000000000 --- a/src/zarr/v2/util.py +++ /dev/null @@ -1,788 +0,0 @@ -import inspect -import json -import math -import numbers -from textwrap import TextWrapper -import mmap -import time -from typing import ( - Any, - Callable, - Dict, - Iterator, - Mapping, - Optional, - Tuple, - TypeVar, - Union, - Iterable, - cast, -) - -import numpy as np -import numpy.typing as npt -from asciitree import BoxStyle, LeftAligned -from asciitree.traversal import Traversal -from numcodecs.compat import ( - ensure_text, - ensure_ndarray_like, - ensure_bytes, - ensure_contiguous_ndarray_like, -) -from numcodecs.ndarray_like import NDArrayLike -from numcodecs.registry import codec_registry -from numcodecs.blosc import cbuffer_sizes, cbuffer_metainfo - -KeyType = TypeVar("KeyType") -ValueType = TypeVar("ValueType") - - -def flatten(arg: Iterable[Any]) -> Iterable[Any]: - for element in arg: - if isinstance(element, Iterable) and not isinstance(element, (str, bytes)): - yield from flatten(element) - else: - yield element - - -# codecs to use for object dtype convenience API -object_codecs = { - str.__name__: "vlen-utf8", - bytes.__name__: "vlen-bytes", - "array": "vlen-array", -} - - -class NumberEncoder(json.JSONEncoder): - def default(self, o): - # See json.JSONEncoder.default docstring for explanation - # This is necessary to encode numpy dtype - if isinstance(o, numbers.Integral): - return int(o) - if isinstance(o, numbers.Real): - return float(o) - return json.JSONEncoder.default(self, o) - - -def json_dumps(o: Any) -> bytes: - """Write JSON in a consistent, human-readable way.""" - return json.dumps( - o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder - ).encode("ascii") - - -def json_loads(s: Union[bytes, str]) -> Dict[str, Any]: - """Read JSON in a consistent way.""" - return json.loads(ensure_text(s, "utf-8")) - - -def normalize_shape(shape: Union[int, Tuple[int, ...], None]) -> Tuple[int, ...]: - """Convenience function to normalize the `shape` argument.""" - - if shape is None: - raise TypeError("shape is None") - - # handle 1D convenience form - if isinstance(shape, numbers.Integral): - shape = (int(shape),) - - # normalize - shape = cast(Tuple[int, ...], shape) - shape = tuple(int(s) for s in shape) - return shape - - -# code to guess chunk shape, adapted from h5py - -CHUNK_BASE = 256 * 1024 # Multiplier by which chunks are adjusted -CHUNK_MIN = 128 * 1024 # Soft lower limit (128k) -CHUNK_MAX = 64 * 1024 * 1024 # Hard upper limit - - -def guess_chunks(shape: Tuple[int, ...], typesize: int) -> Tuple[int, ...]: - """ - Guess an appropriate chunk layout for an array, given its shape and - the size of each element in bytes. Will allocate chunks only as large - as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of - each axis, slightly favoring bigger values for the last index. - Undocumented and subject to change without warning. - """ - - ndims = len(shape) - # require chunks to have non-zero length for all dimensions - chunks = np.maximum(np.array(shape, dtype="=f8"), 1) - - # Determine the optimal chunk size in bytes using a PyTables expression. - # This is kept as a float. 
- dset_size = np.prod(chunks) * typesize - target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024))) - - if target_size > CHUNK_MAX: - target_size = CHUNK_MAX - elif target_size < CHUNK_MIN: - target_size = CHUNK_MIN - - idx = 0 - while True: - # Repeatedly loop over the axes, dividing them by 2. Stop when: - # 1a. We're smaller than the target chunk size, OR - # 1b. We're within 50% of the target chunk size, AND - # 2. The chunk is smaller than the maximum chunk size - - chunk_bytes = np.prod(chunks) * typesize - - if ( - chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5 - ) and chunk_bytes < CHUNK_MAX: - break - - if np.prod(chunks) == 1: - break # Element size larger than CHUNK_MAX - - chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0) - idx += 1 - - return tuple(int(x) for x in chunks) - - -def normalize_chunks(chunks: Any, shape: Tuple[int, ...], typesize: int) -> Tuple[int, ...]: - """Convenience function to normalize the `chunks` argument for an array - with the given `shape`.""" - - # N.B., expect shape already normalized - - # handle auto-chunking - if chunks is None or chunks is True: - return guess_chunks(shape, typesize) - - # handle no chunking - if chunks is False: - return shape - - # handle 1D convenience form - if isinstance(chunks, numbers.Integral): - chunks = tuple(int(chunks) for _ in shape) - - # handle bad dimensionality - if len(chunks) > len(shape): - raise ValueError("too many dimensions in chunks") - - # handle underspecified chunks - if len(chunks) < len(shape): - # assume chunks across remaining dimensions - chunks += shape[len(chunks) :] - - # handle None or -1 in chunks - if -1 in chunks or None in chunks: - chunks = tuple(s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks)) - - chunks = tuple(int(c) for c in chunks) - return chunks - - -def normalize_dtype(dtype: Union[str, npt.DTypeLike], object_codec) -> Tuple[np.dtype[Any], Any]: - # convenience API for object arrays - if inspect.isclass(dtype): - dtype = dtype.__name__ - if isinstance(dtype, str): - # allow ':' to delimit class from codec arguments - tokens = dtype.split(":") - key = tokens[0] - if key in object_codecs: - dtype = np.dtype(object) - if object_codec is None: - codec_id = object_codecs[key] - if len(tokens) > 1: - args = tokens[1].split(",") - else: - args = [] - try: - object_codec = codec_registry[codec_id](*args) - except KeyError: # pragma: no cover - raise ValueError( - "codec %r for object type %r is not " - "available; please provide an " - "object_codec manually" % (codec_id, key) - ) - return dtype, object_codec - - dtype = np.dtype(dtype) - - # don't allow generic datetime64 or timedelta64, require units to be specified - if dtype == np.dtype("M8") or dtype == np.dtype("m8"): - raise ValueError( - "datetime64 and timedelta64 dtypes with generic units " - 'are not supported, please specify units (e.g., "M8[ns]")' - ) - - return dtype, object_codec - - -# noinspection PyTypeChecker -def is_total_slice(item, shape: Tuple[int]) -> bool: - """Determine whether `item` specifies a complete slice of array with the - given `shape`. 
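A brief illustration of ``normalize_chunks`` above, whose conveniences are easy to miss (illustrative values, assuming the function as defined in this module):

    from zarr.v2.util import normalize_chunks

    normalize_chunks(500, (1000, 1000), typesize=8)        # -> (500, 500): an int is broadcast
    normalize_chunks((100,), (1000, 1000), typesize=8)     # -> (100, 1000): missing dims filled from shape
    normalize_chunks((100, -1), (1000, 1000), typesize=8)  # -> (100, 1000): -1 or None means the whole dimension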
Used to optimize __setitem__ operations on the Chunk - class.""" - - # N.B., assume shape is normalized - - if item == Ellipsis: - return True - if item == slice(None): - return True - if isinstance(item, slice): - item = (item,) - if isinstance(item, tuple): - return all( - ( - isinstance(it, slice) - and ((it == slice(None)) or ((it.stop - it.start == sh) and (it.step in [1, None]))) - ) - for it, sh in zip(item, shape) - ) - else: - raise TypeError("expected slice or tuple of slices, found %r" % item) - - -def normalize_resize_args(old_shape, *args): - # normalize new shape argument - if len(args) == 1: - new_shape = args[0] - else: - new_shape = args - if isinstance(new_shape, int): - new_shape = (new_shape,) - else: - new_shape = tuple(new_shape) - if len(new_shape) != len(old_shape): - raise ValueError("new shape must have same number of dimensions") - - # handle None in new_shape - new_shape = tuple(s if n is None else int(n) for s, n in zip(old_shape, new_shape)) - - return new_shape - - -def human_readable_size(size) -> str: - if size < 2**10: - return "%s" % size - elif size < 2**20: - return "%.1fK" % (size / float(2**10)) - elif size < 2**30: - return "%.1fM" % (size / float(2**20)) - elif size < 2**40: - return "%.1fG" % (size / float(2**30)) - elif size < 2**50: - return "%.1fT" % (size / float(2**40)) - else: - return "%.1fP" % (size / float(2**50)) - - -def normalize_order(order: str) -> str: - order = str(order).upper() - if order not in ["C", "F"]: - raise ValueError("order must be either 'C' or 'F', found: %r" % order) - return order - - -def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]: - if sep in (".", "/", None): - return sep - else: - raise ValueError("dimension_separator must be either '.' or '/', found: %r" % sep) - - -def normalize_fill_value(fill_value, dtype: np.dtype[Any]): - if fill_value is None or dtype.hasobject: - # no fill value - pass - elif not isinstance(fill_value, np.void) and fill_value == 0: - # this should be compatible across numpy versions for any array type, including - # structured arrays - fill_value = np.zeros((), dtype=dtype)[()] - - elif dtype.kind == "U": - # special case unicode because of encoding issues on Windows if passed through numpy - # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 - - if not isinstance(fill_value, str): - raise ValueError( - "fill_value {!r} is not valid for dtype {}; must be a unicode string".format( - fill_value, dtype - ) - ) - - else: - try: - if isinstance(fill_value, bytes) and dtype.kind == "V": - # special case for numpy 1.14 compatibility - fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] - else: - fill_value = np.array(fill_value, dtype=dtype)[()] - - except Exception as e: - # re-raise with our own error message to be helpful - raise ValueError( - "fill_value {!r} is not valid for dtype {}; nested exception: {}".format( - fill_value, dtype, e - ) - ) - - return fill_value - - -def normalize_storage_path(path: Union[str, bytes, None]) -> str: - # handle bytes - if isinstance(path, bytes): - path = str(path, "ascii") - - # ensure str - if path is not None and not isinstance(path, str): - path = str(path) - - if path: - # convert backslash to forward slash - path = path.replace("\\", "/") - - # ensure no leading slash - while len(path) > 0 and path[0] == "/": - path = path[1:] - - # ensure no trailing slash - while len(path) > 0 and path[-1] == "/": - path = path[:-1] - - # collapse any repeated slashes - previous_char = None - collapsed = "" - 
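    # (editorial aside) net effect of this function, for reference:
    #   normalize_storage_path("/foo//bar/") -> "foo/bar"   (slashes stripped and collapsed)
    #   normalize_storage_path(b"foo\\bar")  -> "foo/bar"   (bytes decoded, backslashes converted)
    #   normalize_storage_path("foo/../bar") -> ValueError  ('.'/'..' segments rejected)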
for char in path: - if char == "/" and previous_char == "/": - pass - else: - collapsed += char - previous_char = char - path = collapsed - - # don't allow path segments with just '.' or '..' - segments = path.split("/") - if any(s in {".", ".."} for s in segments): - raise ValueError("path containing '.' or '..' segment not allowed") - - else: - path = "" - - return path - - -def buffer_size(v) -> int: - return ensure_ndarray_like(v).nbytes - - -def info_text_report(items: Dict[Any, Any]) -> str: - keys = [k for k, v in items] - max_key_len = max(len(k) for k in keys) - report = "" - for k, v in items: - wrapper = TextWrapper( - width=80, - initial_indent=k.ljust(max_key_len) + " : ", - subsequent_indent=" " * max_key_len + " : ", - ) - text = wrapper.fill(str(v)) - report += text + "\n" - return report - - -def info_html_report(items) -> str: - report = '<table class="zarr-info">' - report += "<tbody>" - for k, v in items: - report += ( - "<tr>" - '<th style="text-align: left">%s</th>' - '<td style="text-align: left">%s</td>' - "</tr>" % (k, v) - ) - report += "</tbody>" - report += "</table>
" - return report - - -class InfoReporter: - def __init__(self, obj): - self.obj = obj - - def __repr__(self): - items = self.obj.info_items() - return info_text_report(items) - - def _repr_html_(self): - items = self.obj.info_items() - return info_html_report(items) - - -class TreeNode: - def __init__(self, obj, depth=0, level=None): - self.obj = obj - self.depth = depth - self.level = level - - def get_children(self): - if hasattr(self.obj, "values") and (self.level is None or self.depth < self.level): - depth = self.depth + 1 - return [TreeNode(o, depth=depth, level=self.level) for o in self.obj.values()] - return [] - - def get_text(self): - name = self.obj.name.split("/")[-1] or "/" - if hasattr(self.obj, "shape"): - name += " {} {}".format(self.obj.shape, self.obj.dtype) - return name - - def get_type(self): - return type(self.obj).__name__ - - -class TreeTraversal(Traversal): # type: ignore[misc] - def get_children(self, node): - return node.get_children() - - def get_root(self, tree): - return tree - - def get_text(self, node): - return node.get_text() - - -tree_group_icon = "folder" -tree_array_icon = "table" - - -def tree_get_icon(stype: str) -> str: - if stype == "Array": - return tree_array_icon - elif stype == "Group": - return tree_group_icon - else: - raise ValueError("Unknown type: %s" % stype) - - -def tree_widget_sublist(node, root=False, expand=False): - import ipytree - - result = ipytree.Node() - result.icon = tree_get_icon(node.get_type()) - if root or (expand is True) or (isinstance(expand, int) and node.depth < expand): - result.opened = True - else: - result.opened = False - result.name = node.get_text() - result.nodes = [tree_widget_sublist(c, expand=expand) for c in node.get_children()] - result.disabled = True - - return result - - -def tree_widget(group, expand, level): - try: - import ipytree - except ImportError as error: - raise ImportError( - "{}: Run `pip install zarr[jupyter]` or `conda install ipytree`" - "to get the required ipytree dependency for displaying the tree " - "widget. If using jupyterlab<3, you also need to run " - "`jupyter labextension install ipytree`".format(error) - ) - - result = ipytree.Tree() - root = TreeNode(group, level=level) - result.add_node(tree_widget_sublist(root, root=True, expand=expand)) - - return result - - -class TreeViewer: - def __init__(self, group, expand=False, level=None): - self.group = group - self.expand = expand - self.level = level - - self.text_kwargs = dict(horiz_len=2, label_space=1, indent=1) - - self.bytes_kwargs = dict( - UP_AND_RIGHT="+", HORIZONTAL="-", VERTICAL="|", VERTICAL_AND_RIGHT="+" - ) - - self.unicode_kwargs = dict( - UP_AND_RIGHT="\u2514", - HORIZONTAL="\u2500", - VERTICAL="\u2502", - VERTICAL_AND_RIGHT="\u251c", - ) - - def __bytes__(self): - drawer = LeftAligned( - traverse=TreeTraversal(), draw=BoxStyle(gfx=self.bytes_kwargs, **self.text_kwargs) - ) - root = TreeNode(self.group, level=self.level) - result = drawer(root) - - # Unicode characters slip in on Python 3. - # So we need to straighten that out first. 
- result = result.encode() - - return result - - def __unicode__(self): - drawer = LeftAligned( - traverse=TreeTraversal(), draw=BoxStyle(gfx=self.unicode_kwargs, **self.text_kwargs) - ) - root = TreeNode(self.group, level=self.level) - return drawer(root) - - def __repr__(self): - return self.__unicode__() - - def _repr_mimebundle_(self, **kwargs): - tree = tree_widget(self.group, expand=self.expand, level=self.level) - return tree._repr_mimebundle_(**kwargs) - - -def check_array_shape(param, array, shape): - if not hasattr(array, "shape"): - raise TypeError( - "parameter {!r}: expected an array-like object, got {!r}".format(param, type(array)) - ) - if array.shape != shape: - raise ValueError( - "parameter {!r}: expected array with shape {!r}, got {!r}".format( - param, shape, array.shape - ) - ) - - -def is_valid_python_name(name): - from keyword import iskeyword - - return name.isidentifier() and not iskeyword(name) - - -class NoLock: - """A lock that doesn't lock.""" - - def __enter__(self): - pass - - def __exit__(self, *args): - pass - - -nolock = NoLock() - - -class PartialReadBuffer: - def __init__(self, store_key, chunk_store): - self.chunk_store = chunk_store - # is it fsstore or an actual fsspec map object - assert hasattr(self.chunk_store, "map") - self.map = self.chunk_store.map - self.fs = self.chunk_store.fs - self.store_key = store_key - self.buff = None - self.nblocks = None - self.start_points = None - self.n_per_block = None - self.start_points_max = None - self.read_blocks = set() - - _key_path = self.map._key_to_str(store_key) - _key_path = _key_path.split("/") - _chunk_path = [self.chunk_store._normalize_key(_key_path[-1])] - _key_path = "/".join(_key_path[:-1] + _chunk_path) - self.key_path = _key_path - - def prepare_chunk(self): - assert self.buff is None - header = self.fs.read_block(self.key_path, 0, 16) - nbytes, self.cbytes, blocksize = cbuffer_sizes(header) - typesize, _shuffle, _memcpyd = cbuffer_metainfo(header) - self.buff = mmap.mmap(-1, self.cbytes) - self.buff[0:16] = header - self.nblocks = nbytes / blocksize - self.nblocks = ( - int(self.nblocks) if self.nblocks == int(self.nblocks) else int(self.nblocks + 1) - ) - if self.nblocks == 1: - self.buff = self.read_full() - return - start_points_buffer = self.fs.read_block(self.key_path, 16, int(self.nblocks * 4)) - self.start_points = np.frombuffer(start_points_buffer, count=self.nblocks, dtype=np.int32) - self.start_points_max = self.start_points.max() - self.buff[16 : (16 + (self.nblocks * 4))] = start_points_buffer - self.n_per_block = blocksize / typesize - - def read_part(self, start, nitems): - assert self.buff is not None - if self.nblocks == 1: - return - start_block = int(start / self.n_per_block) - wanted_decompressed = 0 - while wanted_decompressed < nitems: - if start_block not in self.read_blocks: - start_byte = self.start_points[start_block] - if start_byte == self.start_points_max: - stop_byte = self.cbytes - else: - stop_byte = self.start_points[self.start_points > start_byte].min() - length = stop_byte - start_byte - data_buff = self.fs.read_block(self.key_path, start_byte, length) - self.buff[start_byte:stop_byte] = data_buff - self.read_blocks.add(start_block) - if wanted_decompressed == 0: - wanted_decompressed += ((start_block + 1) * self.n_per_block) - start - else: - wanted_decompressed += self.n_per_block - start_block += 1 - - def read_full(self): - return self.chunk_store[self.store_key] - - -class UncompressedPartialReadBufferV3: - def __init__(self, store_key, chunk_store, 
itemsize): - assert chunk_store.supports_efficient_get_partial_values - self.chunk_store = chunk_store - self.store_key = store_key - self.itemsize = itemsize - - def prepare_chunk(self): - pass - - def read_part(self, start, nitems): - return self.chunk_store.get_partial_values( - [(self.store_key, (start * self.itemsize, nitems * self.itemsize))] - )[0] - - def read_full(self): - return self.chunk_store[self.store_key] - - -def retry_call( - callabl: Callable[..., Any], - args=None, - kwargs=None, - exceptions: Tuple[Any, ...] = (), - retries: int = 10, - wait: float = 0.1, -) -> Any: - """ - Make several attempts to invoke the callable. If one of the given exceptions - is raised, wait the given period of time and retry up to the given number of - retries. - """ - - if args is None: - args = () - if kwargs is None: - kwargs = {} - - for attempt in range(1, retries + 1): - try: - return callabl(*args, **kwargs) - except exceptions: - if attempt < retries: - time.sleep(wait) - else: - raise - - -def all_equal(value: Any, array: Any): - """ - Test if all the elements of an array are equivalent to a value. - If `value` is None, then this function does not do any comparison and - returns False. - """ - - if value is None: - return False - if not value: - # if `value` is falsey, then just 1 truthy value in `array` - # is sufficient to return False. We assume here that np.any is - # optimized to return on the first truthy value in `array`. - try: - return not np.any(array) - except (TypeError, ValueError): # pragma: no cover - pass - if np.issubdtype(array.dtype, np.object_): - # we have to flatten the result of np.equal to handle outputs like - # [np.array([True,True]), True, True] - return all(flatten(np.equal(value, array, dtype=array.dtype))) - else: - # Numpy errors if you call np.isnan on custom dtypes, so ensure - # we are working with floats before calling isnan - if np.issubdtype(array.dtype, np.floating) and np.isnan(value): - return np.all(np.isnan(array)) - else: - # using == raises warnings from numpy deprecated pattern, but - # using np.equal() raises type errors for structured dtypes... - return np.all(value == array) - - -def ensure_contiguous_ndarray_or_bytes(buf) -> Union[NDArrayLike, bytes]: - """Convenience function to coerce `buf` to ndarray-like array or bytes. - - First check if `buf` can be zero-copy converted to a contiguous array. - If not, `buf` will be copied to a newly allocated `bytes` object. - - Parameters - ---------- - buf : ndarray-like, array-like, or bytes-like - A numpy array like object such as numpy.ndarray, cupy.ndarray, or - any object exporting a buffer interface. - - Returns - ------- - arr : NDArrayLike or bytes - A ndarray-like or bytes object - """ - - try: - return ensure_contiguous_ndarray_like(buf) - except TypeError: - # An error is raised if `buf` couldn't be zero-copy converted - return ensure_bytes(buf) - - -class ConstantMap(Mapping[KeyType, ValueType]): - """A read-only map that maps all keys to the same constant value - - Useful if you want to call `getitems()` with the same context for all keys. - - Parameters - ---------- - keys - The keys of the map. Will be copied to a frozenset if it isn't already. - constant - The constant that all keys are mapping to. 
- """ - - def __init__(self, keys: Iterable[KeyType], constant: ValueType) -> None: - self._keys = keys if isinstance(keys, frozenset) else frozenset(keys) - self._constant = constant - - def __getitem__(self, key: KeyType) -> ValueType: - if key not in self._keys: - raise KeyError(repr(key)) - return self._constant - - def __iter__(self) -> Iterator[KeyType]: - return iter(self._keys) - - def __len__(self) -> int: - return len(self._keys) - - def __contains__(self, key: object) -> bool: - return key in self._keys - - def __repr__(self) -> str: - return repr(dict(self.items())) diff --git a/tests/v2/__init__.py b/tests/v2/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/v2/conftest.py b/tests/v2/conftest.py deleted file mode 100644 index 6680e4066b..0000000000 --- a/tests/v2/conftest.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest -import pathlib - - -@pytest.fixture(params=[str, pathlib.Path]) -def path_type(request): - return request.param diff --git a/tests/v2/fixture/.zgroup b/tests/v2/fixture/.zgroup deleted file mode 100644 index 3b7daf227c..0000000000 --- a/tests/v2/fixture/.zgroup +++ /dev/null @@ -1,3 +0,0 @@ -{ - "zarr_format": 2 -} \ No newline at end of file diff --git a/tests/v2/fixture/dimension_separator/flat/.zarray b/tests/v2/fixture/dimension_separator/flat/.zarray deleted file mode 100644 index f265bb0674..0000000000 --- a/tests/v2/fixture/dimension_separator/flat/.zarray +++ /dev/null @@ -1,23 +0,0 @@ -{ - "chunks": [ - 2, - 2 - ], - "compressor": { - "blocksize": 0, - "clevel": 5, - "cname": "lz4", - "id": "blosc", - "shuffle": 1 - }, - "dimension_separator": ".", - "dtype": "=^r}ϼQ>//O{/ۯ.?^k_y?k{,k~.=ޯ}_>ޫ_oy[ۿ=-^}ӯ+[{Wݗ_߾+}{>]v_SWګ_{WY}q{۽\^bd?׾?~/^~oyTw'W}z_~ڧ޾w\[}So|{__boO_oy_\߳=/wSSo{//{]y]S*/=.!~q}q/}rݻ׾/{_~{˟{?xG~_{+Sկ~'Yo'{}W_Ϳts_{[{V__l̓՗+o_|K=)Ϳ<]|婛wU_k!V_/w{<g6z͗g~u^}۷e{~x_~#WӼ{buo{W{?>ٽWy{嫿ev}˻tq_-==wW6^{{ϥ_yWlۼo~_ˏLo{/Sݼk[<뗽7G>?~ڷ}էyr~'{SW]+}^/6O쾸=ۗW[iͿ{WT_^r~?ͳ/{\OkW]^W[vkn[=}>}q~_\W3/-~Y}wWWz}޼={/=O/y_mb̻{O#׷/-Oݧ7Ok_yr}ﭾ_>q{_z+_vu.^מү>O_.ܾ,^g߼u]yW{槯={eO}7/v{V~导ݛy?e^77>7q{u.=zߺ/sKwܞW>~߼rݗ/w.7_~Vۧ_{u_zy1O嗧n^~y{{V{3o{q_{.?Kw^l{_^O|^//|ueկY~}=ݓ<}//]K/~Y}1ryWyo|ټxW_W{z^˗_o^_^k_}=kwݷ/<{կ򻗽?.owv~V}۳>^Oʫ}}yyW_=V~~_{~ۯ/ֽr{쓯~q}k_}}WW\z{O?v|c՗'\~W[]{{/_{wo#Oo[.Ͼ[^ll^Y}O_)뽗tᾧǟSqy/W^מ޽/S~}}{_O''^Woz_]yk߷y~o[^1?>{/oOzӻ_y_{˯O|{=n^|ޣ߾~ƾG~^{.}w{ʳ_yO~/^Krw|/{u}-~{G{zlS=Ov+_<˅<ǿۏÿѺ?6>C9K(n_秹r`s?y6/A:55ꫡ?娝gE9ч-3畋;/W_j9tQ[Z^PzQ|C9|x:}yt>^-wQ>kɣ˫!_j~WOm߼s/=$s#^.~ux57ܣo:{7Won>a޾=y~ЭΞ˫OyZ>z{~n#7sO|^G-O9vo.u}f9Ra7@֗ޜ{@yқCvQg^O~=7g}t˫w>hsJ>w?|ߜCM޷ϝy#}|[Ջ{O~9Na;Re?üW/O|oQ߆[ӻ'}6G>]r^}~\Jy۞t{~yBQotot_[^^ϋ___{)=^7ߣãHpȧ6|<֗Gooȯ孟^H=˃ݷMLc>7WG=ԭоtj3>=Z3k[>^^0tOVw>{ν~soM~|jEy{\;o[wnwoA'׾sߗoh$:D[<:hZzu~ᧇ|<_}yyëyxx?\v^YOw_yr ǿ(~uo^sN<ܜcs̜>suZq⷗>Cz>/ṛOo}5xo>OgES-5p9:?|s_[77߽4=VGos5Ꞔ%6frsE4s-?[7~k>o_Mg~uOp<'yr!yY@9[O}ϜGsmsǗԗ#媻wv'짆o>}#Uuy~OxsՋ&94nQ}y˧:ϽAA wyk}_~{Oǯ6yG;o~ЇŝV$6^C9ryr||Py/oɱ/\^ {3ws}躖yqu~˼-JƯ=W\s<!^_׃/ʕ[Tq(#|i3ωW.߾ݹ:yϕgSgv.<}19?ܜh>l欞<}5s/'G͏WGo>=l_Է5WCm =)9~s԰L\H9{SˇBsPPN}mo>|=I}Η)~;~o?H/_}>:}ۗwCׇͧx6V߾<rAo}z{]⣇o{-z/}KͅrԫWYOgyoz7}͑{CF{=~rwOz|f>\>ߜy~w{< W{V>^K>_ܾ}oHgOjH^>^m#nP>j}w~9#G >~HS߹y_j+|tjm/ի:+_# \ No newline at end of file diff --git a/tests/v2/fixture/test_format_compatibility/array_16/compressor_1/1 b/tests/v2/fixture/test_format_compatibility/array_16/compressor_1/1 deleted file mode 100644 index 41b2124f0d..0000000000 Binary files a/tests/v2/fixture/test_format_compatibility/array_16/compressor_1/1 and /dev/null differ diff --git a/tests/v2/fixture/test_format_compatibility/array_16/compressor_2/.zarray 
b/tests/v2/fixture/test_format_compatibility/array_16/compressor_2/.zarray deleted file mode 100644 index 59f8cc2b9c..0000000000 --- a/tests/v2/fixture/test_format_compatibility/array_16/compressor_2/.zarray +++ /dev/null @@ -1,17 +0,0 @@ -{ - "chunks": [ - 3000 - ], - "compressor": { - "id": "bz2", - "level": 1 - }, - "dtype": " [unrecoverable binary fixture data omitted] diff --git a/tests/v2/fixture/test_format_compatibility/array_7/compressor_1/1 b/tests/v2/fixture/test_format_compatibility/array_7/compressor_1/1 deleted file mode 100644 index 2755ef8b40..0000000000 Binary files a/tests/v2/fixture/test_format_compatibility/array_7/compressor_1/1 and /dev/null differ diff --git a/tests/v2/fixture/test_format_compatibility/array_7/compressor_2/.zarray b/tests/v2/fixture/test_format_compatibility/array_7/compressor_2/.zarray deleted file mode 100644 index f534454b54..0000000000 --- a/tests/v2/fixture/test_format_compatibility/array_7/compressor_2/.zarray +++ /dev/null @@ -1,17 +0,0 @@ -{ - "chunks": [ - 1200 - ], - "compressor": { - "id": "bz2", - "level": 1 - }, - "dtype": " None: - g = group(store=DirectoryStore(str(tmpdir)), path='utf8attrs') - attrs = {"foo": "た"} - g.attrs.put(attrs) - assert g.attrs.asdict() == attrs - - def test_get_set_del_contains(self): - store = _init_store() - a = self.init_attributes(store) - assert "foo" not in a - a["foo"] = "bar" - a["baz"] = 42 - assert "foo" in a - assert "baz" in a - assert "bar" == a["foo"] - assert 42 == a["baz"] - del a["foo"] - assert "foo" not in a - with pytest.raises(KeyError): - # noinspection PyStatementEffect - a["foo"] - - def test_update_put(self): - store = _init_store() - a = self.init_attributes(store) - assert "foo" not in a - assert "bar" not in a - assert "baz" not in a - - a.update(foo="spam", bar=42, baz=4.2) - assert a["foo"] == "spam" - assert a["bar"] == 42 - assert a["baz"] == 4.2 - - a.put(dict(foo="eggs", bar=84)) - assert a["foo"] == "eggs" - assert a["bar"] == 84 - assert "baz" not in a - - def test_iterators(self): - store = _init_store() - a = self.init_attributes(store) - assert 0 == len(a) - assert set() == set(a) - assert set() == set(a.keys()) - assert set() == set(a.values()) - assert set() == set(a.items()) - - a["foo"] = "bar" - a["baz"] = 42 - - assert 2 == len(a) - assert {"foo", "baz"} == set(a) - assert {"foo", "baz"} == set(a.keys()) - assert {"bar", 42} == set(a.values()) - assert {("foo", "bar"), ("baz", 42)} == set(a.items()) - - def test_read_only(self): - store = _init_store() - a = self.init_attributes(store, read_only=True) - store[".zattrs"] = json.dumps(dict(foo="bar", baz=42)).encode("ascii") - assert a["foo"] == "bar" - assert a["baz"] == 42 - with pytest.raises(PermissionError): - a["foo"] = "quux" - with pytest.raises(PermissionError): - del a["foo"] - with pytest.raises(PermissionError): - a.update(foo="quux") - - def test_key_completions(self): - store = _init_store() - a = self.init_attributes(store) - d
= a._ipython_key_completions_() - assert "foo" not in d - assert "123" not in d - assert "baz" not in d - assert "asdf;" not in d - a["foo"] = 42 - a["123"] = 4.2 - a["asdf;"] = "ghjkl;" - d = a._ipython_key_completions_() - assert "foo" in d - assert "123" in d - assert "asdf;" in d - assert "baz" not in d - - def test_caching_on(self): - # caching is turned on by default - - # setup store - store = CountingDict() - attrs_key = ".zattrs" - assert 0 == store.counter["__getitem__", attrs_key] - assert 0 == store.counter["__setitem__", attrs_key] - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - assert 0 == store.counter["__getitem__", attrs_key] - assert 1 == store.counter["__setitem__", attrs_key] - - # setup attributes - a = self.init_attributes(store) - - # test __getitem__ causes all attributes to be cached - assert a["foo"] == "xxx" - assert 1 == store.counter["__getitem__", attrs_key] - assert a["bar"] == 42 - assert 1 == store.counter["__getitem__", attrs_key] - assert a["foo"] == "xxx" - assert 1 == store.counter["__getitem__", attrs_key] - - # test __setitem__ updates the cache - a["foo"] = "yyy" - get_cnt = 2 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 2 == store.counter["__setitem__", attrs_key] - assert a["foo"] == "yyy" - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 2 == store.counter["__setitem__", attrs_key] - - # test update() updates the cache - a.update(foo="zzz", bar=84) - get_cnt = 3 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - assert a["foo"] == "zzz" - assert a["bar"] == 84 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - - # test __contains__ uses the cache - assert "foo" in a - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - assert "spam" not in a - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - - # test __delitem__ updates the cache - del a["bar"] - get_cnt = 4 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 4 == store.counter["__setitem__", attrs_key] - assert "bar" not in a - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 4 == store.counter["__setitem__", attrs_key] - - # test refresh() - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - assert get_cnt == store.counter["__getitem__", attrs_key] - a.refresh() - get_cnt = 5 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert a["foo"] == "xxx" - assert get_cnt == store.counter["__getitem__", attrs_key] - assert a["bar"] == 42 - assert get_cnt == store.counter["__getitem__", attrs_key] - - def test_caching_off(self): - # setup store - store = CountingDict() - attrs_key = ".zattrs" - assert 0 == store.counter["__getitem__", attrs_key] - assert 0 == store.counter["__setitem__", attrs_key] - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - assert 0 == store.counter["__getitem__", attrs_key] - assert 1 == store.counter["__setitem__", attrs_key] - - # setup attributes - a = self.init_attributes(store, cache=False) - - # test __getitem__ - assert a["foo"] == "xxx" - assert 1 == store.counter["__getitem__", attrs_key] - assert a["bar"] == 42 - assert 2 == store.counter["__getitem__", attrs_key] - assert a["foo"] == "xxx" - assert 3 == store.counter["__getitem__", attrs_key] - - # test 
__setitem__ - a["foo"] = "yyy" - get_cnt = 4 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 2 == store.counter["__setitem__", attrs_key] - assert a["foo"] == "yyy" - get_cnt = 5 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 2 == store.counter["__setitem__", attrs_key] - - # test update() - a.update(foo="zzz", bar=84) - get_cnt = 6 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - assert a["foo"] == "zzz" - assert a["bar"] == 84 - get_cnt = 8 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - - # test __contains__ - assert "foo" in a - get_cnt = 9 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - assert "spam" not in a - get_cnt = 10 - assert get_cnt == store.counter["__getitem__", attrs_key] - assert 3 == store.counter["__setitem__", attrs_key] - - def test_wrong_keys(self): - store = _init_store() - a = self.init_attributes(store) - - warning_msg = "only attribute keys of type 'string' will be allowed in the future" - - with pytest.warns(DeprecationWarning, match=warning_msg): - a[1] = "foo" - - with pytest.warns(DeprecationWarning, match=warning_msg): - a.put({1: "foo"}) - - with pytest.warns(DeprecationWarning, match=warning_msg): - a.update({1: "foo"}) diff --git a/tests/v2/test_convenience.py b/tests/v2/test_convenience.py deleted file mode 100644 index f558a8800f..0000000000 --- a/tests/v2/test_convenience.py +++ /dev/null @@ -1,804 +0,0 @@ -import atexit -import tempfile -import unittest -from numbers import Integral - -import numpy as np -import pytest -from numcodecs import Adler32, Zlib -from numpy.testing import assert_array_equal - -import zarr.v2 as zarr -from zarr.v2.convenience import ( - consolidate_metadata, - copy, - copy_store, - load, - open, - open_consolidated, - save, - save_group, - save_array, - copy_all, -) -from zarr.v2.core import Array -from zarr.v2.errors import CopyError -from zarr.v2.hierarchy import Group, group -from zarr.v2.storage import ( - ConsolidatedMetadataStore, - FSStore, - MemoryStore, - atexit_rmtree, - getsize, -) - - -def test_open_array(path_type): - store = tempfile.mkdtemp() - atexit.register(atexit_rmtree, store) - store = path_type(store) - - # open array, create if doesn't exist - z = open(store, mode="a", shape=100) - assert isinstance(z, Array) - assert z.shape == (100,) - - # open array, overwrite - z = open(store, mode="w", shape=200) - assert isinstance(z, Array) - assert z.shape == (200,) - - # open array, read-only - z = open(store, mode="r") - assert isinstance(z, Array) - assert z.shape == (200,) - assert z.read_only - - # path not found - with pytest.raises(ValueError): - open("doesnotexist", mode="r") - - -def test_open_group(path_type): - store = tempfile.mkdtemp() - atexit.register(atexit_rmtree, store) - store = path_type(store) - - # open group, create if doesn't exist - g = open(store, mode="a") - g.create_group("foo") - assert isinstance(g, Group) - assert "foo" in g - - # open group, overwrite - g = open(store, mode="w") - assert isinstance(g, Group) - assert "foo" not in g - - # open group, read-only - g = open(store, mode="r") - assert isinstance(g, Group) - assert g.read_only - - -def test_save_errors(): - with pytest.raises(ValueError): - # no arrays provided - save_group("data/group.zarr") - with pytest.raises(TypeError): - # no array provided - save_array("data/group.zarr") - 
with pytest.raises(ValueError): - # no arrays provided - save("data/group.zarr") - - -def test_lazy_loader(): - foo = np.arange(100) - bar = np.arange(100, 0, -1) - store = "data/group.zarr" - save(store, foo=foo, bar=bar) - loader = load(store) - assert "foo" in loader - assert "bar" in loader - assert "baz" not in loader - assert len(loader) == 2 - assert sorted(loader) == ["bar", "foo"] - assert_array_equal(foo, loader["foo"]) - assert_array_equal(bar, loader["bar"]) - assert "LazyLoader: " in repr(loader) - - -def test_load_array(): - foo = np.arange(100) - bar = np.arange(100, 0, -1) - store = "data/group.zarr" - save(store, foo=foo, bar=bar) - - # can also load arrays directly into a numpy array - for array_name in ["foo", "bar"]: - array_path = array_name - array = load(store, path=array_path) - assert isinstance(array, np.ndarray) - if array_name == "foo": - assert_array_equal(foo, array) - else: - assert_array_equal(bar, array) - - -def test_tree(): - g1 = zarr.group() - g1.create_group("foo") - g3 = g1.create_group("bar") - g3.create_group("baz") - g5 = g3.create_group("qux") - g5.create_dataset("baz", shape=100, chunks=10) - assert repr(zarr.tree(g1)) == repr(g1.tree()) - assert str(zarr.tree(g1)) == str(g1.tree()) - - -@pytest.mark.parametrize("stores_from_path", [False, True]) -@pytest.mark.parametrize( - "with_chunk_store,listable", - [(False, True), (True, True), (False, False)], - ids=["default-listable", "with_chunk_store-listable", "default-unlistable"], -) -def test_consolidate_metadata(with_chunk_store, listable, monkeypatch, stores_from_path): - # setup initial data - if stores_from_path: - store = tempfile.mkdtemp() - atexit.register(atexit_rmtree, store) - if with_chunk_store: - chunk_store = tempfile.mkdtemp() - atexit.register(atexit_rmtree, chunk_store) - else: - chunk_store = None - else: - store = MemoryStore() - chunk_store = MemoryStore() if with_chunk_store else None - path = None - z = group(store, chunk_store=chunk_store, path=path) - - # Reload the actual store implementation in case str - store_to_copy = z.store - - z.create_group("g1") - g2 = z.create_group("g2") - g2.attrs["hello"] = "world" - arr = g2.create_dataset("arr", shape=(20, 20), chunks=(5, 5), dtype="f8") - assert 16 == arr.nchunks - assert 0 == arr.nchunks_initialized - arr.attrs["data"] = 1 - arr[:] = 1.0 - assert 16 == arr.nchunks_initialized - - if stores_from_path: - # get the actual store class for use with consolidate_metadata - store_class = z._store - else: - store_class = store - - # perform consolidation - out = consolidate_metadata(store_class, path=path) - assert isinstance(out, Group) - assert ["g1", "g2"] == list(out) - if not stores_from_path: - assert isinstance(out._store, ConsolidatedMetadataStore) - assert ".zmetadata" in store - meta_keys = [ - ".zgroup", - "g1/.zgroup", - "g2/.zgroup", - "g2/.zattrs", - "g2/arr/.zarray", - "g2/arr/.zattrs", - ] - - for key in meta_keys: - del store[key] - - # https://github.com/zarr-developers/zarr-python/issues/993 - # Make sure we can still open consolidated on an unlistable store: - if not listable: - fs_memory = pytest.importorskip("fsspec.implementations.memory") - monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir", lambda x, y: False) - monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls") - fs = fs_memory.MemoryFileSystem() - store_to_open = FSStore("", fs=fs) - # copy original store to new unlistable store - store_to_open.update(store_to_copy) - - else: - store_to_open = store - - # open consolidated - z2 = 
open_consolidated(store_to_open, chunk_store=chunk_store, path=path) - assert ["g1", "g2"] == list(z2) - assert "world" == z2.g2.attrs["hello"] - assert 1 == z2.g2.arr.attrs["data"] - assert (z2.g2.arr[:] == 1.0).all() - assert 16 == z2.g2.arr.nchunks - if listable: - assert 16 == z2.g2.arr.nchunks_initialized - else: - with pytest.raises(NotImplementedError): - _ = z2.g2.arr.nchunks_initialized - - if stores_from_path: - # path string is note a BaseStore subclass so cannot be used to - # initialize a ConsolidatedMetadataStore. - - with pytest.raises(ValueError): - cmd = ConsolidatedMetadataStore(store) - else: - # tests del/write on the store - - cmd = ConsolidatedMetadataStore(store) - with pytest.raises(PermissionError): - del cmd[".zgroup"] - with pytest.raises(PermissionError): - cmd[".zgroup"] = None - - # test getsize on the store - assert isinstance(getsize(cmd), Integral) - - # test new metadata are not writeable - with pytest.raises(PermissionError): - z2.create_group("g3") - with pytest.raises(PermissionError): - z2.create_dataset("spam", shape=42, chunks=7, dtype="i4") - with pytest.raises(PermissionError): - del z2["g2"] - - # test consolidated metadata are not writeable - with pytest.raises(PermissionError): - z2.g2.attrs["hello"] = "universe" - with pytest.raises(PermissionError): - z2.g2.arr.attrs["foo"] = "bar" - - # test the data are writeable - z2.g2.arr[:] = 2 - assert (z2.g2.arr[:] == 2).all() - - # test invalid modes - with pytest.raises(ValueError): - open_consolidated(store, chunk_store=chunk_store, mode="a", path=path) - with pytest.raises(ValueError): - open_consolidated(store, chunk_store=chunk_store, mode="w", path=path) - with pytest.raises(ValueError): - open_consolidated(store, chunk_store=chunk_store, mode="w-", path=path) - - # make sure keyword arguments are passed through without error - open_consolidated( - store, - chunk_store=chunk_store, - path=path, - cache_attrs=True, - synchronizer=None, - ) - - -@pytest.mark.parametrize( - "options", - ( - {"dimension_separator": "/"}, - {"dimension_separator": "."}, - {"dimension_separator": None}, - ), -) -def test_save_array_separator(tmpdir, options): - data = np.arange(6).reshape((3, 2)) - url = tmpdir.join("test.zarr") - save_array(url, data, **options) - - -class TestCopyStore(unittest.TestCase): - _version = 2 - - def setUp(self): - source = dict() - source["foo"] = b"xxx" - source["bar/baz"] = b"yyy" - source["bar/qux"] = b"zzz" - self.source = source - - def _get_dest_store(self): - return dict() - - def test_no_paths(self): - source = self.source - dest = self._get_dest_store() - copy_store(source, dest) - assert len(source) == len(dest) - for key in source: - assert source[key] == dest[key] - - def test_source_path(self): - source = self.source - # paths should be normalized - for source_path in "bar", "bar/", "/bar", "/bar/": - dest = self._get_dest_store() - copy_store(source, dest, source_path=source_path) - assert 2 == len(dest) - for key in source: - if key.startswith("bar/"): - dest_key = key.split("bar/")[1] - assert source[key] == dest[dest_key] - else: - assert key not in dest - - def test_dest_path(self): - source = self.source - # paths should be normalized - for dest_path in "new", "new/", "/new", "/new/": - dest = self._get_dest_store() - copy_store(source, dest, dest_path=dest_path) - assert len(source) == len(dest) - for key in source: - if self._version == 3: - dest_key = key[:10] + "new/" + key[10:] - else: - dest_key = "new/" + key - assert source[key] == dest[dest_key] - - def 
test_source_dest_path(self): - source = self.source - # paths should be normalized - for source_path in "bar", "bar/", "/bar", "/bar/": - for dest_path in "new", "new/", "/new", "/new/": - dest = self._get_dest_store() - copy_store(source, dest, source_path=source_path, dest_path=dest_path) - assert 2 == len(dest) - for key in source: - if key.startswith("bar/"): - dest_key = "new/" + key.split("bar/")[1] - assert source[key] == dest[dest_key] - else: - assert key not in dest - assert ("new/" + key) not in dest - - def test_excludes_includes(self): - source = self.source - - # single excludes - dest = self._get_dest_store() - excludes = "f.*" - copy_store(source, dest, excludes=excludes) - assert len(dest) == 2 - - root = "" - assert root + "foo" not in dest - - # multiple excludes - dest = self._get_dest_store() - excludes = "b.z", ".*x" - copy_store(source, dest, excludes=excludes) - assert len(dest) == 1 - assert root + "foo" in dest - assert root + "bar/baz" not in dest - assert root + "bar/qux" not in dest - - # excludes and includes - dest = self._get_dest_store() - excludes = "b.*" - includes = ".*x" - copy_store(source, dest, excludes=excludes, includes=includes) - assert len(dest) == 2 - assert root + "foo" in dest - assert root + "bar/baz" not in dest - assert root + "bar/qux" in dest - - def test_dry_run(self): - source = self.source - dest = self._get_dest_store() - copy_store(source, dest, dry_run=True) - assert 0 == len(dest) - - def test_if_exists(self): - source = self.source - dest = self._get_dest_store() - root = "" - dest[root + "bar/baz"] = b"mmm" - - # default ('raise') - with pytest.raises(CopyError): - copy_store(source, dest) - - # explicit 'raise' - with pytest.raises(CopyError): - copy_store(source, dest, if_exists="raise") - - # skip - copy_store(source, dest, if_exists="skip") - assert 3 == len(dest) - assert dest[root + "foo"] == b"xxx" - assert dest[root + "bar/baz"] == b"mmm" - assert dest[root + "bar/qux"] == b"zzz" - - # replace - copy_store(source, dest, if_exists="replace") - assert 3 == len(dest) - assert dest[root + "foo"] == b"xxx" - assert dest[root + "bar/baz"] == b"yyy" - assert dest[root + "bar/qux"] == b"zzz" - - # invalid option - with pytest.raises(ValueError): - copy_store(source, dest, if_exists="foobar") - - -def check_copied_array(original, copied, without_attrs=False, expect_props=None): - # setup - source_h5py = original.__module__.startswith("h5py.") - dest_h5py = copied.__module__.startswith("h5py.") - zarr_to_zarr = not (source_h5py or dest_h5py) - h5py_to_h5py = source_h5py and dest_h5py - zarr_to_h5py = not source_h5py and dest_h5py - h5py_to_zarr = source_h5py and not dest_h5py - if expect_props is None: - expect_props = dict() - else: - expect_props = expect_props.copy() - - # common properties in zarr and h5py - for p in "dtype", "shape", "chunks": - expect_props.setdefault(p, getattr(original, p)) - - # zarr-specific properties - if zarr_to_zarr: - for p in "compressor", "filters", "order", "fill_value": - expect_props.setdefault(p, getattr(original, p)) - - # h5py-specific properties - if h5py_to_h5py: - for p in ( - "maxshape", - "compression", - "compression_opts", - "shuffle", - "scaleoffset", - "fletcher32", - "fillvalue", - ): - expect_props.setdefault(p, getattr(original, p)) - - # common properties with some name differences - if h5py_to_zarr: - expect_props.setdefault("fill_value", original.fillvalue) - if zarr_to_h5py: - expect_props.setdefault("fillvalue", original.fill_value) - - # compare properties - for k, v in 
expect_props.items(): - assert v == getattr(copied, k) - - # compare data - assert_array_equal(original[:], copied[:]) - - # compare attrs - if without_attrs: - for k in original.attrs.keys(): - assert k not in copied.attrs - else: - if dest_h5py and "filters" in original.attrs: - # special case in v3 (storing filters metadata under attributes) - # we explicitly do not copy this info over to HDF5 - original_attrs = original.attrs.asdict().copy() - original_attrs.pop("filters") - else: - original_attrs = original.attrs - assert sorted(original_attrs.items()) == sorted(copied.attrs.items()) - - -def check_copied_group(original, copied, without_attrs=False, expect_props=None, shallow=False): - # setup - if expect_props is None: - expect_props = dict() - else: - expect_props = expect_props.copy() - - # compare children - for k, v in original.items(): - if hasattr(v, "shape"): - assert k in copied - check_copied_array(v, copied[k], without_attrs=without_attrs, expect_props=expect_props) - elif shallow: - assert k not in copied - else: - assert k in copied - check_copied_group( - v, - copied[k], - without_attrs=without_attrs, - shallow=shallow, - expect_props=expect_props, - ) - - # compare attrs - if without_attrs: - for k in original.attrs.keys(): - assert k not in copied.attrs - else: - assert sorted(original.attrs.items()) == sorted(copied.attrs.items()) - - -def test_copy_all(): - """ - https://github.com/zarr-developers/zarr-python/issues/269 - - copy_all used to not copy attributes as `.keys()` does not return hidden `.zattrs`. - - """ - original_group = zarr.group(store=MemoryStore(), overwrite=True) - original_group.attrs["info"] = "group attrs" - original_subgroup = original_group.create_group("subgroup") - original_subgroup.attrs["info"] = "sub attrs" - - destination_group = zarr.group(store=MemoryStore(), overwrite=True) - - # copy from memory to directory store - copy_all( - original_group, - destination_group, - dry_run=False, - ) - - assert "subgroup" in destination_group - assert destination_group.attrs["info"] == "group attrs" - assert destination_group.subgroup.attrs["info"] == "sub attrs" - - -class TestCopy: - @pytest.fixture(params=[False, True], ids=["zarr", "hdf5"]) - def source(self, request, tmpdir): - def prep_source(source): - foo = source.create_group("foo") - foo.attrs["experiment"] = "weird science" - baz = foo.create_dataset("bar/baz", data=np.arange(100), chunks=(50,)) - baz.attrs["units"] = "metres" - if request.param: - extra_kws = dict( - compression="gzip", - compression_opts=3, - fillvalue=84, - shuffle=True, - fletcher32=True, - ) - else: - extra_kws = dict(compressor=Zlib(3), order="F", fill_value=42, filters=[Adler32()]) - source.create_dataset( - "spam", - data=np.arange(100, 200).reshape(20, 5), - chunks=(10, 2), - dtype="i2", - **extra_kws, - ) - return source - - if request.param: - h5py = pytest.importorskip("h5py") - fn = tmpdir.join("source.h5") - with h5py.File(str(fn), mode="w") as h5f: - yield prep_source(h5f) - else: - yield prep_source(group()) - - @pytest.fixture(params=[False, True], ids=["zarr", "hdf5"]) - def dest(self, request, tmpdir): - if request.param: - h5py = pytest.importorskip("h5py") - fn = tmpdir.join("dest.h5") - with h5py.File(str(fn), mode="w") as h5f: - yield h5f - else: - yield group() - - def test_copy_array(self, source, dest): - # copy array with default options - copy(source["foo/bar/baz"], dest) - check_copied_array(source["foo/bar/baz"], dest["baz"]) - copy(source["spam"], dest) - check_copied_array(source["spam"], 
dest["spam"]) - - def test_copy_bad_dest(self, source, dest): - # try to copy to an array, dest must be a group - dest = dest.create_dataset("eggs", shape=(100,)) - with pytest.raises(ValueError): - copy(source["foo/bar/baz"], dest) - - def test_copy_array_name(self, source, dest): - # copy array with name - copy(source["foo/bar/baz"], dest, name="qux") - assert "baz" not in dest - check_copied_array(source["foo/bar/baz"], dest["qux"]) - - def test_copy_array_create_options(self, source, dest): - dest_h5py = dest.__module__.startswith("h5py.") - - # copy array, provide creation options - compressor = Zlib(9) - create_kws = dict(chunks=(10,)) - if dest_h5py: - create_kws.update( - compression="gzip", compression_opts=9, shuffle=True, fletcher32=True, fillvalue=42 - ) - else: - create_kws.update(compressor=compressor, fill_value=42, order="F", filters=[Adler32()]) - copy(source["foo/bar/baz"], dest, without_attrs=True, **create_kws) - check_copied_array( - source["foo/bar/baz"], dest["baz"], without_attrs=True, expect_props=create_kws - ) - - def test_copy_array_exists_array(self, source, dest): - # copy array, dest array in the way - dest.create_dataset("baz", shape=(10,)) - - # raise - with pytest.raises(CopyError): - # should raise by default - copy(source["foo/bar/baz"], dest) - assert (10,) == dest["baz"].shape - with pytest.raises(CopyError): - copy(source["foo/bar/baz"], dest, if_exists="raise") - assert (10,) == dest["baz"].shape - - # skip - copy(source["foo/bar/baz"], dest, if_exists="skip") - assert (10,) == dest["baz"].shape - - # replace - copy(source["foo/bar/baz"], dest, if_exists="replace") - check_copied_array(source["foo/bar/baz"], dest["baz"]) - - # invalid option - with pytest.raises(ValueError): - copy(source["foo/bar/baz"], dest, if_exists="foobar") - - def test_copy_array_exists_group(self, source, dest): - # copy array, dest group in the way - dest.create_group("baz") - - # raise - with pytest.raises(CopyError): - copy(source["foo/bar/baz"], dest) - assert not hasattr(dest["baz"], "shape") - with pytest.raises(CopyError): - copy(source["foo/bar/baz"], dest, if_exists="raise") - assert not hasattr(dest["baz"], "shape") - - # skip - copy(source["foo/bar/baz"], dest, if_exists="skip") - assert not hasattr(dest["baz"], "shape") - - # replace - copy(source["foo/bar/baz"], dest, if_exists="replace") - check_copied_array(source["foo/bar/baz"], dest["baz"]) - - def test_copy_array_skip_initialized(self, source, dest): - dest_h5py = dest.__module__.startswith("h5py.") - - dest.create_dataset("baz", shape=(100,), chunks=(10,), dtype="i8") - assert not np.all(source["foo/bar/baz"][:] == dest["baz"][:]) - - if dest_h5py: - with pytest.raises(ValueError): - # not available with copy to h5py - copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") - - else: - # copy array, dest array exists but not yet initialized - copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") - check_copied_array(source["foo/bar/baz"], dest["baz"]) - - # copy array, dest array exists and initialized, will be skipped - dest["baz"][:] = np.arange(100, 200) - copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") - assert_array_equal(np.arange(100, 200), dest["baz"][:]) - assert not np.all(source["foo/bar/baz"][:] == dest["baz"][:]) - - def test_copy_group(self, source, dest): - # copy group, default options - copy(source["foo"], dest) - check_copied_group(source["foo"], dest["foo"]) - - def test_copy_group_no_name(self, source, dest): - with pytest.raises(TypeError): - # need a 
name if copy root - copy(source, dest) - - copy(source, dest, name="root") - check_copied_group(source, dest["root"]) - - def test_copy_group_options(self, source, dest): - # copy group, non-default options - copy(source["foo"], dest, name="qux", without_attrs=True) - assert "foo" not in dest - check_copied_group(source["foo"], dest["qux"], without_attrs=True) - - def test_copy_group_shallow(self, source, dest): - # copy group, shallow - copy(source, dest, name="eggs", shallow=True) - check_copied_group(source, dest["eggs"], shallow=True) - - def test_copy_group_exists_group(self, source, dest): - # copy group, dest groups exist - dest.create_group("foo/bar") - copy(source["foo"], dest) - check_copied_group(source["foo"], dest["foo"]) - - def test_copy_group_exists_array(self, source, dest): - # copy group, dest array in the way - dest.create_dataset("foo/bar", shape=(10,)) - - # raise - with pytest.raises(CopyError): - copy(source["foo"], dest) - assert dest["foo/bar"].shape == (10,) - with pytest.raises(CopyError): - copy(source["foo"], dest, if_exists="raise") - assert dest["foo/bar"].shape == (10,) - - # skip - copy(source["foo"], dest, if_exists="skip") - assert dest["foo/bar"].shape == (10,) - - # replace - copy(source["foo"], dest, if_exists="replace") - check_copied_group(source["foo"], dest["foo"]) - - def test_copy_group_dry_run(self, source, dest): - # dry run, empty destination - n_copied, n_skipped, n_bytes_copied = copy( - source["foo"], dest, dry_run=True, return_stats=True - ) - assert 0 == len(dest) - assert 3 == n_copied - assert 0 == n_skipped - assert 0 == n_bytes_copied - - # dry run, array exists in destination - baz = np.arange(100, 200) - dest.create_dataset("foo/bar/baz", data=baz) - assert not np.all(source["foo/bar/baz"][:] == dest["foo/bar/baz"][:]) - assert 1 == len(dest) - - # raise - with pytest.raises(CopyError): - copy(source["foo"], dest, dry_run=True) - assert 1 == len(dest) - - # skip - n_copied, n_skipped, n_bytes_copied = copy( - source["foo"], dest, dry_run=True, if_exists="skip", return_stats=True - ) - assert 1 == len(dest) - assert 2 == n_copied - assert 1 == n_skipped - assert 0 == n_bytes_copied - assert_array_equal(baz, dest["foo/bar/baz"]) - - # replace - n_copied, n_skipped, n_bytes_copied = copy( - source["foo"], dest, dry_run=True, if_exists="replace", return_stats=True - ) - assert 1 == len(dest) - assert 3 == n_copied - assert 0 == n_skipped - assert 0 == n_bytes_copied - assert_array_equal(baz, dest["foo/bar/baz"]) - - def test_logging(self, source, dest, tmpdir): - # callable log - copy(source["foo"], dest, dry_run=True, log=print) - - # file name - fn = str(tmpdir.join("log_name")) - copy(source["foo"], dest, dry_run=True, log=fn) - - # file - with tmpdir.join("log_file").open(mode="w") as f: - copy(source["foo"], dest, dry_run=True, log=f) - - # bad option - with pytest.raises(TypeError): - copy(source["foo"], dest, dry_run=True, log=True) diff --git a/tests/v2/test_core.py b/tests/v2/test_core.py deleted file mode 100644 index f053725b95..0000000000 --- a/tests/v2/test_core.py +++ /dev/null @@ -1,2510 +0,0 @@ -import atexit -import os -import sys -import pickle -import shutil -from typing import Any, Literal, Optional, Tuple, Union -import unittest -from itertools import zip_longest -from tempfile import mkdtemp -import numpy as np -import packaging.version -import pytest -from numcodecs import ( - BZ2, - JSON, - LZ4, - Blosc, - Categorize, - Delta, - FixedScaleOffset, - GZip, - MsgPack, - Pickle, - VLenArray, - VLenBytes, - VLenUTF8, 
- Zlib, -) -from numcodecs.compat import ensure_bytes, ensure_ndarray -from numcodecs.tests.common import greetings -from numpy.testing import assert_array_almost_equal, assert_array_equal - -import zarr.v2 -from zarr.v2._storage.store import ( - BaseStore, -) - -from zarr.v2.core import Array -from zarr.v2.meta import json_loads -from zarr.v2.n5 import N5Store, N5FSStore, n5_keywords -from zarr.v2.storage import ( - ABSStore, - DBMStore, - DirectoryStore, - FSStore, - KVStore, - LMDBStore, - LRUStoreCache, - NestedDirectoryStore, - SQLiteStore, - atexit_rmglob, - atexit_rmtree, - init_array, - init_group, - normalize_store_arg, -) - -from zarr.v2.util import buffer_size -from .util import abs_container, skip_test_env_var, have_fsspec, mktemp - -# noinspection PyMethodMayBeStatic - - -class TestArray: - root = "" - path = "" - compressor = Zlib(level=1) - filters = None - dimension_separator: Literal["/", ".", None] = None - cache_metadata = True - cache_attrs = True - partial_decompress: bool = False - write_empty_chunks = True - read_only = False - storage_transformers: Tuple[Any, ...] = () - - def create_store(self) -> BaseStore: - return KVStore(dict()) - - # used by child classes - def create_chunk_store(self) -> Optional[BaseStore]: - return None - - def create_storage_transformers(self, shape: Union[int, Tuple[int, ...]]) -> Tuple[Any, ...]: - return () - - def create_filters(self, dtype: Optional[str]) -> Tuple[Any, ...]: - return () - - def create_array(self, shape: Union[int, Tuple[int, ...]], **kwargs): - store = self.create_store() - chunk_store = self.create_chunk_store() - # keyword arguments for array initialization - init_array_kwargs = { - "path": kwargs.pop("path", self.path), - "compressor": kwargs.pop("compressor", self.compressor), - "chunk_store": chunk_store, - "storage_transformers": self.create_storage_transformers(shape), - "filters": kwargs.pop("filters", self.create_filters(kwargs.get("dtype", None))), - } - - # keyword arguments for array instantiation - access_array_kwargs = { - "path": init_array_kwargs["path"], - "read_only": kwargs.pop("read_only", self.read_only), - "chunk_store": chunk_store, - "cache_metadata": kwargs.pop("cache_metadata", self.cache_metadata), - "cache_attrs": kwargs.pop("cache_attrs", self.cache_attrs), - "partial_decompress": kwargs.pop("partial_decompress", self.partial_decompress), - "write_empty_chunks": kwargs.pop("write_empty_chunks", self.write_empty_chunks), - } - - init_array(store, shape, **{**init_array_kwargs, **kwargs}) - - return Array(store, **access_array_kwargs) - - def test_array_init(self): - # normal initialization - store = self.create_store() - init_array(store, shape=100, chunks=10, dtype=" end - assert [] == list(z.islice(6, 5)) - - z.store.close() - - def test_iter(self): - params = ( - ((1,), (1,)), - ((2,), (1,)), - ((1,), (2,)), - ((3,), (3,)), - ((1000,), (100,)), - ((100,), (1000,)), - ((1, 100), (1, 1)), - ((1, 0), (1, 1)), - ((0, 1), (1, 1)), - ((0, 1), (2, 1)), - ((100, 1), (3, 1)), - ((100, 100), (10, 10)), - ((10, 10, 10), (3, 3, 3)), - ) - for shape, chunks in params: - z = self.create_array(shape=shape, chunks=chunks, dtype=int) - a = np.arange(np.prod(shape)).reshape(shape) - z[:] = a - for expect, actual in zip_longest(a, z): - assert_array_equal(expect, actual) - z.store.close() - - def test_islice(self): - params = ( - ((1,), (1,), 0, 1), - ((2,), (1,), 0, 1), - ((1,), (2,), 0, 1), - ((3,), (3,), 1, 2), - ((1000,), (100,), 150, 1050), - ((100,), (1000,), 25, 75), - ((1, 100), (1, 1), 0, 1), - 
((100, 1), (3, 1), 56, 100), - ((100, 100), (10, 10), 13, 99), - ((10, 10, 10), (3, 3, 3), 2, 4), - ) - for shape, chunks, start, end in params: - z = self.create_array(shape=shape, chunks=chunks, dtype=int) - a = np.arange(np.prod(shape)).reshape(shape) - z[:] = a - end_array = min(end, a.shape[0]) - for expect, actual in zip_longest(a[start:end_array], z.islice(start, end)): - assert_array_equal(expect, actual) - if hasattr(z.store, "close"): - z.store.close() - - def test_compressors(self): - compressors = [None, BZ2(), Blosc(), LZ4(), Zlib(), GZip()] - if LZMA: - compressors.append(LZMA()) - for compressor in compressors: - a = self.create_array(shape=1000, chunks=100, compressor=compressor) - a[0:100] = 1 - assert np.all(a[0:100] == 1) - a[:] = 1 - assert np.all(a[:] == 1) - a.store.close() - - def test_endian(self): - dtype = np.dtype("float32") - a1 = self.create_array(shape=1000, chunks=100, dtype=dtype.newbyteorder("<")) - a1[:] = 1 - x1 = a1[:] - a2 = self.create_array(shape=1000, chunks=100, dtype=dtype.newbyteorder(">")) - a2[:] = 1 - x2 = a2[:] - assert_array_equal(x1, x2) - a1.store.close() - a2.store.close() - - def test_attributes(self): - a = self.create_array(shape=10, chunks=10, dtype="i8") - a.attrs["foo"] = "bar" - assert a.attrs.key in a.store - attrs = json_loads(a.store[a.attrs.key]) - assert "foo" in attrs and attrs["foo"] == "bar" - - a.attrs["bar"] = "foo" - assert a.attrs.key in a.store - attrs = json_loads(a.store[a.attrs.key]) - assert "foo" in attrs and attrs["foo"] == "bar" - assert "bar" in attrs and attrs["bar"] == "foo" - a.store.close() - - def test_structured_with_object(self): - a = self.create_array( - fill_value=(0.0, None), - shape=10, - chunks=10, - dtype=[("x", float), ("y", object)], - object_codec=Pickle(), - ) - assert tuple(a[0]) == (0.0, None) - - -class TestArrayWithPath(TestArray): - path = "foo/bar" - compressor = Blosc() - - def test_nchunks_initialized(self): - pass - - def expected(self): - return [ - "f710da18d45d38d4aaf2afd7fb822fdd73d02957", - "1437428e69754b1e1a38bd7fc9e43669577620db", - "6c530b6b9d73e108cc5ee7b6be3d552cc994bdbe", - "4c0a76fb1222498e09dcd92f7f9221d6cea8b40e", - "05b0663ffe1785f38d3a459dec17e57a18f254af", - ] - - def test_nbytes_stored(self): - # MemoryStore as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum( - buffer_size(v) for k, v in z.store.items() if k.startswith("foo/bar/") - ) - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum( - buffer_size(v) for k, v in z.store.items() if k.startswith("foo/bar/") - ) - assert expect_nbytes_stored == z.nbytes_stored - - # mess with store - z.store[z._key_prefix + "foo"] = list(range(10)) - assert -1 == z.nbytes_stored - - -class TestArrayWithChunkStore(TestArray): - compressor = Blosc() - - def create_chunk_store(self): - return KVStore(dict()) - - def expected(self): - return [ - "f710da18d45d38d4aaf2afd7fb822fdd73d02957", - "1437428e69754b1e1a38bd7fc9e43669577620db", - "6c530b6b9d73e108cc5ee7b6be3d552cc994bdbe", - "4c0a76fb1222498e09dcd92f7f9221d6cea8b40e", - "05b0663ffe1785f38d3a459dec17e57a18f254af", - ] - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - expect_nbytes_stored += sum(buffer_size(v) for v in z.chunk_store.values()) - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - expect_nbytes_stored += 
sum(buffer_size(v) for v in z.chunk_store.values()) - assert expect_nbytes_stored == z.nbytes_stored - - # mess with store - z.chunk_store[z._key_prefix + "foo"] = list(range(10)) - assert -1 == z.nbytes_stored - - -class TestArrayWithDirectoryStore(TestArray): - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - return store - - def test_nbytes_stored(self): - # dict as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - assert expect_nbytes_stored == z.nbytes_stored - - -def test_array_init_from_dict(): - # initialization via non-Store MutableMapping - store = dict() - init_array(store, shape=100, chunks=10, dtype=" Tuple[Any, ...]: - return ( - Delta(dtype=dtype), - FixedScaleOffset(dtype=dtype, scale=1, offset=0), - ) - - def expected(self): - return [ - "b80367c5599d47110d42bd8886240c2f46620dba", - "95a7b2471225e73199c9716d21e8d3dd6e5f6f2a", - "7300f1eb130cff5891630038fd99c28ef23d3a01", - "c649ad229bc5720258b934ea958570c2f354c2eb", - "62fc9236d78af18a5ec26c12eea1d33bce52501e", - ] - - def test_astype_no_filters(self): - shape = (100,) - dtype = np.dtype(np.int8) - astype = np.dtype(np.float32) - - store = KVStore(dict()) - init_array(store, shape=shape, chunks=10, dtype=dtype) - - data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) - - z1 = Array(store) - z1[...] = data - z2 = z1.astype(astype) - - expected = data.astype(astype) - assert_array_equal(expected, z2) - assert z2.read_only - - def test_astype(self): - shape = (100,) - chunks = (10,) - - dtype = np.dtype(np.int8) - astype = np.dtype(np.float32) - - data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) - - z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype) - z1[...] 
= data - z2 = z1.astype(astype) - - expected = data.astype(astype) - assert_array_equal(expected, z2) - - def test_array_dtype_shape(self): - # skip this one, cannot do delta on unstructured array - pass - - def test_structured_array(self): - # skip this one, cannot do delta on structured array - pass - - def test_structured_array_subshapes(self): - # skip this one, cannot do delta on structured array - pass - - def test_structured_array_nested(self): - # skip this one, cannot do delta on structured array - pass - - def test_dtypes(self): - # skip this one, delta messes up floats - pass - - def test_object_arrays(self): - # skip this one, cannot use delta with objects - pass - - def test_object_arrays_vlen_text(self): - # skip this one, cannot use delta with objects - pass - - def test_object_arrays_vlen_bytes(self): - # skip this one, cannot use delta with objects - pass - - def test_object_arrays_vlen_array(self): - # skip this one, cannot use delta with objects - pass - - def test_object_arrays_danger(self): - # skip this one, cannot use delta with objects - pass - - def test_structured_array_contain_object(self): - # skip this one, cannot use delta on structured array - pass - - -# custom store, does not support getsize() -class CustomMapping: - def __init__(self): - self.inner = KVStore(dict()) - - def __iter__(self): - return iter(self.keys()) - - def keys(self): - return self.inner.keys() - - def values(self): - return self.inner.values() - - def get(self, item, default=None): - try: - return self.inner[item] - except KeyError: - return default - - def __getitem__(self, item): - return self.inner[item] - - def __setitem__(self, item, value): - self.inner[item] = ensure_bytes(value) - - def __delitem__(self, key): - del self.inner[key] - - def __contains__(self, item): - return item in self.inner - - def close(self): - return self.inner.close() - - -class TestArrayWithCustomMapping(TestArray): - def create_store(self): - return CustomMapping() - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - assert 245 == z.nbytes_stored - z[:] = 42 - assert 515 == z.nbytes_stored - - -class TestArrayNoCache(TestArray): - def test_cache_metadata(self): - a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_metadata=False) - path = None - a2 = Array(a1.store, path=path, cache_metadata=True) - assert a1.shape == a2.shape - assert a1.size == a2.size - assert a1.nbytes == a2.nbytes - assert a1.nchunks == a2.nchunks - - # a1 is not caching so *will* see updates made via other objects - a2.resize(200) - assert (200,) == a2.shape - assert 200 == a2.size - assert 200 == a2.nbytes - assert 20 == a2.nchunks - assert a1.shape == a2.shape - assert a1.size == a2.size - assert a1.nbytes == a2.nbytes - assert a1.nchunks == a2.nchunks - - a2.append(np.zeros(100)) - assert (300,) == a2.shape - assert 300 == a2.size - assert 300 == a2.nbytes - assert 30 == a2.nchunks - assert a1.shape == a2.shape - assert a1.size == a2.size - assert a1.nbytes == a2.nbytes - assert a1.nchunks == a2.nchunks - - # a2 is caching so *will not* see updates made via other objects - a1.resize(400) - assert (400,) == a1.shape - assert 400 == a1.size - assert 400 == a1.nbytes - assert 40 == a1.nchunks - assert (300,) == a2.shape - assert 300 == a2.size - assert 300 == a2.nbytes - assert 30 == a2.nchunks - - def test_cache_attrs(self): - a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_attrs=False) - path = None - a2 = Array(a1.store, path=path, cache_attrs=True) - assert 
a1.attrs.asdict() == a2.attrs.asdict() - - # a1 is not caching so *will* see updates made via other objects - a2.attrs["foo"] = "xxx" - a2.attrs["bar"] = 42 - assert a1.attrs.asdict() == a2.attrs.asdict() - - # a2 is caching so *will not* see updates made via other objects - a1.attrs["foo"] = "yyy" - assert "yyy" == a1.attrs["foo"] - assert "xxx" == a2.attrs["foo"] - - def test_object_arrays_danger(self): - # skip this one as it only works if metadata are cached - pass - - -class TestArrayWithStoreCache(TestArray): - def create_store(self): - return LRUStoreCache(dict(), max_size=None) - - def test_store_has_bytes_values(self): - # skip as the cache has no control over how the store provides values - pass - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStore(TestArray): - compressor = Blosc() - dimension_separator: Literal[".", "/"] = "." - - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = self.dimension_separator - store = FSStore( - path, - key_separator=key_separator, - auto_mkdir=True, - check=True, - create=True, - missing_exceptions=None, - ) - return store - - def expected(self): - return [ - "ab753fc81df0878589535ca9bad2816ba88d91bc", - "c16261446f9436b1e9f962e57ce3e8f6074abe8a", - "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", - "6e52f95ac15b164a8e96843a230fcee0e610729b", - "091fa99bc60706095c9ce30b56ce2503e0223f56", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreFromFilesystem(TestArray): - compressor = Blosc() - dimension_separator = "." - - def create_store(self): - from fsspec.implementations.local import LocalFileSystem - - fs = LocalFileSystem(auto_mkdir=True) - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = self.dimension_separator - store = FSStore( - path, - fs=fs, - key_separator=key_separator, - check=True, - create=True, - missing_exceptions=None, - ) - return store - - def expected(self): - return [ - "ab753fc81df0878589535ca9bad2816ba88d91bc", - "c16261446f9436b1e9f962e57ce3e8f6074abe8a", - "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", - "6e52f95ac15b164a8e96843a230fcee0e610729b", - "091fa99bc60706095c9ce30b56ce2503e0223f56", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStorePartialRead(TestArray): - compressor = Blosc(blocksize=256) - partial_decompress = True - - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = FSStore(path) - return store - - def expected(self): - return [ - "dd7577d645c38767cf6f6d1ef8fd64002883a014", - "aa0de9892cf1ed3cda529efbf3233720b84489b7", - "e6191c44cf958576c29c41cef0f55b028a4dbdff", - "88adeeabb819feecccadf50152293dbb42f9107e", - "1426e084427f9920e29c9ec81b663d1005849455", - ] - - def test_non_cont(self): - z = self.create_array(shape=(500, 500, 500), chunks=(50, 50, 50), dtype="""" - - data = np.arange(25).reshape((5, 5)) - ds = zarr.v2.create( - shape=data.shape, - chunks=(5, 5), - dtype=data.dtype, - compressor=(None), - store=FSStore(url=str(tmpdir), mode="a"), - order="F", - ) - - ds[:] = data - - ds_reopened = zarr.v2.open_array(store=FSStore(url=str(tmpdir), mode="r")) - - written_data = ds_reopened[:] - assert_array_equal(data, written_data) diff --git a/tests/v2/test_creation.py b/tests/v2/test_creation.py deleted file mode 100644 index 08073a8ac3..0000000000 --- a/tests/v2/test_creation.py +++ /dev/null @@ -1,691 +0,0 @@ -import atexit -import os.path -import shutil 
-import warnings - -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -from zarr.v2.codecs import Zlib -from zarr.v2.core import Array -from zarr.v2.creation import ( - array, - create, - empty, - empty_like, - full, - full_like, - ones, - ones_like, - open_array, - open_like, - zeros, - zeros_like, -) -from zarr.v2.hierarchy import open_group -from zarr.v2.n5 import N5Store -from zarr.v2.storage import DirectoryStore, KVStore -from zarr.v2.sync import ThreadSynchronizer -from .util import mktemp, have_fsspec - - -_VERSIONS = (None, 2) -_VERSIONS2 = (2,) - - -# something bcolz-like -class MockBcolzArray: - def __init__(self, data, chunklen): - self.data = data - self.chunklen = chunklen - - def __getattr__(self, item): - return getattr(self.data, item) - - def __getitem__(self, item): - return self.data[item] - - -# something h5py-like -class MockH5pyDataset: - def __init__(self, data, chunks): - self.data = data - self.chunks = chunks - - def __getattr__(self, item): - return getattr(self.data, item) - - def __getitem__(self, item): - return self.data[item] - - -def _init_creation_kwargs(at_root=True): - kwargs = {} - if not at_root: - kwargs["path"] = "array" - return kwargs - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_array(at_root): - kwargs = _init_creation_kwargs(at_root) - - # with numpy array - a = np.arange(100) - z = array(a, chunks=10, **kwargs) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert_array_equal(a, z[:]) - - # with array-like - a = list(range(100)) - z = array(a, chunks=10, **kwargs) - assert (100,) == z.shape - assert np.asarray(a).dtype == z.dtype - assert_array_equal(np.asarray(a), z[:]) - - # with another zarr array - z2 = array(z, **kwargs) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert_array_equal(z[:], z2[:]) - - # with chunky array-likes - - b = np.arange(1000).reshape(100, 10) - c = MockBcolzArray(b, 10) - z3 = array(c, **kwargs) - assert c.shape == z3.shape - assert (10, 10) == z3.chunks - - b = np.arange(1000).reshape(100, 10) - c = MockH5pyDataset(b, chunks=(10, 2)) - z4 = array(c, **kwargs) - assert c.shape == z4.shape - assert (10, 2) == z4.chunks - - c = MockH5pyDataset(b, chunks=None) - z5 = array(c, **kwargs) - assert c.shape == z5.shape - assert isinstance(z5.chunks, tuple) - - # with dtype=None - a = np.arange(100, dtype="i4") - z = array(a, dtype=None, **kwargs) - assert_array_equal(a[:], z[:]) - assert a.dtype == z.dtype - - # with dtype=something else - a = np.arange(100, dtype="i4") - z = array(a, dtype="i8", **kwargs) - assert_array_equal(a[:], z[:]) - assert np.dtype("i8") == z.dtype - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_empty(at_root): - kwargs = _init_creation_kwargs(at_root) - z = empty(100, chunks=10, **kwargs) - assert (100,) == z.shape - assert (10,) == z.chunks - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_zeros(at_root): - kwargs = _init_creation_kwargs(at_root) - z = zeros(100, chunks=10, **kwargs) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.zeros(100), z[:]) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_ones(at_root): - kwargs = _init_creation_kwargs(at_root) - z = ones(100, chunks=10, **kwargs) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.ones(100), z[:]) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_full(at_root): - kwargs = _init_creation_kwargs(at_root) - z = 
full(100, chunks=10, fill_value=42, dtype="i4", **kwargs) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42, dtype="i4"), z[:]) - - # nan - z = full(100, chunks=10, fill_value=np.nan, dtype="f8", **kwargs) - assert np.all(np.isnan(z[:])) - - -def test_full_additional_dtypes(): - """Test additional types that aren't part of the base v3 spec.""" - kwargs = _init_creation_kwargs() - # NaT - z = full(100, chunks=10, fill_value="NaT", dtype="M8[s]", **kwargs) - assert np.all(np.isnat(z[:])) - z = full(100, chunks=10, fill_value="NaT", dtype="m8[s]", **kwargs) - assert np.all(np.isnat(z[:])) - - # byte string dtype - v = b"xxx" - z = full(100, chunks=10, fill_value=v, dtype="S3", **kwargs) - assert v == z[0] - a = z[...] - assert z.dtype == a.dtype - assert v == a[0] - assert np.all(a == v) - - # unicode string dtype - v = "xxx" - z = full(100, chunks=10, fill_value=v, dtype="U3", **kwargs) - assert v == z[0] - a = z[...] - assert z.dtype == a.dtype - assert v == a[0] - assert np.all(a == v) - - # bytes fill value / unicode dtype - v = b"xxx" - with pytest.raises(ValueError): - full(100, chunks=10, fill_value=v, dtype="U3") - - -@pytest.mark.parametrize("dimension_separator", [".", "/", None]) -@pytest.mark.parametrize("at_root", [False, True]) -def test_open_array(at_root, dimension_separator): - store = "data/array.zarr" - kwargs = _init_creation_kwargs(at_root) - - # mode == 'w' - z = open_array( - store, mode="w", shape=100, chunks=10, dimension_separator=dimension_separator, **kwargs - ) - z[:] = 42 - assert isinstance(z, Array) - - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - - if dimension_separator is None: - assert z._dimension_separator == "." 
- else: - assert z._dimension_separator == dimension_separator - - # mode in 'r', 'r+' - group_kwargs = kwargs.copy() - open_group("data/group.zarr", mode="w", **group_kwargs) - for mode in "r", "r+": - with pytest.raises(ValueError): - open_array("doesnotexist", mode=mode) - with pytest.raises(ValueError): - open_array("data/group.zarr", mode=mode) - z = open_array(store, mode="r", **kwargs) - assert isinstance(z, Array) - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - with pytest.raises(PermissionError): - z[:] = 43 - z = open_array(store, mode="r+", **kwargs) - assert isinstance(z, Array) - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - z[:] = 43 - assert_array_equal(np.full(100, fill_value=43), z[:]) - - # mode == 'a' - shutil.rmtree(store) - z = open_array(store, mode="a", shape=100, chunks=10, **kwargs) - z[:] = 42 - assert isinstance(z, Array) - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - - with pytest.raises(ValueError): - open_array("data/group.zarr", mode="a", **kwargs) - - # mode in 'w-', 'x' - for mode in "w-", "x": - shutil.rmtree(store) - z = open_array(store, mode=mode, shape=100, chunks=10, **kwargs) - z[:] = 42 - assert isinstance(z, Array) - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - with pytest.raises(ValueError): - open_array(store, mode=mode, **kwargs) - - with pytest.raises(ValueError): - open_array("data/group.zarr", mode=mode, **kwargs) - - # with synchronizer - z = open_array(store, synchronizer=ThreadSynchronizer(), **kwargs) - assert isinstance(z, Array) - - # with path - kwargs_no_path = kwargs.copy() - kwargs_no_path.pop("path", None) - z = open_array(store, shape=100, path="foo/bar", mode="w", **kwargs_no_path) - assert isinstance(z, Array) - assert "foo/bar" == z.path - - # with chunk store - meta_store = "data/meta.zarr" - chunk_store = "data/chunks.zarr" - z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode="w", **kwargs) - z[:] = 42 - assert os.path.abspath(meta_store) == z.store.path - assert os.path.abspath(chunk_store) == z.chunk_store.path - - -def test_open_array_none(): - # open with store = None - z = open_array(mode="w", shape=100, chunks=10) - assert isinstance(z, Array) - - -@pytest.mark.parametrize("dimension_separator", [".", "/", None]) -def test_open_array_infer_separator_from_store(dimension_separator): - StoreClass = DirectoryStore - path = None - store = StoreClass("data/array.zarr", dimension_separator=dimension_separator) - - # Note: no dimension_separator kwarg to open_array - # we are testing here that it gets inferred from store - z = open_array(store, path=path, mode="w", shape=100, chunks=10) - z[:] = 42 - assert isinstance(z, Array) - assert isinstance(z.store, DirectoryStore) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - - if dimension_separator is None: - assert z._dimension_separator == "." 
- else: - assert z._dimension_separator == dimension_separator - - -def test_open_array_n5(): - store = "data/array.zarr" - kwargs = _init_creation_kwargs() - - # for N5 store - store = "data/array.n5" - z = open_array(store, mode="w", shape=100, chunks=10, **kwargs) - z[:] = 42 - assert isinstance(z, Array) - assert isinstance(z.store, N5Store) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - - store = "data/group.n5" - group_kwargs = kwargs.copy() - z = open_group(store, mode="w", **group_kwargs) - i = z.create_group("inner") - a = i.zeros("array", shape=100, chunks=10) - a[:] = 42 - - # Edit inner/attributes.json to not include "n5" - with open("data/group.n5/inner/attributes.json", "w") as o: - o.write("{}") - - # Re-open - a = open_group(store, **group_kwargs)["inner"]["array"] - assert isinstance(a, Array) - assert isinstance(z.store, N5Store) - assert (100,) == a.shape - assert (10,) == a.chunks - assert_array_equal(np.full(100, fill_value=42), a[:]) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_open_array_dict_store(at_root): - # dict will become a KVStore - store = dict() - kwargs = _init_creation_kwargs(at_root) - expected_store_type = KVStore - - # mode == 'w' - z = open_array(store, mode="w", shape=100, chunks=10, **kwargs) - z[:] = 42 - assert isinstance(z, Array) - assert isinstance(z.store, expected_store_type) - assert (100,) == z.shape - assert (10,) == z.chunks - assert_array_equal(np.full(100, fill_value=42), z[:]) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_create_in_dict(at_root): - kwargs = _init_creation_kwargs(at_root) - expected_store_type = KVStore - - for func in [empty, zeros, ones]: - a = func(100, store=dict(), **kwargs) - assert isinstance(a.store, expected_store_type) - - a = full(100, 5, store=dict(), **kwargs) - assert isinstance(a.store, expected_store_type) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.parametrize("at_root", [False, True]) -def test_create_writeable_mode(at_root, tmp_path): - # Regression test for https://github.com/zarr-developers/zarr-python/issues/1306 - import fsspec - - kwargs = _init_creation_kwargs(at_root) - store = fsspec.get_mapper(str(tmp_path)) - z = create(100, store=store, **kwargs) - assert z.store.map == store - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_empty_like(at_root): - kwargs = _init_creation_kwargs(at_root) - - # zarr array - z = empty(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) - z2 = empty_like(z, path=kwargs.get("path")) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z.order == z2.order - - # numpy array - a = np.empty(100, dtype="f4") - z3 = empty_like(a, **kwargs) - assert a.shape == z3.shape - assert (100,) == z3.chunks - assert a.dtype == z3.dtype - assert z3.fill_value is None - - # something slightly silly - a = [0] * 100 - z3 = empty_like(a, shape=200, **kwargs) - assert (200,) == z3.shape - - # other array-likes - b = np.arange(1000).reshape(100, 10) - c = MockBcolzArray(b, 10) - z = empty_like(c, **kwargs) - assert b.shape == z.shape - assert (10, 10) == z.chunks - c = MockH5pyDataset(b, chunks=(10, 2)) - z = empty_like(c, **kwargs) - assert b.shape == z.shape - assert (10, 2) == z.chunks - c = MockH5pyDataset(b, chunks=None) - z = empty_like(c, **kwargs) - assert 
b.shape == z.shape - assert isinstance(z.chunks, tuple) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_zeros_like(at_root): - kwargs = _init_creation_kwargs(at_root) - - # zarr array - z = zeros(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) - z2 = zeros_like(z, path=kwargs.get("path")) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z.order == z2.order - - # numpy array - a = np.empty(100, dtype="f4") - z3 = zeros_like(a, chunks=10, **kwargs) - assert a.shape == z3.shape - assert (10,) == z3.chunks - assert a.dtype == z3.dtype - assert 0 == z3.fill_value - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_ones_like(at_root): - kwargs = _init_creation_kwargs(at_root) - - # zarr array - z = ones(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) - z2 = ones_like(z, path=kwargs.get("path")) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z.order == z2.order - - # numpy array - a = np.empty(100, dtype="f4") - z3 = ones_like(a, chunks=10, **kwargs) - assert a.shape == z3.shape - assert (10,) == z3.chunks - assert a.dtype == z3.dtype - assert 1 == z3.fill_value - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_full_like(at_root): - kwargs = _init_creation_kwargs(at_root) - - z = full(100, chunks=10, dtype="f4", compressor=Zlib(5), fill_value=42, order="F", **kwargs) - z2 = full_like(z, path=kwargs.get("path")) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z.order == z2.order - - # numpy array - a = np.empty(100, dtype="f4") - z3 = full_like(a, chunks=10, fill_value=42, **kwargs) - assert a.shape == z3.shape - assert (10,) == z3.chunks - assert a.dtype == z3.dtype - assert 42 == z3.fill_value - - with pytest.raises(TypeError): - # fill_value missing - full_like(a, chunks=10, **kwargs) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_open_like(at_root): - kwargs = _init_creation_kwargs(at_root) - - # zarr array - path = mktemp() - atexit.register(shutil.rmtree, path) - z = full(100, chunks=10, dtype="f4", compressor=Zlib(5), fill_value=42, order="F", **kwargs) - z2 = open_like(z, path) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z.order == z2.order - - # numpy array - path = mktemp() - atexit.register(shutil.rmtree, path) - a = np.empty(100, dtype="f4") - z3 = open_like(a, path, chunks=10) - assert a.shape == z3.shape - assert (10,) == z3.chunks - assert a.dtype == z3.dtype - assert 0 == z3.fill_value - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_create(at_root): - kwargs = _init_creation_kwargs(at_root) - - # defaults - z = create(100, **kwargs) - assert isinstance(z, Array) - assert (100,) == z.shape - assert (100,) == z.chunks # auto-chunks - assert np.dtype(None) == z.dtype - assert "blosc" == z.compressor.codec_id - assert 0 == z.fill_value - - # all specified - z = create(100, chunks=10, dtype="i4", compressor=Zlib(1), fill_value=42, order="F", **kwargs) - 
assert isinstance(z, Array) - assert (100,) == z.shape - assert (10,) == z.chunks - assert np.dtype("i4") == z.dtype - assert "zlib" == z.compressor.codec_id - assert 1 == z.compressor.level - assert 42 == z.fill_value - assert "F" == z.order - - # with synchronizer - synchronizer = ThreadSynchronizer() - z = create(100, chunks=10, synchronizer=synchronizer, **kwargs) - assert isinstance(z, Array) - assert (100,) == z.shape - assert (10,) == z.chunks - assert synchronizer is z.synchronizer - - # don't allow string as compressor arg - with pytest.raises(ValueError): - create(100, chunks=10, compressor="zlib", **kwargs) - - # h5py compatibility - - z = create(100, compression="zlib", compression_opts=9, **kwargs) - assert "zlib" == z.compressor.codec_id - assert 9 == z.compressor.level - - z = create(100, compression="default", **kwargs) - assert "blosc" == z.compressor.codec_id - - # errors - with pytest.raises(ValueError): - # bad compression argument - create(100, compression=1, **kwargs) - with pytest.raises(ValueError): - # bad fill value - create(100, dtype="i4", fill_value="foo", **kwargs) - - # auto chunks - z = create(1000000000, chunks=True, **kwargs) - assert z.chunks[0] < z.shape[0] - z = create(1000000000, chunks=None, **kwargs) # backwards-compatibility - assert z.chunks[0] < z.shape[0] - # no chunks - z = create(1000000000, chunks=False, **kwargs) - assert z.chunks == z.shape - - -def test_compression_args(): - kwargs = _init_creation_kwargs() - - with warnings.catch_warnings(): - warnings.simplefilter("default") - z = create(100, compression="zlib", compression_opts=9, **kwargs) - assert isinstance(z, Array) - assert "zlib" == z.compressor.codec_id - assert 9 == z.compressor.level - - # 'compressor' overrides 'compression' - with pytest.warns(UserWarning): - z = create(100, compressor=Zlib(9), compression="bz2", compression_opts=1, **kwargs) - assert isinstance(z, Array) - assert "zlib" == z.compressor.codec_id - assert 9 == z.compressor.level - - # 'compressor' ignores 'compression_opts' - with pytest.warns(UserWarning): - z = create(100, compressor=Zlib(9), compression_opts=1, **kwargs) - assert isinstance(z, Array) - assert "zlib" == z.compressor.codec_id - assert 9 == z.compressor.level - - with pytest.warns(UserWarning): - # 'compressor' overrides 'compression' - create(100, compressor=Zlib(9), compression="bz2", compression_opts=1, **kwargs) - with pytest.warns(UserWarning): - # 'compressor' ignores 'compression_opts' - create(100, compressor=Zlib(9), compression_opts=1, **kwargs) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_create_read_only(at_root): - # https://github.com/alimanfoo/zarr/issues/151 - - kwargs = _init_creation_kwargs(at_root) - - # create an array initially read-only, then enable writing - z = create(100, read_only=True, **kwargs) - assert z.read_only - with pytest.raises(PermissionError): - z[:] = 42 - z.read_only = False - z[:] = 42 - assert np.all(z[...] == 42) - z.read_only = True - with pytest.raises(PermissionError): - z[:] = 0 - - # this is subtly different, but here we want to create an array with data, and then - # have it be read-only - a = np.arange(100) - z = array(a, read_only=True, **kwargs) - assert_array_equal(a, z[...]) - assert z.read_only - with pytest.raises(PermissionError): - z[:] = 42 - - -def test_json_dumps_chunks_numpy_dtype(): - z = zeros((10,), chunks=(np.int64(2),)) - assert np.all(z[...] == 0)
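The test above and the one below rely on the same normalization: zarr v2 coerces shape and chunks given as floats or numpy integers to plain Python ints, which keeps the array metadata JSON-serializable. A minimal sketch, assuming the zarr.v2 namespace used throughout these tests:

import numpy as np
import zarr.v2

# numpy integer chunks are normalized to plain ints before metadata is written
z = zarr.v2.zeros((10,), chunks=(np.int64(2),))
assert z.chunks == (2,)
assert all(isinstance(c, int) for c in z.chunks)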
-@pytest.mark.parametrize( - ("init_shape", "init_chunks", "shape", "chunks"), - ( - ((1,), (1,), (1,), (1,)), - ((1.0,), (1.0,), (1,), (1,)), - ((1.0,), False, (1,), (1,)), - ((1.0,), True, (1,), (1,)), - ((1.0,), None, (1,), (1,)), - ), -) -def test_shape_chunk_ints(init_shape, init_chunks, shape, chunks): - g = open_group() - array = g.create_dataset("ds", shape=init_shape, chunks=init_chunks, dtype=np.uint8) - - assert all( - isinstance(s, int) for s in array.shape - ), f"Expected shape to be all ints but found {array.shape=}." - assert all( - isinstance(c, int) for c in array.chunks - ), f"Expected chunks to be all ints but found {array.chunks=}." - assert array.shape == shape, f"Expected {shape=} but found {array.shape=}." - assert array.chunks == chunks, f"Expected {chunks=} but found {array.chunks=}." diff --git a/tests/v2/test_dim_separator.py b/tests/v2/test_dim_separator.py deleted file mode 100644 index 2cff527f89..0000000000 --- a/tests/v2/test_dim_separator.py +++ /dev/null @@ -1,136 +0,0 @@ -import pathlib - -import pytest -from numpy.testing import assert_array_equal -from functools import partial - -import zarr.v2 -from zarr.v2.core import Array -from zarr.v2.storage import DirectoryStore, NestedDirectoryStore, FSStore -from .util import have_fsspec - - -needs_fsspec = pytest.mark.skipif(not have_fsspec, reason="needs fsspec") - - -@pytest.fixture( - params=( - "static_flat", - "static_flat_legacy", - "static_nested", - "static_nested_legacy", - "directory_nested", - "directory_flat", - "directory_default", - "nesteddirectory_nested", - "nesteddirectory_default", - pytest.param("fs_nested", marks=needs_fsspec), - pytest.param("fs_flat", marks=needs_fsspec), - pytest.param("fs_default", marks=needs_fsspec), - ) -) -def dataset(tmpdir, request) -> None: - """ - Generate a variety of different Zarrs using - different store implementations as well as - different dimension_separator arguments. - """ - - loc = tmpdir.join("dim_sep_test.zarr") - which = request.param - kwargs = {} - - if which.startswith("static"): - test_root = pathlib.Path(__file__).parent - suffix = which[len("static_") :] - static = test_root / "fixture" / "dimension_separator" / suffix - - # this commented block will generate the static fixtures.
- # if not static.exists(): # pragma: no cover - # if "nested" in which: - # # No way to reproduce the nested_legacy file via code - # generator = NestedDirectoryStore - # else: - # if "legacy" in suffix: - # # No dimension_separator metadata included - # generator = DirectoryStore - # else: - # # Explicit dimension_separator metadata included - # generator = partial(DirectoryStore, dimension_separator=".") - - # # store the data - should be one-time operation - # s = generator(str(static)) - # a = zarr.v2.open(store=s, mode="w", shape=(2, 2), dtype=" len(array_keys) - assert sorted(array_keys_recurse) == ["baz", "zab"] - - # test recursive arrays - arrays = list(g1["foo"].arrays(recurse=False)) - arrays_recurse = list(g1["foo"].arrays(recurse=True)) - assert len(arrays_recurse) > len(arrays) - assert "zab" == arrays_recurse[0][0] - assert g1["foo"]["bar"]["zab"] == arrays_recurse[0][1] - - g1.store.close() - - def test_getattr(self): - # setup - g1 = self.create_group() - g2 = g1.create_group("foo") - g2.create_dataset("bar", shape=100) - - # test - assert g1["foo"] == g1.foo - assert g2["bar"] == g2.bar - # test that hasattr returns False instead of an exception (issue #88) - assert not hasattr(g1, "unexistingattribute") - - g1.store.close() - - def test_setitem(self): - g = self.create_group() - try: - data = np.arange(100) - g["foo"] = data - assert_array_equal(data, g["foo"]) - data = np.arange(200) - g["foo"] = data - assert_array_equal(data, g["foo"]) - # 0d array - g["foo"] = 42 - assert () == g["foo"].shape - assert 42 == g["foo"][()] - except NotImplementedError: - pass - g.store.close() - - def test_delitem(self): - g = self.create_group() - g.create_group("foo") - g.create_dataset("bar/baz", shape=100, chunks=10) - assert "foo" in g - assert "bar" in g - assert "bar/baz" in g - try: - del g["bar"] - with pytest.raises(KeyError): - del g["xxx"] - except NotImplementedError: - pass - else: - assert "foo" in g - assert "bar" not in g - assert "bar/baz" not in g - g.store.close() - - def test_move(self): - g = self.create_group() - - data = np.arange(100) - g["boo"] = data - - data = np.arange(100) - g["foo"] = data - - g.move("foo", "bar") - assert "foo" not in g - assert "bar" in g - assert_array_equal(data, g["bar"]) - - g.move("bar", "foo/bar") - assert "bar" not in g - assert "foo" in g - assert "foo/bar" in g - assert isinstance(g["foo"], Group) - assert_array_equal(data, g["foo/bar"]) - - g.move("foo", "foo2") - assert "foo" not in g - assert "foo/bar" not in g - assert "foo2" in g - assert "foo2/bar" in g - assert isinstance(g["foo2"], Group) - assert_array_equal(data, g["foo2/bar"]) - - g2 = g["foo2"] - g2.move("bar", "/bar") - assert "foo2" in g - assert "foo2/bar" not in g - assert "bar" in g - assert isinstance(g["foo2"], Group) - - assert_array_equal(data, g["bar"]) - - with pytest.raises(ValueError): - g2.move("bar", "bar2") - - with pytest.raises(ValueError): - g.move("bar", "boo") - - g.store.close() - - def test_array_creation(self): - grp = self.create_group() - - a = grp.create("a", shape=100, chunks=10) - assert isinstance(a, Array) - b = grp.empty("b", shape=100, chunks=10) - assert isinstance(b, Array) - assert b.fill_value is None - c = grp.zeros("c", shape=100, chunks=10) - assert isinstance(c, Array) - assert 0 == c.fill_value - d = grp.ones("d", shape=100, chunks=10) - assert isinstance(d, Array) - assert 1 == d.fill_value - e = grp.full("e", shape=100, chunks=10, fill_value=42) - assert isinstance(e, Array) - assert 42 == e.fill_value - - f = 
grp.empty_like("f", a) - assert isinstance(f, Array) - assert f.fill_value is None - g = grp.zeros_like("g", a) - assert isinstance(g, Array) - assert 0 == g.fill_value - h = grp.ones_like("h", a) - assert isinstance(h, Array) - assert 1 == h.fill_value - i = grp.full_like("i", e) - assert isinstance(i, Array) - assert 42 == i.fill_value - - j = grp.array("j", data=np.arange(100), chunks=10) - assert isinstance(j, Array) - assert_array_equal(np.arange(100), j[:]) - - grp.store.close() - - grp = self.create_group(read_only=True) - with pytest.raises(PermissionError): - grp.create("aa", shape=100, chunks=10) - with pytest.raises(PermissionError): - grp.empty("aa", shape=100, chunks=10) - with pytest.raises(PermissionError): - grp.zeros("aa", shape=100, chunks=10) - with pytest.raises(PermissionError): - grp.ones("aa", shape=100, chunks=10) - with pytest.raises(PermissionError): - grp.full("aa", shape=100, chunks=10, fill_value=42) - with pytest.raises(PermissionError): - grp.array("aa", data=np.arange(100), chunks=10) - with pytest.raises(PermissionError): - grp.create("aa", shape=100, chunks=10) - with pytest.raises(PermissionError): - grp.empty_like("aa", a) - with pytest.raises(PermissionError): - grp.zeros_like("aa", a) - with pytest.raises(PermissionError): - grp.ones_like("aa", a) - with pytest.raises(PermissionError): - grp.full_like("aa", a) - - grp.store.close() - - def test_paths(self): - g1 = self.create_group() - g2 = g1.create_group("foo/bar") - - assert g1 == g1["/"] - assert g1 == g1["//"] - assert g1 == g1["///"] - assert g1 == g2["/"] - assert g1 == g2["//"] - assert g1 == g2["///"] - assert g2 == g1["foo/bar"] - assert g2 == g1["/foo/bar"] - assert g2 == g1["foo/bar/"] - assert g2 == g1["//foo/bar"] - assert g2 == g1["//foo//bar//"] - assert g2 == g1["///foo///bar///"] - assert g2 == g2["/foo/bar"] - - with pytest.raises(ValueError): - g1["."] - with pytest.raises(ValueError): - g1[".."] - with pytest.raises(ValueError): - g1["foo/."] - with pytest.raises(ValueError): - g1["foo/.."] - with pytest.raises(ValueError): - g1["foo/./bar"] - with pytest.raises(ValueError): - g1["foo/../bar"] - - g1.store.close() - - def test_pickle(self): - # setup group - g = self.create_group() - d = g.create_dataset("foo/bar", shape=100, chunks=10) - d[:] = np.arange(100) - path = g.path - name = g.name - n = len(g) - keys = list(g) - - # round-trip through pickle - dump = pickle.dumps(g) - # some stores cannot be opened twice at the same time, need to close - # store before can round-trip through pickle - g.store.close() - g2 = pickle.loads(dump) - - # verify - assert path == g2.path - assert name == g2.name - assert n == len(g2) - assert keys == list(g2) - assert isinstance(g2["foo"], Group) - assert isinstance(g2["foo/bar"], Array) - - g2.store.close() - - def test_context_manager(self): - with self.create_group() as g: - d = g.create_dataset("foo/bar", shape=100, chunks=10) - d[:] = np.arange(100) - - -@pytest.mark.parametrize("chunk_dict", [False, True]) -def test_group_init_from_dict(chunk_dict): - if chunk_dict: - store, chunk_store = dict(), dict() - else: - store, chunk_store = dict(), None - init_group(store, path=None, chunk_store=chunk_store) - g = Group(store, path=None, read_only=False, chunk_store=chunk_store) - assert store is not g.store - assert isinstance(g.store, KVStore) - if chunk_store is None: - assert g.store is g.chunk_store - else: - assert chunk_store is not g.chunk_store - - -class TestGroupWithMemoryStore(TestGroup): - @staticmethod - def create_store(): - return 
MemoryStore(), None - - -class TestGroupWithDirectoryStore(TestGroup): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = DirectoryStore(path) - return store, None - - -@skip_test_env_var("ZARR_TEST_ABS") -class TestGroupWithABSStore(TestGroup): - @staticmethod - def create_store(): - container_client = abs_container() - store = ABSStore(client=container_client) - store.rmdir() - return store, None - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - -class TestGroupWithNestedDirectoryStore(TestGroup): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = NestedDirectoryStore(path) - return store, None - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestGroupWithFSStore(TestGroup): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = FSStore(path) - return store, None - - def test_round_trip_nd(self): - data = np.arange(1000).reshape(10, 10, 10) - name = "raw" - - store, _ = self.create_store() - f = open_group(store, mode="w") - f.create_dataset(name, data=data, chunks=(5, 5, 5), compressor=None) - assert name in f - h = open_group(store, mode="r") - np.testing.assert_array_equal(h[name][:], data) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestGroupWithNestedFSStore(TestGroupWithFSStore): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = FSStore(path, key_separator="/", auto_mkdir=True) - return store, None - - def test_inconsistent_dimension_separator(self): - data = np.arange(1000).reshape(10, 10, 10) - name = "raw" - - store, _ = self.create_store() - f = open_group(store, mode="w") - - # cannot specify dimension_separator that conflicts with the store - with pytest.raises(ValueError): - f.create_dataset( - name, data=data, chunks=(5, 5, 5), compressor=None, dimension_separator="." - ) - - -class TestGroupWithZipStore(TestGroup): - @staticmethod - def create_store(): - path = mktemp(suffix=".zip") - atexit.register(os.remove, path) - store = ZipStore(path) - return store, None - - def test_context_manager(self): - with self.create_group() as g: - store = g.store - d = g.create_dataset("foo/bar", shape=100, chunks=10) - d[:] = np.arange(100) - - # Check that exiting the context manager closes the store, - # and therefore the underlying ZipFile. - with pytest.raises(ValueError): - store.zf.extractall() - - def test_move(self): - # zip store is not erasable (we can so far only append to a zip), - # so we can't test move. - pass
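The skip above reflects a ZipStore limitation: entries can only be appended to the underlying ZipFile, never deleted or rewritten, so store-level renames are impossible. A minimal sketch of that behavior, assuming the zarr.v2 namespace and the append-only ZipStore semantics these tests rely on:

import zarr.v2
from zarr.v2.storage import ZipStore

store = ZipStore("example.zip", mode="w")
g = zarr.v2.group(store=store)
g.zeros("foo", shape=100, chunks=10)

# zip archives are append-only, so key deletion is unsupported...
try:
    del store["foo/.zarray"]
except NotImplementedError:
    pass  # ...which is why Group.move cannot be exercised against this store

store.close()  # flushes and closes the underlying ZipFile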
-class TestGroupWithDBMStore(TestGroup): - @staticmethod - def create_store(): - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - store = DBMStore(path, flag="n") - return store, None - - -class TestGroupWithDBMStoreBerkeleyDB(TestGroup): - @staticmethod - def create_store(): - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStore(path, flag="n", open=bsddb3.btopen) - return store, None - - -class TestGroupWithLMDBStore(TestGroup): - @staticmethod - def create_store(): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStore(path) - return store, None - - -class TestGroupWithSQLiteStore(TestGroup): - def create_store(self): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStore(path) - return store, None - - -class TestGroupWithChunkStore(TestGroup): - @staticmethod - def create_store(): - return KVStore(dict()), KVStore(dict()) - - def test_chunk_store(self): - # setup - store, chunk_store = self.create_store() - g = self.create_group(store, chunk_store=chunk_store) - - # check attributes - assert store is g.store - assert chunk_store is g.chunk_store - - # create array - a = g.zeros("foo", shape=100, chunks=10) - assert store is a.store - assert chunk_store is a.chunk_store - a[:] = np.arange(100) - assert_array_equal(np.arange(100), a[:]) - - # check store keys - expect = sorted([group_meta_key, "foo/" + array_meta_key]) - actual = sorted(store.keys()) - assert expect == actual - expect = ["foo/" + str(i) for i in range(10)] - actual = sorted(chunk_store.keys()) - assert expect == actual - - -class TestGroupWithStoreCache(TestGroup): - @staticmethod - def create_store(): - store = LRUStoreCache(dict(), max_size=None) - return store, None - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -def test_group_writeable_mode(tmp_path): - # Regression test for https://github.com/zarr-developers/zarr-python/issues/1353 - import fsspec - - store = fsspec.get_mapper(str(tmp_path)) - zg = group(store=store) - assert zg.store.map == store - - -def test_open_group(): - # test the open_group() convenience function - - store = "data/group.zarr" - - expected_store_type = DirectoryStore - - # mode == 'w' - path = None - g = open_group(store, path=path, mode="w") - assert isinstance(g, Group) - assert isinstance(g.store, expected_store_type) - assert 0 == len(g) - g.create_groups("foo", "bar") - assert 2 == len(g) - - # mode in 'r', 'r+' - open_array("data/array.zarr", shape=100, chunks=10, mode="w") - for mode in "r", "r+": - with pytest.raises(ValueError): - open_group("doesnotexist", mode=mode) - with pytest.raises(ValueError): - open_group("data/array.zarr", mode=mode) - g = open_group(store, mode="r") - assert isinstance(g, Group) - assert 2 == len(g) - with pytest.raises(PermissionError): - g.create_group("baz") - g = open_group(store, mode="r+") - assert isinstance(g, Group) - assert 2 == len(g) - g.create_groups("baz", "quux") - assert 4 == len(g) - - # mode == 'a' - shutil.rmtree(store) - g = open_group(store, path=path, mode="a") - assert isinstance(g, Group) - assert isinstance(g.store, expected_store_type) - assert 0 == len(g) - g.create_groups("foo", "bar") - assert 2 == len(g) - - with pytest.raises(ValueError): - open_group("data/array.zarr", mode="a") - - # mode in 'w-', 'x' - for mode in "w-", "x": - shutil.rmtree(store) - g =
open_group(store, path=path, mode=mode) - assert isinstance(g, Group) - assert isinstance(g.store, expected_store_type) - assert 0 == len(g) - g.create_groups("foo", "bar") - assert 2 == len(g) - with pytest.raises(ValueError): - open_group(store, path=path, mode=mode) - with pytest.raises(ValueError): - open_group("data/array.zarr", mode=mode) - - # open with path - g = open_group(store, path="foo/bar") - assert isinstance(g, Group) - assert "foo/bar" == g.path - - -def test_group_completions(): - path = None - g = group(path=path) - d = dir(g) - assert "foo" not in d - assert "bar" not in d - assert "baz" not in d - assert "qux" not in d - assert "xxx" not in d - assert "yyy" not in d - assert "zzz" not in d - assert "123" not in d - assert "456" not in d - g.create_groups("foo", "bar", "baz/qux", "123") - g.zeros("xxx", shape=100) - g.zeros("yyy", shape=100) - g.zeros("zzz", shape=100) - g.zeros("456", shape=100) - d = dir(g) - assert "foo" in d - assert "bar" in d - assert "baz" in d - assert "qux" not in d - assert "xxx" in d - assert "yyy" in d - assert "zzz" in d - assert "123" not in d # not valid identifier - assert "456" not in d # not valid identifier - - -def test_group_key_completions(): - path = None - g = group(path=path) - d = dir(g) - # noinspection PyProtectedMember - k = g._ipython_key_completions_() - - # none of these names should be an attribute - assert "foo" not in d - assert "bar" not in d - assert "baz" not in d - assert "qux" not in d - assert "xxx" not in d - assert "yyy" not in d - assert "zzz" not in d - assert "123" not in d - assert "456" not in d - assert "asdf;" not in d - - # none of these names should be an item - assert "foo" not in k - assert "bar" not in k - assert "baz" not in k - assert "qux" not in k - assert "xxx" not in k - assert "yyy" not in k - assert "zzz" not in k - assert "123" not in k - assert "456" not in k - assert "asdf;" not in k - - g.create_groups("foo", "bar", "baz/qux", "123") - g.zeros("xxx", shape=100) - g.zeros("yyy", shape=100) - g.zeros("zzz", shape=100) - g.zeros("456", shape=100) - g.zeros("asdf;", shape=100) - - d = dir(g) - # noinspection PyProtectedMember - k = g._ipython_key_completions_() - - assert "foo" in d - assert "bar" in d - assert "baz" in d - assert "qux" not in d - assert "xxx" in d - assert "yyy" in d - assert "zzz" in d - assert "123" not in d # not valid identifier - assert "456" not in d # not valid identifier - assert "asdf;" not in d # not valid identifier - - assert "foo" in k - assert "bar" in k - assert "baz" in k - assert "qux" not in k - assert "xxx" in k - assert "yyy" in k - assert "zzz" in k - assert "123" in k - assert "456" in k - assert "asdf;" in k - - -def _check_tree(g, expect_bytes, expect_text): - assert expect_bytes == bytes(g.tree()) - assert expect_text == str(g.tree()) - expect_repr = expect_text - assert expect_repr == repr(g.tree()) - if ipytree: - # noinspection PyProtectedMember - widget = g.tree()._repr_mimebundle_() - isinstance(widget, ipytree.Tree) - - -@pytest.mark.parametrize("at_root", [False, True]) -def test_tree(at_root): - # setup - path = None if at_root else "group1" - g1 = group(path=path) - g2 = g1.create_group("foo") - g3 = g1.create_group("bar") - g3.create_group("baz") - g5 = g3.create_group("quux") - g5.create_dataset("baz", shape=100, chunks=10) - - tree_path = "/" if at_root else path - # test root group - - expect_bytes = textwrap.dedent( - f"""\ - {tree_path} - +-- bar - | +-- baz - | +-- quux - | +-- baz (100,) float64 - +-- foo""" - ).encode() - 
expect_text = textwrap.dedent( - f"""\ - {tree_path} - ├── bar - │ ├── baz - │ └── quux - │ └── baz (100,) float64 - └── foo""" - ) - _check_tree(g1, expect_bytes, expect_text) - - # test different group - expect_bytes = textwrap.dedent( - """\ - foo""" - ).encode() - expect_text = textwrap.dedent( - """\ - foo""" - ) - _check_tree(g2, expect_bytes, expect_text) - - # test different group - expect_bytes = textwrap.dedent( - """\ - bar - +-- baz - +-- quux - +-- baz (100,) float64""" - ).encode() - expect_text = textwrap.dedent( - """\ - bar - ├── baz - └── quux - └── baz (100,) float64""" - ) - _check_tree(g3, expect_bytes, expect_text) - - -def test_open_group_from_paths(): - """Verify zarr_version is applied to both the store and chunk_store.""" - store = tempfile.mkdtemp() - chunk_store = tempfile.mkdtemp() - atexit.register(atexit_rmtree, store) - atexit.register(atexit_rmtree, chunk_store) - path = "g1" - _ = open_group(store, path=path, chunk_store=chunk_store) diff --git a/tests/v2/test_indexing.py b/tests/v2/test_indexing.py deleted file mode 100644 index c1fd87572d..0000000000 --- a/tests/v2/test_indexing.py +++ /dev/null @@ -1,1758 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -import zarr.v2 -from zarr.v2.indexing import ( - make_slice_selection, - normalize_integer_selection, - oindex, - oindex_set, - replace_ellipsis, - PartialChunkIterator, -) - -from .util import CountingDict - - -def test_normalize_integer_selection(): - assert 1 == normalize_integer_selection(1, 100) - assert 99 == normalize_integer_selection(-1, 100) - with pytest.raises(IndexError): - normalize_integer_selection(100, 100) - with pytest.raises(IndexError): - normalize_integer_selection(1000, 100) - with pytest.raises(IndexError): - normalize_integer_selection(-1000, 100) - - -def test_replace_ellipsis(): - # 1D, single item - assert (0,) == replace_ellipsis(0, (100,)) - - # 1D - assert (slice(None),) == replace_ellipsis(Ellipsis, (100,)) - assert (slice(None),) == replace_ellipsis(slice(None), (100,)) - assert (slice(None, 100),) == replace_ellipsis(slice(None, 100), (100,)) - assert (slice(0, None),) == replace_ellipsis(slice(0, None), (100,)) - assert (slice(None),) == replace_ellipsis((slice(None), Ellipsis), (100,)) - assert (slice(None),) == replace_ellipsis((Ellipsis, slice(None)), (100,)) - - # 2D, single item - assert (0, 0) == replace_ellipsis((0, 0), (100, 100)) - assert (-1, 1) == replace_ellipsis((-1, 1), (100, 100)) - - # 2D, single col/row - assert (0, slice(None)) == replace_ellipsis((0, slice(None)), (100, 100)) - assert (0, slice(None)) == replace_ellipsis((0,), (100, 100)) - assert (slice(None), 0) == replace_ellipsis((slice(None), 0), (100, 100)) - - # 2D slice - assert (slice(None), slice(None)) == replace_ellipsis(Ellipsis, (100, 100)) - assert (slice(None), slice(None)) == replace_ellipsis(slice(None), (100, 100)) - assert (slice(None), slice(None)) == replace_ellipsis((slice(None), slice(None)), (100, 100)) - assert (slice(None), slice(None)) == replace_ellipsis((Ellipsis, slice(None)), (100, 100)) - assert (slice(None), slice(None)) == replace_ellipsis((slice(None), Ellipsis), (100, 100)) - assert (slice(None), slice(None)) == replace_ellipsis( - (slice(None), Ellipsis, slice(None)), (100, 100) - ) - assert (slice(None), slice(None)) == replace_ellipsis( - (Ellipsis, slice(None), slice(None)), (100, 100) - ) - assert (slice(None), slice(None)) == replace_ellipsis( - (slice(None), slice(None), Ellipsis), (100, 100) - ) - - -def 
test_get_basic_selection_0d(): - # setup - a = np.array(42) - z = zarr.v2.create(shape=a.shape, dtype=a.dtype, fill_value=None) - z[...] = a - - assert_array_equal(a, z.get_basic_selection(Ellipsis)) - assert_array_equal(a, z[...]) - assert 42 == z.get_basic_selection(()) - assert 42 == z[()] - - # test out param - b = np.zeros_like(a) - z.get_basic_selection(Ellipsis, out=b) - assert_array_equal(a, b) - - # test structured array - value = (b"aaa", 1, 4.2) - a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - z = zarr.v2.create(shape=a.shape, dtype=a.dtype, fill_value=None) - z[()] = value - assert_array_equal(a, z.get_basic_selection(Ellipsis)) - assert_array_equal(a, z[...]) - assert a[()] == z.get_basic_selection(()) - assert a[()] == z[()] - assert b"aaa" == z.get_basic_selection((), fields="foo") - assert b"aaa" == z["foo"] - assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) - assert a[["foo", "bar"]] == z["foo", "bar"] - # test out param - b = np.zeros_like(a) - z.get_basic_selection(Ellipsis, out=b) - assert_array_equal(a, b) - c = np.zeros_like(a[["foo", "bar"]]) - z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) - assert_array_equal(a[["foo", "bar"]], c) - - -basic_selections_1d = [ - # single value - 42, - -1, - # slices - slice(0, 1050), - slice(50, 150), - slice(0, 2000), - slice(-150, -50), - slice(-2000, 2000), - slice(0, 0), # empty result - slice(-1, 0), # empty result - # total selections - slice(None), - Ellipsis, - (), - (Ellipsis, slice(None)), - # slice with step - slice(None), - slice(None, None), - slice(None, None, 1), - slice(None, None, 10), - slice(None, None, 100), - slice(None, None, 1000), - slice(None, None, 10000), - slice(0, 1050), - slice(0, 1050, 1), - slice(0, 1050, 10), - slice(0, 1050, 100), - slice(0, 1050, 1000), - slice(0, 1050, 10000), - slice(1, 31, 3), - slice(1, 31, 30), - slice(1, 31, 300), - slice(81, 121, 3), - slice(81, 121, 30), - slice(81, 121, 300), - slice(50, 150), - slice(50, 150, 1), - slice(50, 150, 10), -] - - -basic_selections_1d_bad = [ - # only positive step supported - slice(None, None, -1), - slice(None, None, -10), - slice(None, None, -100), - slice(None, None, -1000), - slice(None, None, -10000), - slice(1050, -1, -1), - slice(1050, -1, -10), - slice(1050, -1, -100), - slice(1050, -1, -1000), - slice(1050, -1, -10000), - slice(1050, 0, -1), - slice(1050, 0, -10), - slice(1050, 0, -100), - slice(1050, 0, -1000), - slice(1050, 0, -10000), - slice(150, 50, -1), - slice(150, 50, -10), - slice(31, 1, -3), - slice(121, 81, -3), - slice(-1, 0, -1), - # bad stuff - 2.3, - "foo", - b"xxx", - None, - (0, 0), - (slice(None), slice(None)), -] - - -def _test_get_basic_selection(a, z, selection): - expect = a[selection] - actual = z.get_basic_selection(selection) - assert_array_equal(expect, actual) - actual = z[selection] - assert_array_equal(expect, actual) - - -# noinspection PyStatementEffect -def test_get_basic_selection_1d(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - for selection in basic_selections_1d: - _test_get_basic_selection(a, z, selection) - - for selection in basic_selections_1d_bad: - with pytest.raises(IndexError): - z.get_basic_selection(selection) - with pytest.raises(IndexError): - z[selection] - - with pytest.raises(IndexError): - z.get_basic_selection([1, 0]) - - -basic_selections_2d = [ - # single row - 42, - -1, - (42, slice(None)), - (-1, slice(None)), - # single col - 
(slice(None), 4), - (slice(None), -1), - # row slices - slice(None), - slice(0, 1000), - slice(250, 350), - slice(0, 2000), - slice(-350, -250), - slice(0, 0), # empty result - slice(-1, 0), # empty result - slice(-2000, 0), - slice(-2000, 2000), - # 2D slices - (slice(None), slice(1, 5)), - (slice(250, 350), slice(None)), - (slice(250, 350), slice(1, 5)), - (slice(250, 350), slice(-5, -1)), - (slice(250, 350), slice(-50, 50)), - (slice(250, 350, 10), slice(1, 5)), - (slice(250, 350), slice(1, 5, 2)), - (slice(250, 350, 33), slice(1, 5, 3)), - # total selections - (slice(None), slice(None)), - Ellipsis, - (), - (Ellipsis, slice(None)), - (Ellipsis, slice(None), slice(None)), -] - - -basic_selections_2d_bad = [ - # bad stuff - 2.3, - "foo", - b"xxx", - None, - (2.3, slice(None)), - # only positive step supported - slice(None, None, -1), - (slice(None, None, -1), slice(None)), - (0, 0, 0), - (slice(None), slice(None), slice(None)), -] - - -# noinspection PyStatementEffect -def test_get_basic_selection_2d(): - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - for selection in basic_selections_2d: - _test_get_basic_selection(a, z, selection) - - bad_selections = basic_selections_2d_bad + [ - # integer arrays - [0, 1], - (slice(None), [0, 1]), - ] - for selection in bad_selections: - with pytest.raises(IndexError): - z.get_basic_selection(selection) - # check fallback on fancy indexing - fancy_selection = ([0, 1], [0, 1]) - np.testing.assert_array_equal(z[fancy_selection], [0, 11]) - - -def test_fancy_indexing_fallback_on_get_setitem(): - z = zarr.v2.zeros((20, 20)) - z[[1, 2, 3], [1, 2, 3]] = 1 - np.testing.assert_array_equal( - z[:4, :4], - [ - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1], - ], - ) - np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) - # test broadcasting - np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) - # test 1D fancy indexing - z2 = zarr.v2.zeros(5) - z2[[1, 2, 3]] = 1 - np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) - - -@pytest.mark.parametrize( - "index,expected_result", - [ - # Single iterable of integers - ([0, 1], [[0, 1, 2], [3, 4, 5]]), - # List first, then slice - (([0, 1], slice(None)), [[0, 1, 2], [3, 4, 5]]), - # List first, then slice - (([0, 1], slice(1, None)), [[1, 2], [4, 5]]), - # Slice first, then list - ((slice(0, 2), [0, 2]), [[0, 2], [3, 5]]), - # Slices only - ((slice(0, 2), slice(0, 2)), [[0, 1], [3, 4]]), - # List with repeated index - (([1, 0, 1], slice(1, None)), [[4, 5], [1, 2], [4, 5]]), - # 1D indexing - (([1, 0, 1]), [[3, 4, 5], [0, 1, 2], [3, 4, 5]]), - ], -) -def test_orthogonal_indexing_fallback_on_getitem_2d(index, expected_result): - """ - Tests the orthogonal indexing fallback on __getitem__ for a 2D matrix. - - In addition to checking expected behavior, all indexing - is also checked against numpy. - """ - # [0, 1, 2], - # [3, 4, 5], - # [6, 7, 8] - a = np.arange(9).reshape(3, 3) - z = zarr.v2.array(a) - - np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") - np.testing.assert_array_equal(z[index], expected_result)
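Taken together, the cases above pin down zarr v2's fallback rules: two integer lists behave like numpy's pointwise fancy indexing, while a list mixed with a slice falls back to orthogonal (outer) indexing per axis. A minimal sketch contrasting the two, assuming the zarr.v2 namespace used in these tests:

import numpy as np
import zarr.v2

a = np.arange(9).reshape(3, 3)
z = zarr.v2.array(a)

# two integer lists: pointwise coordinates, matching numpy fancy indexing
assert np.array_equal(z[[1, 2], [1, 2]], a[[1, 2], [1, 2]])

# list mixed with a slice: orthogonal selection along each axis
assert np.array_equal(z[0:2, [0, 2]], a[0:2, [0, 2]])

# oindex always selects the outer product, even for two lists
assert np.array_equal(z.oindex[[0, 2], [0, 2]], a[np.ix_([0, 2], [0, 2])])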
- """ - # [0, 1, 2], - # [3, 4, 5], - # [6, 7, 8] - a = np.arange(9).reshape(3, 3) - z = zarr.v2.array(a) - - np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") - np.testing.assert_array_equal(z[index], expected_result) - - -@pytest.mark.parametrize( - "index,expected_result", - [ - # Single iterable of integers - ([0, 1], [[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]), - # One slice, two integers - ((slice(0, 2), 1, 1), [4, 13]), - # One integer, two slices - ((slice(0, 2), 1, slice(0, 2)), [[3, 4], [12, 13]]), - # Two slices and a list - ((slice(0, 2), [1, 2], slice(0, 2)), [[[3, 4], [6, 7]], [[12, 13], [15, 16]]]), - ], -) -def test_orthogonal_indexing_fallback_on_getitem_3d(index, expected_result): - """ - Tests the orthogonal indexing fallback on __getitem__ for a 3D matrix. - - In addition to checking expected behavior, all indexing - is also checked against numpy. - """ - # [[[ 0, 1, 2], - # [ 3, 4, 5], - # [ 6, 7, 8]], - - # [[ 9, 10, 11], - # [12, 13, 14], - # [15, 16, 17]], - - # [[18, 19, 20], - # [21, 22, 23], - # [24, 25, 26]]] - a = np.arange(27).reshape(3, 3, 3) - z = zarr.v2.array(a) - - np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") - np.testing.assert_array_equal(z[index], expected_result) - - -@pytest.mark.parametrize( - "index,expected_result", - [ - # Single iterable of integers - ([0, 1], [[1, 1, 1], [1, 1, 1], [0, 0, 0]]), - # List and slice combined - (([0, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), - # Index repetition is ignored on setitem - (([0, 1, 1, 1, 1, 1, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), - # Slice with step - (([0, 2], slice(None, None, 2)), [[1, 0, 1], [0, 0, 0], [1, 0, 1]]), - ], -) -def test_orthogonal_indexing_fallback_on_setitem_2d(index, expected_result): - """ - Tests the orthogonal indexing fallback on __setitem__ for a 3D matrix. - - In addition to checking expected behavior, all indexing - is also checked against numpy. - """ - # Slice + fancy index - a = np.zeros((3, 3)) - z = zarr.v2.array(a) - z[index] = 1 - a[index] = 1 - np.testing.assert_array_equal(z, expected_result) - np.testing.assert_array_equal(z, a, err_msg="Indexing disagrees with numpy") - - -def test_fancy_indexing_doesnt_mix_with_implicit_slicing(): - z2 = zarr.v2.zeros((5, 5, 5)) - with pytest.raises(IndexError): - z2[[1, 2, 3], [1, 2, 3]] = 2 - with pytest.raises(IndexError): - np.testing.assert_array_equal(z2[[1, 2, 3], [1, 2, 3]], 0) - with pytest.raises(IndexError): - z2[..., [1, 2, 3]] = 2 - with pytest.raises(IndexError): - np.testing.assert_array_equal(z2[..., [1, 2, 3]], 0) - - -def test_set_basic_selection_0d(): - # setup - v = np.array(42) - a = np.zeros_like(v) - z = zarr.v2.zeros_like(v) - assert_array_equal(a, z) - - # tests - z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z) - z[...] = 0 - assert_array_equal(a, z) - z[...] = v - assert_array_equal(v, z) - - # test structured array - value = (b"aaa", 1, 4.2) - v = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - a = np.zeros_like(v) - z = zarr.v2.create(shape=a.shape, dtype=a.dtype, fill_value=None) - - # tests - z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z) - z.set_basic_selection(Ellipsis, a) - assert_array_equal(a, z) - z[...] = v - assert_array_equal(v, z) - z[...] 
= a - assert_array_equal(a, z) - # with fields - z.set_basic_selection(Ellipsis, v["foo"], fields="foo") - assert v["foo"] == z["foo"] - assert a["bar"] == z["bar"] - assert a["baz"] == z["baz"] - z["bar"] = v["bar"] - assert v["foo"] == z["foo"] - assert v["bar"] == z["bar"] - assert a["baz"] == z["baz"] - # multiple field assignment not supported - with pytest.raises(IndexError): - z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) - with pytest.raises(IndexError): - z[..., "foo", "bar"] = v[["foo", "bar"]] - - -def _test_get_orthogonal_selection(a, z, selection): - expect = oindex(a, selection) - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - -# noinspection PyStatementEffect -def test_get_orthogonal_selection_1d_bool(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_get_orthogonal_selection(a, z, ix) - - # test errors - with pytest.raises(IndexError): - z.oindex[np.zeros(50, dtype=bool)] # too short - with pytest.raises(IndexError): - z.oindex[np.zeros(2000, dtype=bool)] # too long - with pytest.raises(IndexError): - z.oindex[[[True, False], [False, True]]] # too many dimensions - - -# noinspection PyStatementEffect -def test_get_orthogonal_selection_1d_int(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - # unordered - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_get_orthogonal_selection(a, z, ix) - # increasing - ix.sort() - _test_get_orthogonal_selection(a, z, ix) - # decreasing - ix = ix[::-1] - _test_get_orthogonal_selection(a, z, ix) - - selections = basic_selections_1d + [ - # test wraparound - [0, 3, 10, -23, -12, -1], - # explicit test not sorted - [3, 105, 23, 127], - ] - for selection in selections: - _test_get_orthogonal_selection(a, z, selection) - - bad_selections = basic_selections_1d_bad + [ - [a.shape[0] + 1], # out of bounds - [-(a.shape[0] + 1)], # out of bounds - [[2, 4], [6, 8]], # too many dimensions - ] - for selection in bad_selections: - with pytest.raises(IndexError): - z.get_orthogonal_selection(selection) - with pytest.raises(IndexError): - z.oindex[selection] - - -def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice - (ix0, slice(1, 5)), - (ix0, slice(1, 5, 2)), - (slice(250, 350), ix1), - (slice(250, 350, 10), ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - ] - for selection in selections: - _test_get_orthogonal_selection(a, z, selection) - - -# noinspection PyStatementEffect -def test_get_orthogonal_selection_2d(): - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - # boolean arrays - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - _test_get_orthogonal_selection_2d(a, z, ix0, ix1) - - # mixed int array / 
bool array - selections = ( - (ix0, np.nonzero(ix1)[0]), - (np.nonzero(ix0)[0], ix1), - ) - for selection in selections: - _test_get_orthogonal_selection(a, z, selection) - - # integer arrays - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) - _test_get_orthogonal_selection_2d(a, z, ix0, ix1) - ix0.sort() - ix1.sort() - _test_get_orthogonal_selection_2d(a, z, ix0, ix1) - ix0 = ix0[::-1] - ix1 = ix1[::-1] - _test_get_orthogonal_selection_2d(a, z, ix0, ix1) - - for selection in basic_selections_2d: - _test_get_orthogonal_selection(a, z, selection) - - for selection in basic_selections_2d_bad: - with pytest.raises(IndexError): - z.get_orthogonal_selection(selection) - with pytest.raises(IndexError): - z.oindex[selection] - - -def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): - selections = [ - # single value - (84, 42, 4), - (-1, -1, -1), - # index all axes with array - (ix0, ix1, ix2), - # mixed indexing with single array / slices - (ix0, slice(15, 25), slice(1, 5)), - (slice(50, 70), ix1, slice(1, 5)), - (slice(50, 70), slice(15, 25), ix2), - (ix0, slice(15, 25, 5), slice(1, 5, 2)), - (slice(50, 70, 3), ix1, slice(1, 5, 2)), - (slice(50, 70, 3), slice(15, 25, 5), ix2), - # mixed indexing with single array / ints - (ix0, 42, 4), - (84, ix1, 4), - (84, 42, ix2), - # mixed indexing with single array / slice / int - (ix0, slice(15, 25), 4), - (42, ix1, slice(1, 5)), - (slice(50, 70), 42, ix2), - # mixed indexing with two array / slice - (ix0, ix1, slice(1, 5)), - (slice(50, 70), ix1, ix2), - (ix0, slice(15, 25), ix2), - # mixed indexing with two array / integer - (ix0, ix1, 4), - (42, ix1, ix2), - (ix0, 42, ix2), - ] - for selection in selections: - _test_get_orthogonal_selection(a, z, selection) - - -def test_get_orthogonal_selection_3d(): - # setup - a = np.arange(100000, dtype=int).reshape(200, 50, 10) - z = zarr.v2.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - # boolean arrays - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) - _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) - - # integer arrays - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) - _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) - ix0.sort() - ix1.sort() - ix2.sort() - _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) - ix0 = ix0[::-1] - ix1 = ix1[::-1] - ix2 = ix2[::-1] - _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) - - -def test_orthogonal_indexing_edge_cases(): - a = np.arange(6).reshape(1, 2, 3) - z = zarr.v2.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) - z[:] = a - - expect = oindex(a, (0, slice(None), [0, 1, 2])) - actual = z.oindex[0, :, [0, 1, 2]] - assert_array_equal(expect, actual) - - expect = oindex(a, (0, slice(None), [True, True, True])) - actual = z.oindex[0, :, [True, True, True]] - assert_array_equal(expect, actual) - - -def _test_set_orthogonal_selection(v, a, z, selection): - for value in 42, oindex(v, selection), oindex(v, selection).tolist(): - if isinstance(value, list) and value == []: - # skip these cases as 
cannot preserve all dimensions - continue - # setup expectation - a[:] = 0 - oindex_set(a, selection, value) - # long-form API - z[:] = 0 - z.set_orthogonal_selection(selection, value) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[selection] = value - assert_array_equal(a, z[:]) - - -def test_set_orthogonal_selection_1d(): - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - - # test with different degrees of sparseness - np.random.seed(42) - for p in 0.5, 0.1, 0.01: - # boolean arrays - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_set_orthogonal_selection(v, a, z, ix) - - # integer arrays - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_set_orthogonal_selection(v, a, z, ix) - ix.sort() - _test_set_orthogonal_selection(v, a, z, ix) - ix = ix[::-1] - _test_set_orthogonal_selection(v, a, z, ix) - - # basic selections - for selection in basic_selections_1d: - _test_set_orthogonal_selection(v, a, z, selection) - - -def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice or int - (ix0, slice(1, 5)), - (slice(250, 350), ix1), - (ix0, 4), - (42, ix1), - ] - for selection in selections: - _test_set_orthogonal_selection(v, a, z, selection) - - -def test_set_orthogonal_selection_2d(): - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - # boolean arrays - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) - - # integer arrays - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) - _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) - ix0.sort() - ix1.sort() - _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) - ix0 = ix0[::-1] - ix1 = ix1[::-1] - _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) - - for selection in basic_selections_2d: - _test_set_orthogonal_selection(v, a, z, selection) - - -def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): - selections = ( - # single value - (84, 42, 4), - (-1, -1, -1), - # index all axes with bool array - (ix0, ix1, ix2), - # mixed indexing with single bool array / slice or int - (ix0, slice(15, 25), slice(1, 5)), - (slice(50, 70), ix1, slice(1, 5)), - (slice(50, 70), slice(15, 25), ix2), - (ix0, 42, 4), - (84, ix1, 4), - (84, 42, ix2), - (ix0, slice(15, 25), 4), - (slice(50, 70), ix1, 4), - (slice(50, 70), 42, ix2), - # indexing with two arrays / slice - (ix0, ix1, slice(1, 5)), - # indexing with two arrays / integer - (ix0, ix1, 4), - ) - for selection in selections: - _test_set_orthogonal_selection(v, a, z, selection) - - -def test_set_orthogonal_selection_3d(): - # setup - v = np.arange(100000, dtype=int).reshape(200, 50, 10) - a = np.empty_like(v) - z = zarr.v2.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - # boolean arrays - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, 
size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) - _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) - - # integer arrays - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) - _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) - - # sorted increasing - ix0.sort() - ix1.sort() - ix2.sort() - _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) - - # sorted decreasing - ix0 = ix0[::-1] - ix1 = ix1[::-1] - ix2 = ix2[::-1] - _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) - - -def test_orthogonal_indexing_fallback_on_get_setitem(): - z = zarr.v2.zeros((20, 20)) - z[[1, 2, 3], [1, 2, 3]] = 1 - np.testing.assert_array_equal( - z[:4, :4], - [ - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1], - ], - ) - np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) - # test broadcasting - np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) - # test 1D fancy indexing - z2 = zarr.v2.zeros(5) - z2[[1, 2, 3]] = 1 - np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) - - -def _test_get_coordinate_selection(a, z, selection): - expect = a[selection] - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - actual = z.vindex[selection] - assert_array_equal(expect, actual) - - -coordinate_selections_1d_bad = [ - # slice not supported - slice(5, 15), - slice(None), - Ellipsis, - # bad stuff - 2.3, - "foo", - b"xxx", - None, - (0, 0), - (slice(None), slice(None)), -] - - -# noinspection PyStatementEffect -def test_get_coordinate_selection_1d(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - n = int(a.size * p) - ix = np.random.choice(a.shape[0], size=n, replace=True) - _test_get_coordinate_selection(a, z, ix) - ix.sort() - _test_get_coordinate_selection(a, z, ix) - ix = ix[::-1] - _test_get_coordinate_selection(a, z, ix) - - selections = [ - # test single item - 42, - -1, - # test wraparound - [0, 3, 10, -23, -12, -1], - # test out of order - [3, 105, 23, 127], # not monotonically increasing - # test multi-dimensional selection - np.array([[2, 4], [6, 8]]), - ] - for selection in selections: - _test_get_coordinate_selection(a, z, selection) - - # test errors - bad_selections = coordinate_selections_1d_bad + [ - [a.shape[0] + 1], # out of bounds - [-(a.shape[0] + 1)], # out of bounds - ] - for selection in bad_selections: - with pytest.raises(IndexError): - z.get_coordinate_selection(selection) - with pytest.raises(IndexError): - z.vindex[selection] - - -def test_get_coordinate_selection_2d(): - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - n = int(a.size * p) - ix0 = np.random.choice(a.shape[0], size=n, replace=True) - ix1 = np.random.choice(a.shape[1], size=n, replace=True) - selections = [ - # single value - (42, 4), - (-1, -1), - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - (42, 4), - ] - for selection in selections: - _test_get_coordinate_selection(a, z, selection) - - # not monotonically 
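- # Editor's sketch (hedged, not in the original file): coordinate selection
- # mirrors numpy fancy indexing, so the result takes the shape of the index
- # arrays; a 2x2 pair of index arrays therefore picks four individual points:
- #
- #     a = np.arange(100).reshape(10, 10)
- #     z = zarr.v2.array(a)
- #     ix0 = np.array([[1, 1], [2, 2]])
- #     ix1 = np.array([[1, 3], [2, 1]])
- #     np.testing.assert_array_equal(z.get_coordinate_selection((ix0, ix1)), a[ix0, ix1])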
increasing (first dim) - ix0 = [3, 3, 4, 2, 5] - ix1 = [1, 3, 5, 7, 9] - _test_get_coordinate_selection(a, z, (ix0, ix1)) - - # not monotonically increasing (second dim) - ix0 = [1, 1, 2, 2, 5] - ix1 = [1, 3, 2, 1, 0] - _test_get_coordinate_selection(a, z, (ix0, ix1)) - - # multi-dimensional selection - ix0 = np.array([[1, 1, 2], [2, 2, 5]]) - ix1 = np.array([[1, 3, 2], [1, 0, 0]]) - _test_get_coordinate_selection(a, z, (ix0, ix1)) - - with pytest.raises(IndexError): - selection = slice(5, 15), [1, 2, 3] - z.get_coordinate_selection(selection) - with pytest.raises(IndexError): - selection = [1, 2, 3], slice(5, 15) - z.get_coordinate_selection(selection) - with pytest.raises(IndexError): - selection = Ellipsis, [1, 2, 3] - z.get_coordinate_selection(selection) - with pytest.raises(IndexError): - selection = Ellipsis - z.get_coordinate_selection(selection) - - -def _test_set_coordinate_selection(v, a, z, selection): - for value in 42, v[selection], v[selection].tolist(): - # setup expectation - a[:] = 0 - a[selection] = value - # test long-form API - z[:] = 0 - z.set_coordinate_selection(selection, value) - assert_array_equal(a, z[:]) - # test short-form API - z[:] = 0 - z.vindex[selection] = value - assert_array_equal(a, z[:]) - - -def test_set_coordinate_selection_1d(): - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=v.dtype) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - n = int(a.size * p) - ix = np.random.choice(a.shape[0], size=n, replace=True) - _test_set_coordinate_selection(v, a, z, ix) - - # multi-dimensional selection - ix = np.array([[2, 4], [6, 8]]) - _test_set_coordinate_selection(v, a, z, ix) - - for selection in coordinate_selections_1d_bad: - with pytest.raises(IndexError): - z.set_coordinate_selection(selection, 42) - with pytest.raises(IndexError): - z.vindex[selection] = 42 - - -def test_set_coordinate_selection_2d(): - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - n = int(a.size * p) - ix0 = np.random.choice(a.shape[0], size=n, replace=True) - ix1 = np.random.choice(a.shape[1], size=n, replace=True) - - selections = ( - (42, 4), - (-1, -1), - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - ) - for selection in selections: - _test_set_coordinate_selection(v, a, z, selection) - - # multi-dimensional selection - ix0 = np.array([[1, 2, 3], [4, 5, 6]]) - ix1 = np.array([[1, 3, 2], [2, 0, 5]]) - _test_set_coordinate_selection(v, a, z, (ix0, ix1)) - - -def _test_get_block_selection(a, z, selection, expected_idx): - expect = a[expected_idx] - actual = z.get_block_selection(selection) - assert_array_equal(expect, actual) - actual = z.blocks[selection] - assert_array_equal(expect, actual) - - -block_selections_1d = [ - # test single item - 0, - 5, - # test wraparound - -1, - -4, - # test slice - slice(5), - slice(None, 3), - slice(5, 6), - slice(-3, -1), - slice(None), # Full slice -] - -block_selections_1d_array_projection = [ - # test single item - slice(100), - slice(500, 600), - # test wraparound - slice(1000, None), - slice(700, 800), - # test slice - slice(500), - slice(None, 300), - slice(500, 600), - slice(800, 1000), - slice(None), -] - -block_selections_1d_bad = [ - # 
slice not supported - slice(3, 8, 2), - # bad stuff - 2.3, - "foo", - b"xxx", - None, - (0, 0), - (slice(None), slice(None)), - [0, 5, 3], -] - - -def test_get_block_selection_1d(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - for selection, expected_idx in zip(block_selections_1d, block_selections_1d_array_projection): - _test_get_block_selection(a, z, selection, expected_idx) - - bad_selections = block_selections_1d_bad + [ - z.nchunks + 1, # out of bounds - -(z.nchunks + 1), # out of bounds - ] - - for selection in bad_selections: - with pytest.raises(IndexError): - z.get_block_selection(selection) - with pytest.raises(IndexError): - z.blocks[selection] - - -block_selections_2d = [ - # test single item - (0, 0), - (1, 2), - # test wraparound - (-1, -1), - (-3, -2), - # test slice - (slice(1), slice(2)), - (slice(None, 2), slice(-2, -1)), - (slice(2, 3), slice(-2, None)), - (slice(-3, -1), slice(-3, -2)), - (slice(None), slice(None)), # Full slice -] - -block_selections_2d_array_projection = [ - # test single item - (slice(300), slice(3)), - (slice(300, 600), slice(6, 9)), - # test wraparound - (slice(900, None), slice(9, None)), - (slice(300, 600), slice(6, 9)), - # test slice - (slice(300), slice(6)), - (slice(None, 600), slice(6, 9)), - (slice(600, 900), slice(6, None)), - (slice(300, 900), slice(3, 6)), - (slice(None), slice(None)), # Full slice -] - - -def test_get_block_selection_2d(): - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - for selection, expected_idx in zip(block_selections_2d, block_selections_2d_array_projection): - _test_get_block_selection(a, z, selection, expected_idx) - - with pytest.raises(IndexError): - selection = slice(5, 15), [1, 2, 3] - z.get_block_selection(selection) - with pytest.raises(IndexError): - selection = Ellipsis, [1, 2, 3] - z.get_block_selection(selection) - with pytest.raises(IndexError): # out of bounds - selection = slice(15, 20), slice(None) - z.get_block_selection(selection) - - -def _test_set_block_selection( - v: np.ndarray, a: np.ndarray, z: zarr.v2.Array, selection, expected_idx -): - for value in 42, v[expected_idx], v[expected_idx].tolist(): - # setup expectation - a[:] = 0 - a[expected_idx] = value - # test long-form API - z[:] = 0 - z.set_block_selection(selection, value) - assert_array_equal(a, z[:]) - # test short-form API - z[:] = 0 - z.blocks[selection] = value - assert_array_equal(a, z[:]) - - -def test_set_block_selection_1d(): - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=v.dtype) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - - for selection, expected_idx in zip(block_selections_1d, block_selections_1d_array_projection): - _test_set_block_selection(v, a, z, selection, expected_idx) - - for selection in block_selections_1d_bad: - with pytest.raises(IndexError): - z.set_block_selection(selection, 42) - with pytest.raises(IndexError): - z.blocks[selection] = 42 - - -def test_set_block_selection_2d(): - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty(v.shape, dtype=v.dtype) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - for selection, expected_idx in zip(block_selections_2d, block_selections_2d_array_projection): - _test_set_block_selection(v, a, z, selection, expected_idx) - - with pytest.raises(IndexError): - selection = slice(5, 15), [1, 2, 3] - 
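- # Editor's sketch (hedged): block selection addresses whole chunks, so with
- # chunks=(300, 3) block (i, j) projects onto a[300*i:300*(i+1), 3*j:3*(j+1)],
- # truncated at the array boundary, which is what the projections above encode:
- #
- #     z = zarr.v2.create(shape=(1000, 10), chunks=(300, 3), dtype=int)
- #     assert z.blocks[1, 2].shape == (300, 3)   # one interior chunk
- #     assert z.blocks[3, :].shape == (100, 10)  # final row of blocks is truncated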
z.set_block_selection(selection, 42) - with pytest.raises(IndexError): - selection = Ellipsis, [1, 2, 3] - z.set_block_selection(selection, 42) - with pytest.raises(IndexError): # out of bounds - selection = slice(15, 20), slice(None) - z.set_block_selection(selection, 42) - - -def _test_get_mask_selection(a, z, selection): - expect = a[selection] - actual = z.get_mask_selection(selection) - assert_array_equal(expect, actual) - actual = z.vindex[selection] - assert_array_equal(expect, actual) - - -mask_selections_1d_bad = [ - # slice not supported - slice(5, 15), - slice(None), - Ellipsis, - # bad stuff - 2.3, - "foo", - b"xxx", - None, - (0, 0), - (slice(None), slice(None)), -] - - -# noinspection PyStatementEffect -def test_get_mask_selection_1d(): - # setup - a = np.arange(1050, dtype=int) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_get_mask_selection(a, z, ix) - - # test errors - bad_selections = mask_selections_1d_bad + [ - np.zeros(50, dtype=bool), # too short - np.zeros(2000, dtype=bool), # too long - [[True, False], [False, True]], # too many dimensions - ] - for selection in bad_selections: - with pytest.raises(IndexError): - z.get_mask_selection(selection) - with pytest.raises(IndexError): - z.vindex[selection] - - -# noinspection PyStatementEffect -def test_get_mask_selection_2d(): - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) - _test_get_mask_selection(a, z, ix) - - # test errors - with pytest.raises(IndexError): - z.vindex[np.zeros((1000, 5), dtype=bool)] # too short - with pytest.raises(IndexError): - z.vindex[np.zeros((2000, 10), dtype=bool)] # too long - with pytest.raises(IndexError): - z.vindex[[True, False]] # wrong no. 
dimensions - - -def _test_set_mask_selection(v, a, z, selection): - a[:] = 0 - z[:] = 0 - a[selection] = v[selection] - z.set_mask_selection(selection, v[selection]) - assert_array_equal(a, z[:]) - z[:] = 0 - z.vindex[selection] = v[selection] - assert_array_equal(a, z[:]) - - -def test_set_mask_selection_1d(): - # setup - v = np.arange(1050, dtype=int) - a = np.empty_like(v) - z = zarr.v2.create(shape=a.shape, chunks=100, dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_set_mask_selection(v, a, z, ix) - - for selection in mask_selections_1d_bad: - with pytest.raises(IndexError): - z.set_mask_selection(selection, 42) - with pytest.raises(IndexError): - z.vindex[selection] = 42 - - -def test_set_mask_selection_2d(): - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) - _test_set_mask_selection(v, a, z, ix) - - -def test_get_selection_out(): - # basic selections - a = np.arange(1050) - z = zarr.v2.create(shape=1050, chunks=100, dtype=a.dtype) - z[:] = a - selections = [ - slice(50, 150), - slice(0, 1050), - slice(1, 2), - ] - for selection in selections: - expect = a[selection] - out = zarr.v2.create(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) - z.get_basic_selection(selection, out=out) - assert_array_equal(expect, out[:]) - - with pytest.raises(TypeError): - z.get_basic_selection(Ellipsis, out=[]) - - # orthogonal selections - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice - (ix0, slice(1, 5)), - (slice(250, 350), ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - # mixed int array / bool array - (ix0, np.nonzero(ix1)[0]), - (np.nonzero(ix0)[0], ix1), - ] - for selection in selections: - expect = oindex(a, selection) - # out = zarr.v2.create(shape=expect.shape, chunks=10, dtype=expect.dtype, - # fill_value=0) - out = np.zeros(expect.shape, dtype=expect.dtype) - z.get_orthogonal_selection(selection, out=out) - assert_array_equal(expect, out[:]) - - # coordinate selections - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.v2.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - n = int(a.size * p) - ix0 = np.random.choice(a.shape[0], size=n, replace=True) - ix1 = np.random.choice(a.shape[1], size=n, replace=True) - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - ] - for selection in selections: - expect = a[selection] - out = np.zeros(expect.shape, dtype=expect.dtype) - z.get_coordinate_selection(selection, out=out) - assert_array_equal(expect, out[:]) - - -def test_get_selections_with_fields(): - a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] - a = np.array(a, 
dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - z = zarr.v2.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None) - z[:] = a - - fields_fixture = [ - "foo", - ["foo"], - ["foo", "bar"], - ["foo", "baz"], - ["bar", "baz"], - ["foo", "bar", "baz"], - ["bar", "foo"], - ["baz", "bar", "foo"], - ] - - for fields in fields_fixture: - # total selection - expect = a[fields] - actual = z.get_basic_selection(Ellipsis, fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z[fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z[fields[0], fields[1]] - assert_array_equal(expect, actual) - if isinstance(fields, str): - actual = z[..., fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z[..., fields[0], fields[1]] - assert_array_equal(expect, actual) - - # basic selection with slice - expect = a[fields][0:2] - actual = z.get_basic_selection(slice(0, 2), fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z[0:2, fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z[0:2, fields[0], fields[1]] - assert_array_equal(expect, actual) - - # basic selection with single item - expect = a[fields][1] - actual = z.get_basic_selection(1, fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z[1, fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z[1, fields[0], fields[1]] - assert_array_equal(expect, actual) - - # orthogonal selection - ix = [0, 2] - expect = a[fields][ix] - actual = z.get_orthogonal_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z.oindex[ix, fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z.oindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - - # coordinate selection - ix = [0, 2] - expect = a[fields][ix] - actual = z.get_coordinate_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z.vindex[ix, fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z.vindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - - # mask selection - ix = [True, False, True] - expect = a[fields][ix] - actual = z.get_mask_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative API - if isinstance(fields, str): - actual = z.vindex[ix, fields] - assert_array_equal(expect, actual) - elif len(fields) == 2: - actual = z.vindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - - # missing/bad fields - with pytest.raises(IndexError): - z.get_basic_selection(Ellipsis, fields=["notafield"]) - with pytest.raises(IndexError): - z.get_basic_selection(Ellipsis, fields=slice(None)) - - -def test_set_selections_with_fields(): - v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] - v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - a = np.empty_like(v) - z = zarr.v2.empty_like(v, chunks=2) - - fields_fixture = [ - "foo", - [], - ["foo"], - ["foo", "bar"], - ["foo", "baz"], - ["bar", "baz"], - ["foo", "bar", "baz"], - ["bar", "foo"], - ["baz", "bar", "foo"], - ] - - for fields in fields_fixture: - # currently multi-field assignment is not supported in numpy, so we won't support - # it either - if isinstance(fields, list) 
and len(fields) > 1: - with pytest.raises(IndexError): - z.set_basic_selection(Ellipsis, v, fields=fields) - with pytest.raises(IndexError): - z.set_orthogonal_selection([0, 2], v, fields=fields) - with pytest.raises(IndexError): - z.set_coordinate_selection([0, 2], v, fields=fields) - with pytest.raises(IndexError): - z.set_mask_selection([True, False, True], v, fields=fields) - - else: - if isinstance(fields, list) and len(fields) == 1: - # work around numpy does not support multi-field assignment even if there - # is only one field - key = fields[0] - elif isinstance(fields, list) and len(fields) == 0: - # work around numpy ambiguity about what is a field selection - key = Ellipsis - else: - key = fields - - # setup expectation - a[:] = ("", 0, 0) - z[:] = ("", 0, 0) - assert_array_equal(a, z[:]) - a[key] = v[key] - # total selection - z.set_basic_selection(Ellipsis, v[key], fields=fields) - assert_array_equal(a, z[:]) - - # basic selection with slice - a[:] = ("", 0, 0) - z[:] = ("", 0, 0) - a[key][0:2] = v[key][0:2] - z.set_basic_selection(slice(0, 2), v[key][0:2], fields=fields) - assert_array_equal(a, z[:]) - - # orthogonal selection - a[:] = ("", 0, 0) - z[:] = ("", 0, 0) - ix = [0, 2] - a[key][ix] = v[key][ix] - z.set_orthogonal_selection(ix, v[key][ix], fields=fields) - assert_array_equal(a, z[:]) - - # coordinate selection - a[:] = ("", 0, 0) - z[:] = ("", 0, 0) - ix = [0, 2] - a[key][ix] = v[key][ix] - z.set_coordinate_selection(ix, v[key][ix], fields=fields) - assert_array_equal(a, z[:]) - - # mask selection - a[:] = ("", 0, 0) - z[:] = ("", 0, 0) - ix = [True, False, True] - a[key][ix] = v[key][ix] - z.set_mask_selection(ix, v[key][ix], fields=fields) - assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize( - "selection, arr, expected", - [ - ( - (slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)), - np.arange(2, 100_002).reshape((100, 10, 100)), - [ - (5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), - (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), - (7200, 200, (slice(7, 8, 1), slice(2, 4, 1))), - ], - ), - ( - (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), - np.arange(2, 100_002).reshape((100, 10, 100)), - [ - (5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), - (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), - (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), - (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), - (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), - (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1))), - ], - ), - ( - (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), - np.asfortranarray(np.arange(2, 100_002).reshape((100, 10, 100))), - [ - (5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), - (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), - (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), - (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), - (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), - (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1))), - ], - ), - ( - (slice(5, 8, 1), slice(2, 4, 1)), - np.arange(2, 100_002).reshape((100, 10, 100)), - [ - (5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), - (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), - (7200, 200, (slice(7, 8, 1), slice(2, 4, 1))), - ], - ), - ( - (slice(0, 10, 1),), - np.arange(0, 10).reshape((10)), - [(0, 10, (slice(0, 10, 1),))], - ), - ((0,), np.arange(0, 100).reshape((10, 10)), [(0, 10, (slice(0, 1, 1),))]), - ( - ( - 
0, - 0, - ), - np.arange(0, 100).reshape((10, 10)), - [(0, 1, (slice(0, 1, 1), slice(0, 1, 1)))], - ), - ((0,), np.arange(0, 10).reshape((10)), [(0, 1, (slice(0, 1, 1),))]), - pytest.param( - (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), - np.arange(2, 100002).reshape((10, 1, 10000)), - None, - marks=[pytest.mark.xfail(reason="slice 2 is out of range")], - ), - pytest.param( - (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), - np.arange(2, 100_002).reshape((10, 10_000)), - None, - marks=[pytest.mark.xfail(reason="slice 2 is out of range")], - ), - ], -) -def test_PartialChunkIterator(selection, arr, expected): - PCI = PartialChunkIterator(selection, arr.shape) - results = list(PCI) - assert results == expected - - -def test_slice_selection_uints(): - arr = np.arange(24).reshape((4, 6)) - idx = np.uint64(3) - slice_sel = make_slice_selection((idx,)) - assert arr[tuple(slice_sel)].shape == (1, 6) - - -def test_numpy_int_indexing(): - a = np.arange(1050) - z = zarr.v2.create(shape=1050, chunks=100, dtype=a.dtype) - z[:] = a - assert a[42] == z[42] - assert a[np.int64(42)] == z[np.int64(42)] - - -@pytest.mark.parametrize( - "shape, chunks, ops", - [ - # 1D test cases - ((1070,), (50,), [("__getitem__", (slice(200, 400),))]), - ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]), - ( - (1070,), - (50,), - [ - ("__getitem__", (slice(200, 400),)), - ("__setitem__", (slice(200, 400, 100),)), - ], - ), - # 2D test cases - ( - (40, 50), - (5, 8), - [ - ("__getitem__", (slice(6, 37, 13), (slice(4, 10)))), - ("__setitem__", (slice(None), (slice(None)))), - ], - ), - ], -) -def test_accessed_chunks(shape, chunks, ops): - # Test that only the required chunks are accessed during basic selection operations - # shape: array shape - # chunks: chunk size - # ops: list of tuples with (optype, tuple of slices) - # optype = "__getitem__" or "__setitem__", tuple length must match number of dims - import itertools - - # Use a counting dict as the backing store so we can track the items access - store = CountingDict() - z = zarr.v2.create(shape=shape, chunks=chunks, store=store) - - for ii, (optype, slices) in enumerate(ops): - # Resolve the slices into the accessed chunks for each dimension - chunks_per_dim = [] - for N, C, sl in zip(shape, chunks, slices): - chunk_ind = np.arange(N, dtype=int)[sl] // C - chunks_per_dim.append(np.unique(chunk_ind)) - - # Combine and generate the cartesian product to determine the chunks keys that - # will be accessed - chunks_accessed = [] - for comb in itertools.product(*chunks_per_dim): - chunks_accessed.append(".".join([str(ci) for ci in comb])) - - counts_before = store.counter.copy() - - # Perform the operation - if optype == "__getitem__": - z[slices] - else: - z[slices] = ii - - # Get the change in counts - delta_counts = store.counter - counts_before - - # Check that the access counts for the operation have increased by one for all - # the chunks we expect to be included - for ci in chunks_accessed: - assert delta_counts.pop((optype, ci)) == 1 - - # If the chunk was partially written to it will also have been read once. 
We - # don't determine if the chunk was actually partial here, just that the - # counts are consistent that this might have happened - if optype == "__setitem__": - assert ("__getitem__", ci) not in delta_counts or delta_counts.pop( - ("__getitem__", ci) - ) == 1 - # Check that no other chunks were accessed - assert len(delta_counts) == 0 diff --git a/tests/v2/test_info.py b/tests/v2/test_info.py deleted file mode 100644 index 9f68119295..0000000000 --- a/tests/v2/test_info.py +++ /dev/null @@ -1,66 +0,0 @@ -import numcodecs -import pytest - -import zarr.v2 -from zarr.v2.util import InfoReporter - - -@pytest.mark.parametrize("array_size", [10, 15000]) -def test_info(array_size): - # setup - g = zarr.v2.group(store=dict(), chunk_store=dict(), synchronizer=zarr.v2.ThreadSynchronizer()) - g.create_group("foo") - z = g.zeros("bar", shape=array_size, filters=[numcodecs.Adler32()]) - - # test group info - items = g.info_items() - keys = sorted([k for k, _ in items]) - expected_keys = sorted( - [ - "Type", - "Read-only", - "Synchronizer type", - "Store type", - "Chunk store type", - "No. members", - "No. arrays", - "No. groups", - "Arrays", - "Groups", - "Name", - ] - ) - assert expected_keys == keys - - # can also get a string representation of info via the info attribute - assert isinstance(g.info, InfoReporter) - assert "Type" in repr(g.info) - - # test array info - items = z.info_items() - keys = sorted([k for k, _ in items]) - expected_keys = sorted( - [ - "Type", - "Data type", - "Shape", - "Chunk shape", - "Order", - "Read-only", - "Filter [0]", - "Compressor", - "Synchronizer type", - "Store type", - "Chunk store type", - "No. bytes", - "No. bytes stored", - "Storage ratio", - "Chunks initialized", - "Name", - ] - ) - assert expected_keys == keys - - # can also get a string representation of info via the info attribute - assert isinstance(z.info, InfoReporter) - assert "Type" in repr(z.info) diff --git a/tests/v2/test_meta.py b/tests/v2/test_meta.py deleted file mode 100644 index b7c00ec64c..0000000000 --- a/tests/v2/test_meta.py +++ /dev/null @@ -1,527 +0,0 @@ -import base64 -import json - -import numpy as np -import pytest - -from zarr.v2.codecs import Blosc, Delta, Pickle, Zlib -from zarr.v2.errors import MetadataError -from zarr.v2.meta import ( - ZARR_FORMAT, - decode_array_metadata, - decode_dtype, - decode_group_metadata, - encode_array_metadata, - encode_dtype, - encode_fill_value, - decode_fill_value, -) -from zarr.v2.util import normalize_dtype, normalize_fill_value - - -def assert_json_equal(expect, actual): - if isinstance(actual, bytes): - actual = str(actual, "ascii") - ej = json.loads(expect) - aj = json.loads(actual) - assert ej == aj - - -def test_encode_decode_array_1(): - meta = dict( - shape=(100,), - chunks=(10,), - dtype=np.dtype(" CuPyCPUCompressor: - if compressor: - compressor = getattr(zarr.v2.codecs, compressor)() - return CuPyCPUCompressor(compressor) - - -def init_store(tmp_path, store_type) -> Optional[Store]: - if store_type is DirectoryStore: - return store_type(str(tmp_path / "store")) - if store_type is MemoryStore: - return MemoryStore() - return None - - -def ensure_module(module): - if isinstance(module, str): - return pytest.importorskip(module) - return module - - -param_module_and_compressor = [ - (MyArray, None), - ("cupy", init_compressor(None)), - ("cupy", init_compressor("Zlib")), - ("cupy", init_compressor("Blosc")), -] - - -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -@pytest.mark.parametrize("store_type", 
[None, DirectoryStore, MemoryStore, ZipStore]) -def test_array(tmp_path, module, compressor, store_type): - xp = ensure_module(module) - - store = init_store(tmp_path / "from_cupy_array", store_type) - a = xp.arange(100) - z = array(a, chunks=10, compressor=compressor, store=store, meta_array=xp.empty(())) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - assert isinstance(z.meta_array, type(xp.empty(()))) - xp.testing.assert_array_equal(a, z[:]) - - # with array-like - store = init_store(tmp_path / "from_list", store_type) - a = list(range(100)) - z = array(a, chunks=10, compressor=compressor, store=store, meta_array=xp.empty(())) - assert (100,) == z.shape - assert np.asarray(a).dtype == z.dtype - xp.testing.assert_array_equal(a, z[:]) - - # with another zarr array - store = init_store(tmp_path / "from_another_store", store_type) - z2 = array(z, compressor=compressor, store=store, meta_array=xp.empty(())) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype - xp.testing.assert_array_equal(z[:], z2[:]) - - store = init_store(tmp_path / "open_array", store_type) - a = xp.arange(100) - z = open_array( - store, - shape=a.shape, - dtype=a.dtype, - chunks=10, - compressor=compressor, - meta_array=xp.empty(()), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - assert isinstance(z.meta_array, type(xp.empty(()))) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -def test_empty(module, compressor): - xp = ensure_module(module) - z = empty( - 100, - chunks=10, - compressor=compressor, - meta_array=xp.empty(()), - ) - assert (100,) == z.shape - assert (10,) == z.chunks - - -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -def test_zeros(module, compressor): - xp = ensure_module(module) - z = zeros( - 100, - chunks=10, - compressor=compressor, - meta_array=xp.empty(()), - ) - assert (100,) == z.shape - assert (10,) == z.chunks - xp.testing.assert_array_equal(np.zeros(100), z[:]) - - -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -def test_ones(module, compressor): - xp = ensure_module(module) - z = ones( - 100, - chunks=10, - compressor=compressor, - meta_array=xp.empty(()), - ) - assert (100,) == z.shape - assert (10,) == z.chunks - xp.testing.assert_array_equal(np.ones(100), z[:]) - - -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -def test_full(module, compressor): - xp = ensure_module(module) - z = full( - 100, - chunks=10, - fill_value=42, - dtype="i4", - compressor=compressor, - meta_array=xp.empty(()), - ) - assert (100,) == z.shape - assert (10,) == z.chunks - xp.testing.assert_array_equal(np.full(100, fill_value=42, dtype="i4"), z[:]) - - # nan - z = full( - 100, - chunks=10, - fill_value=np.nan, - dtype="f8", - compressor=compressor, - meta_array=xp.empty(()), - ) - assert np.all(np.isnan(z[:])) - - -@pytest.mark.parametrize("group_create_function", [group, open_group]) -@pytest.mark.parametrize("module, compressor", param_module_and_compressor) -@pytest.mark.parametrize("store_type", [None, DirectoryStore, MemoryStore, ZipStore]) -def test_group(tmp_path, group_create_function, module, compressor, store_type): - xp = ensure_module(module) - store = init_store(tmp_path, store_type) - g = group_create_function(store, meta_array=xp.empty(())) - g.ones("data", shape=(10, 11), dtype=int, 
compressor=compressor) - a = g["data"] - assert a.shape == (10, 11) - assert a.dtype == int - assert isinstance(a, Array) - assert isinstance(a[:], type(xp.empty(()))) - assert (a[:] == 1).all() - assert isinstance(g.meta_array, type(xp.empty(()))) diff --git a/tests/v2/test_n5.py b/tests/v2/test_n5.py deleted file mode 100644 index 238e9b2c6e..0000000000 --- a/tests/v2/test_n5.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest - -from zarr.v2.n5 import N5ChunkWrapper, N5FSStore -from zarr.v2.creation import create -from zarr.v2.storage import atexit_rmtree -from numcodecs import GZip -import numpy as np -from typing import Tuple -import json -import atexit - -from .util import have_fsspec - - -def test_make_n5_chunk_wrapper(): - dtype = "uint8" - chunk_shape = (10,) - codec = GZip() - # ValueError when specifying both compressor and compressor_config - with pytest.raises(ValueError): - N5ChunkWrapper( - dtype, chunk_shape=chunk_shape, compressor_config=codec.get_config(), compressor=codec - ) - - wrapper_a = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor_config=codec.get_config()) - wrapper_b = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor=codec) - assert wrapper_a == wrapper_b - - -@pytest.mark.parametrize("chunk_shape", ((2,), (4, 4), (8, 8, 8))) -def test_partial_chunk_decode(chunk_shape: Tuple[int, ...]): - # Test that the N5Chunk wrapper can handle fractional chunks that - # may be generated by other N5 implementations - dtype = "uint8" - codec = GZip() - codec_wrapped = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor=codec) - subslices = tuple(slice(0, cs // 2) for cs in chunk_shape) - chunk = np.zeros(chunk_shape, dtype=dtype) - chunk[subslices] = 1 - subchunk = np.ascontiguousarray(chunk[subslices]) - assert np.array_equal(codec_wrapped.decode(codec_wrapped.encode(subchunk)), chunk) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -def test_dtype_decode(): - path = "data/array.n5" - atexit_rmtree(path) - atexit.register(atexit_rmtree, path) - n5_store = N5FSStore(path) - create(100, store=n5_store) - dtype_n5 = json.loads(n5_store[".zarray"])["dtype"] - dtype_zarr = json.loads(create(100).store[".zarray"])["dtype"] - assert dtype_n5 == dtype_zarr diff --git a/tests/v2/test_storage.py b/tests/v2/test_storage.py deleted file mode 100644 index cce1e5cb3c..0000000000 --- a/tests/v2/test_storage.py +++ /dev/null @@ -1,2505 +0,0 @@ -import array -import atexit -import json -import os -import pathlib -import sys -import pickle -import shutil -import tempfile -from contextlib import contextmanager -from pickle import PicklingError -from zipfile import ZipFile - -import numpy as np -import pytest -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from numcodecs.compat import ensure_bytes - -import zarr.v2 -from zarr.v2 import meta_v1 -from zarr.v2.codecs import BZ2, AsType, Blosc, Zlib -from zarr.v2.context import Context -from zarr.v2.convenience import consolidate_metadata -from zarr.v2.errors import ContainsArrayError, ContainsGroupError, MetadataError -from zarr.v2.hierarchy import group -from zarr.v2.meta import ZARR_FORMAT, decode_array_metadata -from zarr.v2.n5 import N5Store, N5FSStore, N5_FORMAT, n5_attrs_key -from zarr.v2.storage import ( - ABSStore, - ConsolidatedMetadataStore, - DBMStore, - DictStore, - DirectoryStore, - KVStore, - LMDBStore, - LRUStoreCache, - MemoryStore, - MongoDBStore, - NestedDirectoryStore, - RedisStore, - SQLiteStore, - Store, - TempStore, - ZipStore, - array_meta_key, - 
atexit_rmglob, - atexit_rmtree, - attrs_key, - default_compressor, - getsize, - group_meta_key, - init_array, - init_group, - migrate_1to2, - normalize_store_arg, -) -from zarr.v2.storage import FSStore, rename, listdir -from .util import CountingDict, have_fsspec, skip_test_env_var, abs_container, mktemp -from zarr.v2.util import ConstantMap, json_dumps - - -@contextmanager -def does_not_raise(): - yield - - -@pytest.fixture( - params=[ - (None, "."), - (".", "."), - ("/", "/"), - ] -) -def dimension_separator_fixture(request): - return request.param - - -def skip_if_nested_chunks(**kwargs): - if kwargs.get("dimension_separator") == "/": - pytest.skip("nested chunks are unsupported") - - -def test_kvstore_repr(): - repr(KVStore(dict())) - - -def test_ensure_store(): - class InvalidStore: - pass - - with pytest.raises(ValueError): - Store._ensure_store(InvalidStore()) - - # cannot initialize without a store - with pytest.raises(ValueError): - Store._ensure_store(None) - - -def test_capabilities(): - s = KVStore(dict()) - assert s.is_readable() - assert s.is_listable() - assert s.is_erasable() - assert s.is_writeable() - - -def test_getsize_non_implemented(): - assert getsize(object()) == -1 - - -def test_kvstore_eq(): - assert KVStore(dict()) != dict() - - -def test_coverage_rename(): - store = dict() - store["a"] = 1 - rename(store, "a", "b") - - -def test_deprecated_listdir_nostore(): - store = dict() - with pytest.warns(UserWarning, match="has no `listdir`"): - listdir(store) - - -class StoreTests: - """Abstract store tests.""" - - version = 2 - root = "" - - def create_store(self, **kwargs): # pragma: no cover - # implement in sub-class - raise NotImplementedError - - def test_context_manager(self): - with self.create_store(): - pass - - def test_get_set_del_contains(self): - store = self.create_store() - - # test __contains__, __getitem__, __setitem__ - key = self.root + "foo" - assert key not in store - with pytest.raises(KeyError): - # noinspection PyStatementEffect - store[key] - store[key] = b"bar" - assert key in store - assert b"bar" == ensure_bytes(store[key]) - - # test __delitem__ (optional) - try: - del store[key] - except NotImplementedError: - pass - else: - assert key not in store - with pytest.raises(KeyError): - # noinspection PyStatementEffect - store[key] - with pytest.raises(KeyError): - # noinspection PyStatementEffect - del store[key] - - store.close() - - def test_set_invalid_content(self): - store = self.create_store() - - with pytest.raises(TypeError): - store[self.root + "baz"] = list(range(5)) - - store.close() - - def test_clear(self): - store = self.create_store() - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"qux" - assert len(store) == 2 - store.clear() - assert len(store) == 0 - assert self.root + "foo" not in store - assert self.root + "baz" not in store - - store.close() - - def test_pop(self): - store = self.create_store() - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"qux" - assert len(store) == 2 - v = store.pop(self.root + "foo") - assert ensure_bytes(v) == b"bar" - assert len(store) == 1 - v = store.pop(self.root + "baz") - assert ensure_bytes(v) == b"qux" - assert len(store) == 0 - with pytest.raises(KeyError): - store.pop(self.root + "xxx") - v = store.pop(self.root + "xxx", b"default") - assert v == b"default" - v = store.pop(self.root + "xxx", b"") - assert v == b"" - v = store.pop(self.root + "xxx", None) - assert v is None - - store.close() - - def test_popitem(self): - store = self.create_store() - 
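- # Editor's note: a hedged sketch, not part of the original suite. The stores
- # exercised by StoreTests are MutableMappings of str keys to bytes-like
- # values, so a minimal conforming implementation needs only the five mapping
- # methods:
- #
- #     class MinimalStore(Store):
- #         def __init__(self):
- #             self._d = {}
- #         def __getitem__(self, key):
- #             return self._d[key]
- #         def __setitem__(self, key, value):
- #             self._d[key] = value
- #         def __delitem__(self, key):
- #             del self._d[key]
- #         def __iter__(self):
- #             return iter(self._d)
- #         def __len__(self):
- #             return len(self._d)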
store[self.root + "foo"] = b"bar" - k, v = store.popitem() - assert k == self.root + "foo" - assert ensure_bytes(v) == b"bar" - assert len(store) == 0 - with pytest.raises(KeyError): - store.popitem() - - store.close() - - def test_writeable_values(self): - store = self.create_store() - - # __setitem__ should accept any value that implements buffer interface - store[self.root + "foo1"] = b"bar" - store[self.root + "foo2"] = bytearray(b"bar") - store[self.root + "foo3"] = array.array("B", b"bar") - store[self.root + "foo4"] = np.frombuffer(b"bar", dtype="u1") - - store.close() - - def test_update(self): - store = self.create_store() - assert self.root + "foo" not in store - assert self.root + "baz" not in store - - if self.version == 2: - store.update(foo=b"bar", baz=b"quux") - else: - kv = {self.root + "foo": b"bar", self.root + "baz": b"quux"} - store.update(kv) - - assert b"bar" == ensure_bytes(store[self.root + "foo"]) - assert b"quux" == ensure_bytes(store[self.root + "baz"]) - - store.close() - - def test_iterators(self): - store = self.create_store() - - # test iterator methods on empty store - assert 0 == len(store) - assert set() == set(store) - assert set() == set(store.keys()) - assert set() == set(store.values()) - assert set() == set(store.items()) - - # setup some values - store[self.root + "a"] = b"aaa" - store[self.root + "b"] = b"bbb" - store[self.root + "c/d"] = b"ddd" - store[self.root + "c/e/f"] = b"fff" - - # test iterators on store with data - assert 4 == len(store) - expected = set(self.root + k for k in ["a", "b", "c/d", "c/e/f"]) - assert expected == set(store) - assert expected == set(store.keys()) - assert {b"aaa", b"bbb", b"ddd", b"fff"} == set(map(ensure_bytes, store.values())) - assert { - (self.root + "a", b"aaa"), - (self.root + "b", b"bbb"), - (self.root + "c/d", b"ddd"), - (self.root + "c/e/f", b"fff"), - } == set(map(lambda kv: (kv[0], ensure_bytes(kv[1])), store.items())) - - store.close() - - def test_pickle(self): - # setup store - store = self.create_store() - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"quux" - n = len(store) - keys = sorted(store.keys()) - - # round-trip through pickle - dump = pickle.dumps(store) - # some stores cannot be opened twice at the same time, need to close - # store before can round-trip through pickle - store.close() - # check can still pickle after close - assert dump == pickle.dumps(store) - store2 = pickle.loads(dump) - - # verify - assert n == len(store2) - assert keys == sorted(store2.keys()) - assert b"bar" == ensure_bytes(store2[self.root + "foo"]) - assert b"quux" == ensure_bytes(store2[self.root + "baz"]) - - store2.close() - - def test_getsize(self): - store = self.create_store() - if isinstance(store, dict) or hasattr(store, "getsize"): - assert 0 == getsize(store) - store["foo"] = b"x" - assert 1 == getsize(store) - assert 1 == getsize(store, "foo") - store["bar"] = b"yy" - assert 3 == getsize(store) - assert 2 == getsize(store, "bar") - store["baz"] = bytearray(b"zzz") - assert 6 == getsize(store) - assert 3 == getsize(store, "baz") - store["quux"] = array.array("B", b"zzzz") - assert 10 == getsize(store) - assert 4 == getsize(store, "quux") - store["spong"] = np.frombuffer(b"zzzzz", dtype="u1") - assert 15 == getsize(store) - assert 5 == getsize(store, "spong") - - store.close() - - # noinspection PyStatementEffect - def test_hierarchy(self): - # setup - store = self.create_store() - store[self.root + "a"] = b"aaa" - store[self.root + "b"] = b"bbb" - store[self.root + "c/d"] = b"ddd" - 
store[self.root + "c/e/f"] = b"fff" - store[self.root + "c/e/g"] = b"ggg" - - # check keys - assert self.root + "a" in store - assert self.root + "b" in store - assert self.root + "c/d" in store - assert self.root + "c/e/f" in store - assert self.root + "c/e/g" in store - assert self.root + "c" not in store - assert self.root + "c/" not in store - assert self.root + "c/e" not in store - assert self.root + "c/e/" not in store - assert self.root + "c/d/x" not in store - - # check __getitem__ - with pytest.raises(KeyError): - store[self.root + "c"] - with pytest.raises(KeyError): - store[self.root + "c/e"] - with pytest.raises(KeyError): - store[self.root + "c/d/x"] - - # test getsize (optional) - if hasattr(store, "getsize"): - # TODO: proper behavior of getsize? - # v3 returns size of all nested arrays, not just the - # size of the arrays in the current folder. - if self.version == 2: - assert 6 == store.getsize() - else: - assert 15 == store.getsize() - assert 3 == store.getsize("a") - assert 3 == store.getsize("b") - if self.version == 2: - assert 3 == store.getsize("c") - else: - assert 9 == store.getsize("c") - assert 3 == store.getsize("c/d") - assert 6 == store.getsize("c/e") - assert 3 == store.getsize("c/e/f") - assert 3 == store.getsize("c/e/g") - # non-existent paths - assert 0 == store.getsize("x") - assert 0 == store.getsize("a/x") - assert 0 == store.getsize("c/x") - assert 0 == store.getsize("c/x/y") - assert 0 == store.getsize("c/d/y") - assert 0 == store.getsize("c/d/y/z") - - # access item via full path - assert 3 == store.getsize(self.root + "a") - - # test listdir (optional) - if hasattr(store, "listdir"): - assert {"a", "b", "c"} == set(store.listdir(self.root)) - assert {"d", "e"} == set(store.listdir(self.root + "c")) - assert {"f", "g"} == set(store.listdir(self.root + "c/e")) - # no exception raised if path does not exist or is leaf - assert [] == store.listdir(self.root + "x") - assert [] == store.listdir(self.root + "a/x") - assert [] == store.listdir(self.root + "c/x") - assert [] == store.listdir(self.root + "c/x/y") - assert [] == store.listdir(self.root + "c/d/y") - assert [] == store.listdir(self.root + "c/d/y/z") - # the following is listdir(filepath), for which fsspec gives [filepath] - # as posix would, but an empty list was previously assumed - # assert [] == store.listdir(self.root + "c/e/f") - - # test rename (optional) - if store.is_erasable(): - store.rename("c/e", "c/e2") - assert self.root + "c/d" in store - assert self.root + "c/e" not in store - assert self.root + "c/e/f" not in store - assert self.root + "c/e/g" not in store - assert self.root + "c/e2" not in store - assert self.root + "c/e2/f" in store - assert self.root + "c/e2/g" in store - store.rename("c/e2", "c/e") - assert self.root + "c/d" in store - assert self.root + "c/e2" not in store - assert self.root + "c/e2/f" not in store - assert self.root + "c/e2/g" not in store - assert self.root + "c/e" not in store - assert self.root + "c/e/f" in store - assert self.root + "c/e/g" in store - store.rename("c", "c1/c2/c3") - assert self.root + "a" in store - assert self.root + "c" not in store - assert self.root + "c/d" not in store - assert self.root + "c/e" not in store - assert self.root + "c/e/f" not in store - assert self.root + "c/e/g" not in store - assert self.root + "c1" not in store - assert self.root + "c1/c2" not in store - assert self.root + "c1/c2/c3" not in store - assert self.root + "c1/c2/c3/d" in store - assert self.root + "c1/c2/c3/e" not in store - assert self.root + 
"c1/c2/c3/e/f" in store - assert self.root + "c1/c2/c3/e/g" in store - store.rename("c1/c2/c3", "c") - assert self.root + "c" not in store - assert self.root + "c/d" in store - assert self.root + "c/e" not in store - assert self.root + "c/e/f" in store - assert self.root + "c/e/g" in store - assert self.root + "c1" not in store - assert self.root + "c1/c2" not in store - assert self.root + "c1/c2/c3" not in store - assert self.root + "c1/c2/c3/d" not in store - assert self.root + "c1/c2/c3/e" not in store - assert self.root + "c1/c2/c3/e/f" not in store - assert self.root + "c1/c2/c3/e/g" not in store - - # test rmdir (optional) - store.rmdir("c/e") - assert self.root + "c/d" in store - assert self.root + "c/e/f" not in store - assert self.root + "c/e/g" not in store - store.rmdir("c") - assert self.root + "c/d" not in store - store.rmdir() - assert self.root + "a" not in store - assert self.root + "b" not in store - store[self.root + "a"] = b"aaa" - store[self.root + "c/d"] = b"ddd" - store[self.root + "c/e/f"] = b"fff" - # no exceptions raised if path does not exist or is leaf - store.rmdir("x") - store.rmdir("a/x") - store.rmdir("c/x") - store.rmdir("c/x/y") - store.rmdir("c/d/y") - store.rmdir("c/d/y/z") - store.rmdir("c/e/f") - assert self.root + "a" in store - assert self.root + "c/d" in store - assert self.root + "c/e/f" in store - - store.close() - - def test_init_array(self, dimension_separator_fixture): - pass_dim_sep, want_dim_sep = dimension_separator_fixture - - store = self.create_store(dimension_separator=pass_dim_sep) - init_array(store, shape=1000, chunks=100) - - # check metadata - assert array_meta_key in store - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - assert default_compressor.get_config() == meta["compressor"] - assert meta["fill_value"] is None - # Missing MUST be assumed to be "." 
- assert meta.get("dimension_separator", ".") is want_dim_sep - - store.close() - - def test_init_array_overwrite(self): - self._test_init_array_overwrite("F") - - def test_init_array_overwrite_path(self): - self._test_init_array_overwrite_path("F") - - def test_init_array_overwrite_chunk_store(self): - self._test_init_array_overwrite_chunk_store("F") - - def test_init_group_overwrite(self): - self._test_init_group_overwrite("F") - - def test_init_group_overwrite_path(self): - self._test_init_group_overwrite_path("F") - - def test_init_group_overwrite_chunk_store(self): - self._test_init_group_overwrite_chunk_store("F") - - def _test_init_array_overwrite(self, order): - # setup - store = self.create_store() - path = None - mkey = array_meta_key - meta = dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=Zlib(1).get_config(), - fill_value=0, - order=order, - filters=None, - ) - - store[mkey] = store._metadata_class.encode_array_metadata(meta) - - # don't overwrite (default) - with pytest.raises(ContainsArrayError): - init_array(store, shape=1000, chunks=100, path=path) - - # do overwrite - try: - init_array(store, shape=1000, chunks=100, dtype="i4", overwrite=True, path=path) - except NotImplementedError: - pass - else: - assert mkey in store - meta = store._metadata_class.decode_array_metadata(store[mkey]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - assert (100,) == meta["chunks"] - assert np.dtype("i4") == meta["dtype"] - else: - assert (100,) == meta["chunk_grid"]["chunk_shape"] - assert np.dtype("i4") == meta["data_type"] - assert (1000,) == meta["shape"] - - store.close() - - def test_init_array_path(self): - path = "foo/bar" - store = self.create_store() - init_array(store, shape=1000, chunks=100, path=path) - - # check metadata - mkey = path + "/" + array_meta_key - assert mkey in store - meta = store._metadata_class.decode_array_metadata(store[mkey]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - assert default_compressor.get_config() == meta["compressor"] - assert (1000,) == meta["shape"] - assert meta["fill_value"] is None - - store.close() - - def _test_init_array_overwrite_path(self, order): - # setup - path = "foo/bar" - store = self.create_store() - mkey = path + "/" + array_meta_key - meta = dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=Zlib(1).get_config(), - fill_value=0, - order=order, - filters=None, - ) - store[mkey] = store._metadata_class.encode_array_metadata(meta) - - # don't overwrite - with pytest.raises(ContainsArrayError): - init_array(store, shape=1000, chunks=100, path=path) - - # do overwrite - try: - init_array(store, shape=1000, chunks=100, dtype="i4", path=path, overwrite=True) - except NotImplementedError: - pass - else: - if self.version == 2: - assert group_meta_key in store - assert array_meta_key not in store - assert mkey in store - # should have been overwritten - meta = store._metadata_class.decode_array_metadata(store[mkey]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - assert (100,) == meta["chunks"] - assert np.dtype("i4") == meta["dtype"] - else: - assert (100,) == meta["chunk_grid"]["chunk_shape"] - assert np.dtype("i4") == meta["data_type"] - assert (1000,) == meta["shape"] - - store.close() - - def test_init_array_overwrite_group(self): - # setup - path = "foo/bar" - store = self.create_store() - array_key = path + "/" + array_meta_key - group_key = path + "/" + 
group_meta_key - store[group_key] = store._metadata_class.encode_group_metadata() - - # don't overwrite - with pytest.raises(ContainsGroupError): - init_array(store, shape=1000, chunks=100, path=path) - - # do overwrite - try: - init_array(store, shape=1000, chunks=100, dtype="i4", path=path, overwrite=True) - except NotImplementedError: - pass - else: - assert group_key not in store - assert array_key in store - meta = store._metadata_class.decode_array_metadata(store[array_key]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - assert (100,) == meta["chunks"] - assert np.dtype("i4") == meta["dtype"] - else: - assert (100,) == meta["chunk_grid"]["chunk_shape"] - assert np.dtype("i4") == meta["data_type"] - assert (1000,) == meta["shape"] - - store.close() - - def _test_init_array_overwrite_chunk_store(self, order): - # setup - store = self.create_store() - chunk_store = self.create_store() - - path = None - data_path = "" - mkey = array_meta_key - meta = dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=None, - fill_value=0, - filters=None, - order=order, - ) - - store[mkey] = store._metadata_class.encode_array_metadata(meta) - - chunk_store[data_path + "0"] = b"aaa" - chunk_store[data_path + "1"] = b"bbb" - - # don't overwrite (default) - with pytest.raises(ContainsArrayError): - init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) - - # do overwrite - try: - init_array( - store, - path=path, - shape=1000, - chunks=100, - dtype="i4", - overwrite=True, - chunk_store=chunk_store, - ) - except NotImplementedError: - pass - else: - assert mkey in store - meta = store._metadata_class.decode_array_metadata(store[mkey]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - assert (100,) == meta["chunks"] - assert np.dtype("i4") == meta["dtype"] - else: - assert (100,) == meta["chunk_grid"]["chunk_shape"] - assert np.dtype("i4") == meta["data_type"] - assert (1000,) == meta["shape"] - assert data_path + "0" not in chunk_store - assert data_path + "1" not in chunk_store - - store.close() - chunk_store.close() - - def test_init_array_compat(self): - store = self.create_store() - path = None - mkey = array_meta_key - init_array(store, path=path, shape=1000, chunks=100, compressor="none") - meta = store._metadata_class.decode_array_metadata(store[mkey]) - if self.version == 2: - assert meta["compressor"] is None - else: - assert "compressor" not in meta - store.close() - - def test_init_group(self): - store = self.create_store() - path = None - mkey = group_meta_key - init_group(store, path=path) - - # check metadata - assert mkey in store - meta = store._metadata_class.decode_group_metadata(store[mkey]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - else: - assert meta == {"attributes": {}} - - store.close() - - def _test_init_group_overwrite(self, order): - # setup - store = self.create_store() - store[array_meta_key] = store._metadata_class.encode_array_metadata( - dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=None, - fill_value=0, - order=order, - filters=None, - ) - ) - - # don't overwrite array (default) - with pytest.raises(ContainsArrayError): - init_group(store) - - # do overwrite - try: - init_group(store, overwrite=True) - except NotImplementedError: - pass - else: - assert array_meta_key not in store - assert group_meta_key in store - meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) - assert ZARR_FORMAT == 
meta["zarr_format"] - - # don't overwrite group - with pytest.raises(ValueError): - init_group(store) - - store.close() - - def _test_init_group_overwrite_path(self, order): - # setup - path = "foo/bar" - store = self.create_store() - meta = dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=None, - fill_value=0, - order=order, - filters=None, - ) - array_key = path + "/" + array_meta_key - group_key = path + "/" + group_meta_key - store[array_key] = store._metadata_class.encode_array_metadata(meta) - - # don't overwrite - with pytest.raises(ValueError): - init_group(store, path=path) - - # do overwrite - try: - init_group(store, overwrite=True, path=path) - except NotImplementedError: - pass - else: - if self.version == 2: - assert array_meta_key not in store - assert group_meta_key in store - assert array_key not in store - assert group_key in store - # should have been overwritten - meta = store._metadata_class.decode_group_metadata(store[group_key]) - if self.version == 2: - assert ZARR_FORMAT == meta["zarr_format"] - else: - assert meta == {"attributes": {}} - - store.close() - - def _test_init_group_overwrite_chunk_store(self, order): - # setup - store = self.create_store() - chunk_store = self.create_store() - store[array_meta_key] = store._metadata_class.encode_array_metadata( - dict( - shape=(2000,), - chunks=(200,), - dtype=np.dtype("u1"), - compressor=None, - fill_value=0, - filters=None, - order=order, - ) - ) - chunk_store["foo"] = b"bar" - chunk_store["baz"] = b"quux" - - # don't overwrite array (default) - with pytest.raises(ValueError): - init_group(store, chunk_store=chunk_store) - - # do overwrite - try: - init_group(store, overwrite=True, chunk_store=chunk_store) - except NotImplementedError: - pass - else: - assert array_meta_key not in store - assert group_meta_key in store - meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert "foo" not in chunk_store - assert "baz" not in chunk_store - - # don't overwrite group - with pytest.raises(ValueError): - init_group(store) - - store.close() - chunk_store.close() - - -class TestMappingStore(StoreTests): - def create_store(self, **kwargs): - skip_if_nested_chunks(**kwargs) - return KVStore(dict()) - - def test_set_invalid_content(self): - # Generic mappings support non-buffer types - pass - - -def setdel_hierarchy_checks(store, root=""): - # these tests are for stores that are aware of hierarchy levels; this - # behaviour is not strictly required by Zarr but these tests are included - # to define behaviour of MemoryStore and DirectoryStore classes - - # check __setitem__ and __delitem__ blocked by leaf - - store[root + "a/b"] = b"aaa" - with pytest.raises(KeyError): - store[root + "a/b/c"] = b"xxx" - with pytest.raises(KeyError): - del store[root + "a/b/c"] - - store[root + "d"] = b"ddd" - with pytest.raises(KeyError): - store[root + "d/e/f"] = b"xxx" - with pytest.raises(KeyError): - del store[root + "d/e/f"] - - # test __setitem__ overwrite level - store[root + "x/y/z"] = b"xxx" - store[root + "x/y"] = b"yyy" - assert b"yyy" == ensure_bytes(store[root + "x/y"]) - assert root + "x/y/z" not in store - store[root + "x"] = b"zzz" - assert b"zzz" == ensure_bytes(store[root + "x"]) - assert root + "x/y" not in store - - # test __delitem__ overwrite level - store[root + "r/s/t"] = b"xxx" - del store[root + "r/s"] - assert root + "r/s/t" not in store - store[root + "r/s"] = b"xxx" - del store[root + "r"] - assert root + "r/s" not in 
store - - -class TestMemoryStore(StoreTests): - def create_store(self, **kwargs): - skip_if_nested_chunks(**kwargs) - return MemoryStore(**kwargs) - - def test_store_contains_bytes(self): - store = self.create_store() - store[self.root + "foo"] = np.array([97, 98, 99, 100, 101], dtype=np.uint8) - assert store[self.root + "foo"] == b"abcde" - - def test_setdel(self): - store = self.create_store() - setdel_hierarchy_checks(store, self.root) - - -class TestDictStore(StoreTests): - def create_store(self, **kwargs): - skip_if_nested_chunks(**kwargs) - - with pytest.warns(DeprecationWarning): - return DictStore(**kwargs) - - def test_deprecated(self): - store = self.create_store() - assert isinstance(store, MemoryStore) - - def test_pickle(self): - with pytest.warns(DeprecationWarning): - # pickle.load() will also trigger deprecation warning - super().test_pickle() - - -class TestDirectoryStore(StoreTests): - def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = DirectoryStore( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator, **kwargs - ) - return store - - def test_filesystem_path(self): - # test behaviour with path that does not exist - path = "data/store" - if os.path.exists(path): - shutil.rmtree(path) - store = DirectoryStore(path) - # should only be created on demand - assert not os.path.exists(path) - store["foo"] = b"bar" - assert os.path.isdir(path) - - # check correct permissions - # regression test for https://github.com/zarr-developers/zarr-python/issues/325 - stat = os.stat(path) - mode = stat.st_mode & 0o666 - umask = os.umask(0) - os.umask(umask) - assert mode == (0o666 & ~umask) - - # test behaviour with file path - with tempfile.NamedTemporaryFile() as f: - with pytest.raises(ValueError): - DirectoryStore(f.name) - - def test_init_pathlib(self): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - DirectoryStore(pathlib.Path(path)) - - def test_pickle_ext(self): - store = self.create_store() - store2 = pickle.loads(pickle.dumps(store)) - - # check path is preserved - assert store.path == store2.path - - # check point to same underlying directory - assert self.root + "xxx" not in store - store2[self.root + "xxx"] = b"yyy" - assert b"yyy" == ensure_bytes(store[self.root + "xxx"]) - - def test_setdel(self): - store = self.create_store() - setdel_hierarchy_checks(store, self.root) - - def test_normalize_keys(self): - store = self.create_store(normalize_keys=True) - store[self.root + "FOO"] = b"bar" - assert self.root + "FOO" in store - assert self.root + "foo" in store - - def test_listing_keys_slash(self): - def mock_walker_slash(_path): - yield from [ - # trailing slash in first key - ("root_with_slash/", ["d1", "g1"], [".zgroup"]), - ("root_with_slash/d1", [], [".zarray"]), - ("root_with_slash/g1", [], [".zgroup"]), - ] - - res = set(DirectoryStore._keys_fast("root_with_slash/", walker=mock_walker_slash)) - assert res == {".zgroup", "g1/.zgroup", "d1/.zarray"} - - def test_listing_keys_no_slash(self): - def mock_walker_no_slash(_path): - yield from [ - # no trailing slash in first key - ("root_with_no_slash", ["d1", "g1"], [".zgroup"]), - ("root_with_no_slash/d1", [], [".zarray"]), - ("root_with_no_slash/g1", [], [".zgroup"]), - ] - - res = set(DirectoryStore._keys_fast("root_with_no_slash", mock_walker_no_slash)) - assert res == {".zgroup", "g1/.zgroup", "d1/.zarray"} - - -@pytest.mark.skipif(have_fsspec is False, reason="needs 
fsspec") -class TestFSStore(StoreTests): - def create_store(self, normalize_keys=False, dimension_separator=".", path=None, **kwargs): - if path is None: - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - - store = FSStore( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator, **kwargs - ) - return store - - def test_init_array(self): - store = self.create_store() - init_array(store, shape=1000, chunks=100) - - # check metadata - assert array_meta_key in store - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - assert meta["dimension_separator"] == "." - - def test_dimension_separator(self): - for x in (".", "/"): - store = self.create_store(dimension_separator=x) - norm = store._normalize_key - assert ".zarray" == norm(".zarray") - assert ".zarray" == norm("/.zarray") - assert ".zgroup" == norm("/.zgroup") - assert "group/.zarray" == norm("group/.zarray") - assert "group/.zgroup" == norm("group/.zgroup") - assert "group/.zarray" == norm("/group/.zarray") - assert "group/.zgroup" == norm("/group/.zgroup") - - def test_complex(self): - path1 = tempfile.mkdtemp() - path2 = tempfile.mkdtemp() - store = self.create_store( - path="simplecache::file://" + path1, - simplecache={"same_names": True, "cache_storage": path2}, - ) - assert not store - assert not os.listdir(path1) - assert not os.listdir(path2) - store[self.root + "foo"] = b"hello" - assert "foo" in os.listdir(str(path1) + "/" + self.root) - assert self.root + "foo" in store - assert "foo" in os.listdir(str(path2)) - assert store[self.root + "foo"] == b"hello" - - def test_deep_ndim(self): - store = self.create_store() - path = None if self.version == 2 else "group1" - foo = zarr.v2.open_group(store=store, path=path) - bar = foo.create_group("bar") - baz = bar.create_dataset("baz", shape=(4, 4, 4), chunks=(2, 2, 2), dtype="i8") - baz[:] = 1 - if self.version == 2: - assert set(store.listdir()) == {".zgroup", "bar"} - else: - assert set(store.listdir()) == {"data", "meta", "zarr.v2.json"} - assert set(store.listdir("meta/root/" + path)) == {"bar", "bar.group.json"} - assert set(store.listdir("data/root/" + path)) == {"bar"} - assert foo["bar"]["baz"][(0, 0, 0)] == 1 - - def test_not_fsspec(self): - path = tempfile.mkdtemp() - with pytest.raises(ValueError, match="storage_options"): - zarr.v2.open_array(path, mode="w", storage_options={"some": "kwargs"}) - with pytest.raises(ValueError, match="storage_options"): - zarr.v2.open_group(path, mode="w", storage_options={"some": "kwargs"}) - zarr.v2.open_array("file://" + path, mode="w", shape=(1,), dtype="f8") - - def test_create(self): - path1 = tempfile.mkdtemp() - path2 = tempfile.mkdtemp() - g = zarr.v2.open_group("file://" + path1, mode="w", storage_options={"auto_mkdir": True}) - a = g.create_dataset("data", shape=(8,)) - a[:4] = [0, 1, 2, 3] - assert "data" in os.listdir(path1) - assert ".zgroup" in os.listdir(path1) - - # consolidated metadata (GH#915) - consolidate_metadata("file://" + path1) - assert ".zmetadata" in os.listdir(path1) - - g = zarr.v2.open_group( - "simplecache::file://" + path1, - mode="r", - storage_options={"cache_storage": path2, "same_names": True}, - ) - assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] - with pytest.raises(PermissionError): - g.data[:] = 1 - - @pytest.mark.parametrize("mode,allowed", [("r", False), ("r+", True)]) - def 
test_modify_consolidated(self, mode, allowed): - url = "file://" + tempfile.mkdtemp() - - # create - root = zarr.v2.open_group(url, mode="w") - root.zeros("baz", shape=(10000, 10000), chunks=(1000, 1000), dtype="i4") - zarr.v2.consolidate_metadata(url) - - # reopen and modify - root = zarr.v2.open_consolidated(url, mode=mode) - if allowed: - root["baz"][0, 0] = 7 - - root = zarr.v2.open_consolidated(url, mode="r") - assert root["baz"][0, 0] == 7 - else: - with pytest.raises(zarr.v2.errors.ReadOnlyError): - root["baz"][0, 0] = 7 - - @pytest.mark.parametrize("mode", ["r", "r+"]) - def test_modify_consolidated_metadata_raises(self, mode): - url = "file://" + tempfile.mkdtemp() - - # create - root = zarr.v2.open_group(url, mode="w") - root.zeros("baz", shape=(10000, 10000), chunks=(1000, 1000), dtype="i4") - zarr.v2.consolidate_metadata(url) - - # reopen and modify - root = zarr.v2.open_consolidated(url, mode=mode) - with pytest.raises(zarr.v2.errors.ReadOnlyError): - root["baz"].resize(100, 100) - - def test_read_only(self): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = self.create_store(path=path) - store[self.root + "foo"] = b"bar" - - store = self.create_store(path=path, mode="r") - - with pytest.raises(PermissionError): - store[self.root + "foo"] = b"hex" - - with pytest.raises(PermissionError): - del store[self.root + "foo"] - - with pytest.raises(PermissionError): - store.delitems([self.root + "foo"]) - - with pytest.raises(PermissionError): - store.setitems({self.root + "foo": b"baz"}) - - with pytest.raises(PermissionError): - store.clear() - - with pytest.raises(PermissionError): - store.rmdir(self.root + "anydir") - - assert store[self.root + "foo"] == b"bar" - - def test_eq(self): - store1 = self.create_store(path="anypath") - store2 = self.create_store(path="anypath") - assert store1 == store2 - - @pytest.mark.usefixtures("s3") - def test_s3(self): - g = zarr.v2.open_group("s3://test/out.zarr", mode="w", storage_options=self.s3so) - a = g.create_dataset("data", shape=(8,)) - a[:4] = [0, 1, 2, 3] - - g = zarr.v2.open_group("s3://test/out.zarr", mode="r", storage_options=self.s3so) - - assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] - - # test via convenience - g = zarr.v2.open("s3://test/out.zarr", mode="r", storage_options=self.s3so) - assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] - - @pytest.mark.usefixtures("s3") - def test_s3_complex(self): - g = zarr.v2.open_group("s3://test/out.zarr", mode="w", storage_options=self.s3so) - expected = np.empty((8, 8, 8), dtype="int64") - expected[:] = -1 - a = g.create_dataset( - "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), overwrite=True - ) - expected[0] = 0 - expected[3] = 3 - expected[6, 6, 6] = 6 - a[6, 6, 6] = 6 - a[:4] = expected[:4] - - b = g.create_dataset( - "data_f", - shape=(8,), - chunks=(1,), - dtype=[("foo", "S3"), ("bar", "i4")], - fill_value=(b"b", 1), - ) - b[:4] = (b"aaa", 2) - g2 = zarr.v2.open_group("s3://test/out.zarr", mode="r", storage_options=self.s3so) - - assert (g2.data[:] == expected).all() - a.chunk_store.fs.invalidate_cache("test/out.zarr/data") - a[:] = 5 - assert (a[:] == 5).all() - - assert g2.data_f["foo"].tolist() == [b"aaa"] * 4 + [b"b"] * 4 - with pytest.raises(PermissionError): - g2.data[:] = 5 - - with pytest.raises(PermissionError): - g2.store.setitems({}) - - with pytest.raises(PermissionError): - # even though overwrite=True, store is read-only, so fails - g2.create_dataset( - "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), 
overwrite=True - ) - - a = g.create_dataset( - "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), overwrite=True - ) - assert (a[:] == -np.ones((8, 8, 8))).all() - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestFSStoreWithKeySeparator(StoreTests): - def create_store(self, normalize_keys=False, key_separator=".", **kwargs): - # Since the user is passing key_separator, that will take priority. - skip_if_nested_chunks(**kwargs) - - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - return FSStore(path, normalize_keys=normalize_keys, key_separator=key_separator) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestFSStoreFromFilesystem(StoreTests): - def create_store(self, normalize_keys=False, dimension_separator=".", path=None, **kwargs): - import fsspec - - fs = fsspec.filesystem("file") - - if path is None: - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - - with pytest.raises(ValueError): - # can't specify storage_options when passing an - # existing fs object - _ = FSStore(path, fs=fs, auto_mkdir=True) - - store = FSStore( - path, - normalize_keys=normalize_keys, - dimension_separator=dimension_separator, - fs=fs, - **kwargs, - ) - - return store - - -@pytest.fixture() -def s3(request): - # writable local S3 system - pytest.skip("old v3 tests are disabled", allow_module_level=True) - - import shlex - import subprocess - import time - - if "BOTO_CONFIG" not in os.environ: # pragma: no cover - os.environ["BOTO_CONFIG"] = "/dev/null" - if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover - os.environ["AWS_ACCESS_KEY_ID"] = "foo" - if "AWS_SECRET_ACCESS_KEY" not in os.environ: # pragma: no cover - os.environ["AWS_SECRET_ACCESS_KEY"] = "bar" - requests = pytest.importorskip("requests") - s3fs = pytest.importorskip("s3fs") - pytest.importorskip("moto") - - port = 5556 - endpoint_uri = "http://127.0.0.1:%d/" % port - proc = subprocess.Popen( - shlex.split("moto_server s3 -p %d" % port), - stderr=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - ) - - timeout = 5 - while timeout > 0: - try: - r = requests.get(endpoint_uri) - if r.ok: - break - except Exception: # pragma: no cover - pass - timeout -= 0.1 # pragma: no cover - time.sleep(0.1) # pragma: no cover - s3so = dict(client_kwargs={"endpoint_url": endpoint_uri}, use_listings_cache=False) - s3fs.S3FileSystem.clear_instance_cache() - s3 = s3fs.S3FileSystem(anon=False, **s3so) - s3.mkdir("test") - request.cls.s3so = s3so - yield - proc.terminate() - proc.wait() - - -class TestNestedDirectoryStore(TestDirectoryStore): - def create_store(self, normalize_keys=False, **kwargs): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = NestedDirectoryStore(path, normalize_keys=normalize_keys, **kwargs) - return store - - def test_init_array(self): - store = self.create_store() - assert store._dimension_separator == "/" - init_array(store, shape=1000, chunks=100) - - # check metadata - assert array_meta_key in store - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - assert meta["dimension_separator"] == "/" - - def test_chunk_nesting(self): - store = self.create_store() - # any path where last segment looks like a chunk key gets special handling - store[self.root + "0.0"] = b"xxx" - assert b"xxx" == store[self.root + "0.0"] - # assert b'xxx' == 
store['0/0'] - store[self.root + "foo/10.20.30"] = b"yyy" - assert b"yyy" == store[self.root + "foo/10.20.30"] - # assert b'yyy' == store['foo/10/20/30'] - store[self.root + "42"] = b"zzz" - assert b"zzz" == store[self.root + "42"] - - def test_listdir(self): - store = self.create_store() - z = zarr.v2.zeros((10, 10), chunks=(5, 5), store=store) - z[:] = 1 # write to all chunks - for k in store.listdir(): - assert store.get(k) is not None - - -class TestNestedDirectoryStoreNone: - def test_value_error(self): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = NestedDirectoryStore(path, normalize_keys=True, dimension_separator=None) - assert store._dimension_separator == "/" - - -class TestNestedDirectoryStoreWithWrongValue: - def test_value_error(self): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - with pytest.raises(ValueError): - NestedDirectoryStore(path, normalize_keys=True, dimension_separator=".") - - -class TestN5Store(TestNestedDirectoryStore): - def create_store(self, normalize_keys=False): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = N5Store(path, normalize_keys=normalize_keys) - return store - - def test_equal(self): - store_a = self.create_store() - store_b = N5Store(store_a.path) - assert store_a == store_b - - @pytest.mark.parametrize("zarr_meta_key", [".zarray", ".zattrs", ".zgroup"]) - def test_del_zarr_meta_key(self, zarr_meta_key): - store = self.create_store() - store[n5_attrs_key] = json_dumps({"foo": "bar"}) - del store[zarr_meta_key] - assert n5_attrs_key not in store - - def test_chunk_nesting(self): - store = self.create_store() - store["0.0"] = b"xxx" - assert "0.0" in store - assert b"xxx" == store["0.0"] - # assert b'xxx' == store['0/0'] - store["foo/10.20.30"] = b"yyy" - assert "foo/10.20.30" in store - assert b"yyy" == store["foo/10.20.30"] - # N5 reverses axis order - assert b"yyy" == store["foo/30/20/10"] - del store["foo/10.20.30"] - assert "foo/30/20/10" not in store - store["42"] = b"zzz" - assert "42" in store - assert b"zzz" == store["42"] - - def test_init_array(self): - store = self.create_store() - init_array(store, shape=1000, chunks=100) - - # check metadata - assert array_meta_key in store - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert default_compressor.get_config() == compressor_config - # N5Store always has a fill value of 0 - assert meta["fill_value"] == 0 - assert meta["dimension_separator"] == "." 
- # Top-level groups AND arrays should have - # the n5 keyword in metadata - raw_n5_meta = json.loads(store[n5_attrs_key]) - assert raw_n5_meta.get("n5", None) == N5_FORMAT - - def test_init_array_path(self): - path = "foo/bar" - store = self.create_store() - init_array(store, shape=1000, chunks=100, path=path) - - # check metadata - key = path + "/" + array_meta_key - assert key in store - meta = store._metadata_class.decode_array_metadata(store[key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert default_compressor.get_config() == compressor_config - # N5Store always has a fill value of 0 - assert meta["fill_value"] == 0 - - def test_init_array_compat(self): - store = self.create_store() - init_array(store, shape=1000, chunks=100, compressor="none") - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert compressor_config is None - - def test_init_array_overwrite(self): - self._test_init_array_overwrite("C") - - def test_init_array_overwrite_path(self): - self._test_init_array_overwrite_path("C") - - def test_init_array_overwrite_chunk_store(self): - self._test_init_array_overwrite_chunk_store("C") - - def test_init_group_overwrite(self): - self._test_init_group_overwrite("C") - - def test_init_group_overwrite_path(self): - self._test_init_group_overwrite_path("C") - - def test_init_group_overwrite_chunk_store(self): - self._test_init_group_overwrite_chunk_store("C") - - def test_init_group(self): - store = self.create_store() - init_group(store) - store[".zattrs"] = json_dumps({"foo": "bar"}) - # check metadata - assert group_meta_key in store - assert group_meta_key in store.listdir() - assert group_meta_key in store.listdir("") - meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - - def test_filters(self): - all_filters, all_errors = zip( - *[ - (None, does_not_raise()), - ([], does_not_raise()), - ([AsType("f4", "f8")], pytest.raises(ValueError)), - ] - ) - for filters, error in zip(all_filters, all_errors): - store = self.create_store() - with error: - init_array(store, shape=1000, chunks=100, filters=filters) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestN5FSStore(TestFSStore): - def create_store(self, normalize_keys=False, path=None, **kwargs): - if path is None: - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - - store = N5FSStore(path, normalize_keys=normalize_keys, **kwargs) - return store - - def test_equal(self): - store_a = self.create_store() - store_b = N5FSStore(store_a.path) - assert store_a == store_b - - # This is copied wholesale from the N5Store tests. The same test could - # be run by making TestN5FSStore inherit from both TestFSStore and - # TestN5Store, but a direct copy is arguably more explicit. 
- - @pytest.mark.parametrize("zarr_meta_key", [".zarray", ".zattrs", ".zgroup"]) - def test_del_zarr_meta_key(self, zarr_meta_key): - store = self.create_store() - store[n5_attrs_key] = json_dumps({"foo": "bar"}) - del store[zarr_meta_key] - assert n5_attrs_key not in store - - def test_chunk_nesting(self): - store = self.create_store() - store["0.0"] = b"xxx" - assert "0.0" in store - assert b"xxx" == store["0.0"] - # assert b'xxx' == store['0/0'] - store["foo/10.20.30"] = b"yyy" - assert "foo/10.20.30" in store - assert b"yyy" == store["foo/10.20.30"] - # N5 reverses axis order - assert b"yyy" == store["foo/30/20/10"] - del store["foo/10.20.30"] - assert "foo/30/20/10" not in store - store["42"] = b"zzz" - assert "42" in store - assert b"zzz" == store["42"] - - def test_init_array(self): - store = self.create_store() - init_array(store, shape=1000, chunks=100) - - # check metadata - assert array_meta_key in store - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert default_compressor.get_config() == compressor_config - # N5Store always has a fill value of 0 - assert meta["fill_value"] == 0 - assert meta["dimension_separator"] == "." - # Top-level groups AND arrays should have - # the n5 keyword in metadata - raw_n5_meta = json.loads(store[n5_attrs_key]) - assert raw_n5_meta.get("n5", None) == N5_FORMAT - - def test_init_array_path(self): - path = "foo/bar" - store = self.create_store() - init_array(store, shape=1000, chunks=100, path=path) - - # check metadata - key = path + "/" + array_meta_key - assert key in store - meta = store._metadata_class.decode_array_metadata(store[key]) - assert ZARR_FORMAT == meta["zarr_format"] - assert (1000,) == meta["shape"] - assert (100,) == meta["chunks"] - assert np.dtype(None) == meta["dtype"] - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert default_compressor.get_config() == compressor_config - # N5Store always has a fill value of 0 - assert meta["fill_value"] == 0 - - def test_init_array_compat(self): - store = self.create_store() - init_array(store, shape=1000, chunks=100, compressor="none") - meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) - # N5Store wraps the actual compressor - compressor_config = meta["compressor"]["compressor_config"] - assert compressor_config is None - - def test_init_array_overwrite(self): - self._test_init_array_overwrite("C") - - def test_init_array_overwrite_path(self): - self._test_init_array_overwrite_path("C") - - def test_init_array_overwrite_chunk_store(self): - self._test_init_array_overwrite_chunk_store("C") - - def test_init_group_overwrite(self): - self._test_init_group_overwrite("C") - - def test_init_group_overwrite_path(self): - self._test_init_group_overwrite_path("C") - - def test_init_group_overwrite_chunk_store(self): - self._test_init_group_overwrite_chunk_store("C") - - def test_dimension_separator(self): - with pytest.warns(UserWarning, match="dimension_separator"): - self.create_store(dimension_separator="/") - - def test_init_group(self): - store = self.create_store() - init_group(store) - store[".zattrs"] = json_dumps({"foo": "bar"}) - # check metadata - assert group_meta_key in store - assert group_meta_key in store.listdir() - 
assert group_meta_key in store.listdir("") - meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) - assert ZARR_FORMAT == meta["zarr_format"] - - def test_filters(self): - all_filters, all_errors = zip( - *[ - (None, does_not_raise()), - ([], does_not_raise()), - ([AsType("f4", "f8")], pytest.raises(ValueError)), - ] - ) - for filters, error in zip(all_filters, all_errors): - store = self.create_store() - with error: - init_array(store, shape=1000, chunks=100, filters=filters) - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestNestedFSStore(TestNestedDirectoryStore): - def create_store(self, normalize_keys=False, path=None, **kwargs): - if path is None: - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = FSStore( - path, normalize_keys=normalize_keys, dimension_separator="/", auto_mkdir=True, **kwargs - ) - return store - - def test_numbered_groups(self): - # Create an array - store = self.create_store() - group = zarr.v2.group(store=store) - arr = group.create_dataset("0", shape=(10, 10)) - arr[1] = 1 - - # Read it back - store = self.create_store(path=store.path) - zarr.v2.open_group(store.path)["0"] - - -class TestTempStore(StoreTests): - def create_store(self, **kwargs): - skip_if_nested_chunks(**kwargs) - return TempStore(**kwargs) - - def test_setdel(self): - store = self.create_store() - setdel_hierarchy_checks(store, self.root) - - -class TestZipStore(StoreTests): - ZipStoreClass = ZipStore - - def create_store(self, **kwargs): - path = mktemp(suffix=".zip") - atexit.register(os.remove, path) - store = ZipStore(path, mode="w", **kwargs) - return store - - def test_mode(self): - with self.ZipStoreClass("data/store.zip", mode="w") as store: - store[self.root + "foo"] = b"bar" - store = self.ZipStoreClass("data/store.zip", mode="r") - with pytest.raises(PermissionError): - store[self.root + "foo"] = b"bar" - with pytest.raises(PermissionError): - store.clear() - - def test_flush(self): - store = self.ZipStoreClass("data/store.zip", mode="w") - store[self.root + "foo"] = b"bar" - store.flush() - assert store[self.root + "foo"] == b"bar" - store.close() - - store = self.ZipStoreClass("data/store.zip", mode="r") - store.flush() # no-op - - def test_context_manager(self): - with self.create_store() as store: - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"qux" - assert 2 == len(store) - - def test_pop(self): - # override because not implemented - store = self.create_store() - store[self.root + "foo"] = b"bar" - with pytest.raises(NotImplementedError): - store.pop(self.root + "foo") - - def test_popitem(self): - # override because not implemented - store = self.create_store() - store[self.root + "foo"] = b"bar" - with pytest.raises(NotImplementedError): - store.popitem() - - def test_permissions(self): - store = self.ZipStoreClass("data/store.zip", mode="w") - foo_key = "foo" if self.version == 2 else self.root + "foo" - # TODO: cannot provide key ending in / for v3 - # how to create an empty folder in that case? 
- baz_key = "baz/" if self.version == 2 else self.root + "baz" - store[foo_key] = b"bar" - store[baz_key] = b"" - - store.flush() - store.close() - z = ZipFile("data/store.zip", "r") - info = z.getinfo(foo_key) - perm = oct(info.external_attr >> 16) - assert perm == "0o644" - info = z.getinfo(baz_key) - perm = oct(info.external_attr >> 16) - # only for posix platforms - if os.name == "posix": - if self.version == 2: - assert perm == "0o40775" - else: - # baz/ on v2, but baz on v3, so not a directory - assert perm == "0o644" - z.close() - - def test_store_and_retrieve_ndarray(self): - store = ZipStore("data/store.zip") - x = np.array([[1, 2], [3, 4]]) - store["foo"] = x - y = np.frombuffer(store["foo"], dtype=x.dtype).reshape(x.shape) - assert np.array_equiv(y, x) - - -class TestDBMStore(StoreTests): - def create_store(self, dimension_separator=None): - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - # create store using default dbm implementation - store = DBMStore(path, flag="n", dimension_separator=dimension_separator) - return store - - def test_context_manager(self): - with self.create_store() as store: - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"qux" - assert 2 == len(store) - - -class TestDBMStoreDumb(TestDBMStore): - def create_store(self, **kwargs): - path = mktemp(suffix=".dumbdbm") - atexit.register(atexit_rmglob, path + "*") - - import dbm.dumb as dumbdbm - - store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) - return store - - -class TestDBMStoreGnu(TestDBMStore): - def create_store(self, **kwargs): - gdbm = pytest.importorskip("dbm.gnu") - path = mktemp(suffix=".gdbm") # pragma: no cover - atexit.register(os.remove, path) # pragma: no cover - store = DBMStore( - path, flag="n", open=gdbm.open, write_lock=False, **kwargs - ) # pragma: no cover - return store # pragma: no cover - - -class TestDBMStoreNDBM(TestDBMStore): - def create_store(self, **kwargs): - ndbm = pytest.importorskip("dbm.ndbm") - path = mktemp(suffix=".ndbm") # pragma: no cover - atexit.register(atexit_rmglob, path + "*") # pragma: no cover - store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover - return store # pragma: no cover - - -class TestDBMStoreBerkeleyDB(TestDBMStore): - def create_store(self, **kwargs): - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStore(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) - return store - - -class TestLMDBStore(StoreTests): - def create_store(self, **kwargs): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - buffers = True - store = LMDBStore(path, buffers=buffers, **kwargs) - return store - - def test_context_manager(self): - with self.create_store() as store: - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"qux" - assert 2 == len(store) - - -class TestSQLiteStore(StoreTests): - def create_store(self, **kwargs): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStore(path, **kwargs) - return store - - def test_underscore_in_name(self): - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStore(path) - store["a"] = b"aaa" - store["a_b"] = b"aa_bb" - store.rmdir("a") - assert "a_b" in store - - -class TestSQLiteStoreInMemory(TestSQLiteStore): - def create_store(self, **kwargs): - pytest.importorskip("sqlite3") - store = 
SQLiteStore(":memory:", **kwargs) - return store - - def test_pickle(self): - # setup store - store = self.create_store() - store[self.root + "foo"] = b"bar" - store[self.root + "baz"] = b"quux" - - # round-trip through pickle - with pytest.raises(PicklingError): - pickle.dumps(store) - - -@skip_test_env_var("ZARR_TEST_MONGO") -class TestMongoDBStore(StoreTests): - def create_store(self, **kwargs): - pytest.importorskip("pymongo") - store = MongoDBStore( - host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs - ) - # start with an empty store - store.clear() - return store - - -@skip_test_env_var("ZARR_TEST_REDIS") -class TestRedisStore(StoreTests): - def create_store(self, **kwargs): - # TODO: this is the default host for Redis on Travis, - # we probably want to generalize this though - pytest.importorskip("redis") - store = RedisStore(host="localhost", port=6379, **kwargs) - # start with an empty store - store.clear() - return store - - -class TestLRUStoreCache(StoreTests): - CountingClass = CountingDict - LRUStoreClass = LRUStoreCache - - def create_store(self, **kwargs): - # wrapper therefore no dimension_separator argument - skip_if_nested_chunks(**kwargs) - return self.LRUStoreClass(dict(), max_size=2**27) - - def test_cache_values_no_max_size(self): - # setup store - store = self.CountingClass() - foo_key = self.root + "foo" - bar_key = self.root + "bar" - store[foo_key] = b"xxx" - store[bar_key] = b"yyy" - assert 0 == store.counter["__getitem__", foo_key] - assert 1 == store.counter["__setitem__", foo_key] - assert 0 == store.counter["__getitem__", bar_key] - assert 1 == store.counter["__setitem__", bar_key] - - # setup cache - cache = self.LRUStoreClass(store, max_size=None) - assert 0 == cache.hits - assert 0 == cache.misses - - # test first __getitem__, cache miss - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 1 == store.counter["__setitem__", foo_key] - assert 0 == cache.hits - assert 1 == cache.misses - - # test second __getitem__, cache hit - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 1 == store.counter["__setitem__", foo_key] - assert 1 == cache.hits - assert 1 == cache.misses - - # test __setitem__, __getitem__ - cache[foo_key] = b"zzz" - assert 1 == store.counter["__getitem__", foo_key] - assert 2 == store.counter["__setitem__", foo_key] - # should be a cache hit - assert b"zzz" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 2 == store.counter["__setitem__", foo_key] - assert 2 == cache.hits - assert 1 == cache.misses - - # manually invalidate all cached values - cache.invalidate_values() - assert b"zzz" == cache[foo_key] - assert 2 == store.counter["__getitem__", foo_key] - assert 2 == store.counter["__setitem__", foo_key] - cache.invalidate() - assert b"zzz" == cache[foo_key] - assert 3 == store.counter["__getitem__", foo_key] - assert 2 == store.counter["__setitem__", foo_key] - - # test __delitem__ - del cache[foo_key] - with pytest.raises(KeyError): - # noinspection PyStatementEffect - cache[foo_key] - with pytest.raises(KeyError): - # noinspection PyStatementEffect - store[foo_key] - - # verify other keys untouched - assert 0 == store.counter["__getitem__", bar_key] - assert 1 == store.counter["__setitem__", bar_key] - - def test_cache_values_with_max_size(self): - # setup store - store = self.CountingClass() - foo_key = self.root + "foo" - bar_key = self.root + "bar" - store[foo_key] = b"xxx" - store[bar_key] = b"yyy" 
- assert 0 == store.counter["__getitem__", foo_key] - assert 0 == store.counter["__getitem__", bar_key] - # setup cache - can only hold one item - cache = self.LRUStoreClass(store, max_size=5) - assert 0 == cache.hits - assert 0 == cache.misses - - # test first 'foo' __getitem__, cache miss - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 0 == cache.hits - assert 1 == cache.misses - - # test second 'foo' __getitem__, cache hit - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 1 == cache.hits - assert 1 == cache.misses - - # test first 'bar' __getitem__, cache miss - assert b"yyy" == cache[bar_key] - assert 1 == store.counter["__getitem__", bar_key] - assert 1 == cache.hits - assert 2 == cache.misses - - # test second 'bar' __getitem__, cache hit - assert b"yyy" == cache[bar_key] - assert 1 == store.counter["__getitem__", bar_key] - assert 2 == cache.hits - assert 2 == cache.misses - - # test 'foo' __getitem__, should have been evicted, cache miss - assert b"xxx" == cache[foo_key] - assert 2 == store.counter["__getitem__", foo_key] - assert 2 == cache.hits - assert 3 == cache.misses - - # test 'bar' __getitem__, should have been evicted, cache miss - assert b"yyy" == cache[bar_key] - assert 2 == store.counter["__getitem__", bar_key] - assert 2 == cache.hits - assert 4 == cache.misses - - # setup store - store = self.CountingClass() - store[foo_key] = b"xxx" - store[bar_key] = b"yyy" - assert 0 == store.counter["__getitem__", foo_key] - assert 0 == store.counter["__getitem__", bar_key] - # setup cache - can hold two items - cache = self.LRUStoreClass(store, max_size=6) - assert 0 == cache.hits - assert 0 == cache.misses - - # test first 'foo' __getitem__, cache miss - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 0 == cache.hits - assert 1 == cache.misses - - # test second 'foo' __getitem__, cache hit - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 1 == cache.hits - assert 1 == cache.misses - - # test first 'bar' __getitem__, cache miss - assert b"yyy" == cache[bar_key] - assert 1 == store.counter["__getitem__", bar_key] - assert 1 == cache.hits - assert 2 == cache.misses - - # test second 'bar' __getitem__, cache hit - assert b"yyy" == cache[bar_key] - assert 1 == store.counter["__getitem__", bar_key] - assert 2 == cache.hits - assert 2 == cache.misses - - # test 'foo' __getitem__, should still be cached - assert b"xxx" == cache[foo_key] - assert 1 == store.counter["__getitem__", foo_key] - assert 3 == cache.hits - assert 2 == cache.misses - - # test 'bar' __getitem__, should still be cached - assert b"yyy" == cache[bar_key] - assert 1 == store.counter["__getitem__", bar_key] - assert 4 == cache.hits - assert 2 == cache.misses - - def test_cache_keys(self): - # setup - store = self.CountingClass() - foo_key = self.root + "foo" - bar_key = self.root + "bar" - baz_key = self.root + "baz" - store[foo_key] = b"xxx" - store[bar_key] = b"yyy" - assert 0 == store.counter["__contains__", foo_key] - assert 0 == store.counter["__iter__"] - assert 0 == store.counter["keys"] - cache = self.LRUStoreClass(store, max_size=None) - - # keys should be cached on first call - keys = sorted(cache.keys()) - assert keys == [bar_key, foo_key] - assert 1 == store.counter["keys"] - # keys should now be cached - assert keys == sorted(cache.keys()) - assert 1 == store.counter["keys"] - assert foo_key in cache - assert 1 == 
store.counter["__contains__", foo_key] - # the next check for `foo_key` is cached - assert foo_key in cache - assert 1 == store.counter["__contains__", foo_key] - assert keys == sorted(cache) - assert 0 == store.counter["__iter__"] - assert 1 == store.counter["keys"] - - # cache should be cleared if store is modified - crude but simple for now - cache[baz_key] = b"zzz" - keys = sorted(cache.keys()) - assert keys == [bar_key, baz_key, foo_key] - assert 2 == store.counter["keys"] - # keys should now be cached - assert keys == sorted(cache.keys()) - assert 2 == store.counter["keys"] - - # manually invalidate keys - cache.invalidate_keys() - keys = sorted(cache.keys()) - assert keys == [bar_key, baz_key, foo_key] - assert 3 == store.counter["keys"] - assert 1 == store.counter["__contains__", foo_key] - assert 0 == store.counter["__iter__"] - cache.invalidate_keys() - keys = sorted(cache) - assert keys == [bar_key, baz_key, foo_key] - assert 4 == store.counter["keys"] - assert 1 == store.counter["__contains__", foo_key] - assert 0 == store.counter["__iter__"] - cache.invalidate_keys() - assert foo_key in cache - assert 4 == store.counter["keys"] - assert 2 == store.counter["__contains__", foo_key] - assert 0 == store.counter["__iter__"] - - # check these would get counted if called directly - assert foo_key in store - assert 3 == store.counter["__contains__", foo_key] - assert keys == sorted(store) - assert 1 == store.counter["__iter__"] - - -def test_getsize(): - store = KVStore(dict()) - store["foo"] = b"aaa" - store["bar"] = b"bbbb" - store["baz/quux"] = b"ccccc" - assert 7 == getsize(store) - assert 5 == getsize(store, "baz") - - store = KVStore(dict()) - store["boo"] = None - assert -1 == getsize(store) - - -@pytest.mark.parametrize("dict_store", [False, True]) -def test_migrate_1to2(dict_store): - # N.B., version 1 did not support hierarchies, so we only have to be - # concerned about migrating a single array at the root of the store - - # setup - store = dict() if dict_store else KVStore(dict()) - meta = dict( - shape=(100,), - chunks=(10,), - dtype=np.dtype("f4"), - compression="zlib", - compression_opts=1, - fill_value=None, - order="C", - ) - meta_json = meta_v1.encode_metadata(meta) - store["meta"] = meta_json - store["attrs"] = json.dumps(dict()).encode("ascii") - - # run migration - migrate_1to2(store) - - # check results - assert "meta" not in store - assert array_meta_key in store - assert "attrs" not in store - assert attrs_key in store - meta_migrated = decode_array_metadata(store[array_meta_key]) - assert 2 == meta_migrated["zarr_format"] - - # preserved fields - for f in "shape", "chunks", "dtype", "fill_value", "order": - assert meta[f] == meta_migrated[f] - - # migrate should have added empty filters field - assert meta_migrated["filters"] is None - - # check compression and compression_opts migrated to compressor - assert "compression" not in meta_migrated - assert "compression_opts" not in meta_migrated - assert meta_migrated["compressor"] == Zlib(1).get_config() - - # check dict compression_opts - store = dict() if dict_store else KVStore(dict()) - meta["compression"] = "blosc" - meta["compression_opts"] = dict(cname="lz4", clevel=5, shuffle=1) - meta_json = meta_v1.encode_metadata(meta) - store["meta"] = meta_json - store["attrs"] = json.dumps(dict()).encode("ascii") - migrate_1to2(store) - meta_migrated = decode_array_metadata(store[array_meta_key]) - assert "compression" not in meta_migrated - assert "compression_opts" not in meta_migrated - assert 
meta_migrated["compressor"] == Blosc(cname="lz4", clevel=5, shuffle=1).get_config() - - # check 'none' compression is migrated to None (null in JSON) - store = dict() if dict_store else KVStore(dict()) - meta["compression"] = "none" - meta_json = meta_v1.encode_metadata(meta) - store["meta"] = meta_json - store["attrs"] = json.dumps(dict()).encode("ascii") - migrate_1to2(store) - meta_migrated = decode_array_metadata(store[array_meta_key]) - assert "compression" not in meta_migrated - assert "compression_opts" not in meta_migrated - assert meta_migrated["compressor"] is None - - -def test_format_compatibility(): - # This test is intended to catch any unintended changes that break the ability to - # read data stored with a previous minor version (which should be format-compatible). - - # fixture data - fixture_path = pathlib.Path(__file__).parent / "fixture" / "test_format_compatibility" - fixture = group(store=DirectoryStore(fixture_path)) - - # set seed to get consistent random data - np.random.seed(42) - - arrays_chunks = [ - (np.arange(1111, dtype="" == actual[-8:] - - -def test_tree_get_icon(): - assert tree_get_icon("Array") == tree_array_icon - assert tree_get_icon("Group") == tree_group_icon - with pytest.raises(ValueError): - tree_get_icon("Baz") - - -@mock.patch.dict("sys.modules", {"ipytree": None}) -def test_tree_widget_missing_ipytree(): - pattern = ( - "Run `pip install zarr[jupyter]` or `conda install ipytree`" - "to get the required ipytree dependency for displaying the tree " - "widget. If using jupyterlab<3, you also need to run " - "`jupyter labextension install ipytree`" - ) - with pytest.raises(ImportError, match=re.escape(pattern)): - tree_widget(None, None, None) - - -def test_retry_call(): - class Fixture: - def __init__(self, pass_on=1): - self.c = 0 - self.pass_on = pass_on - - def __call__(self): - self.c += 1 - if self.c != self.pass_on: - raise PermissionError - - for x in range(1, 11): - # Any number of failures less than 10 will be accepted. - fixture = Fixture(pass_on=x) - retry_call(fixture, exceptions=(PermissionError,), wait=0) - assert fixture.c == x - - def fail(x): - # Failures after 10 will cause an error to be raised. 
- retry_call(Fixture(pass_on=x), exceptions=(Exception,), wait=0) - - for x in range(11, 15): - pytest.raises(PermissionError, fail, x) - - -def test_flatten(): - assert list( - flatten( - [ - "0", - [ - "1", - [ - "2", - [ - "3", - [ - 4, - ], - ], - ], - ], - ] - ) - ) == ["0", "1", "2", "3", 4] - assert list(flatten("foo")) == ["f", "o", "o"] - assert list(flatten(["foo"])) == ["foo"] - - -def test_all_equal(): - assert all_equal(0, np.zeros((10, 10, 10))) - assert not all_equal(1, np.zeros((10, 10, 10))) - - assert all_equal(1, np.ones((10, 10, 10))) - assert not all_equal(1, 1 + np.ones((10, 10, 10))) - - assert all_equal(np.nan, np.array([np.nan, np.nan])) - assert not all_equal(np.nan, np.array([np.nan, 1.0])) - - assert all_equal({"a": -1}, np.array([{"a": -1}, {"a": -1}], dtype="object")) - assert not all_equal({"a": -1}, np.array([{"a": -1}, {"a": 2}], dtype="object")) - - assert all_equal(np.timedelta64(999, "D"), np.array([999, 999], dtype="timedelta64[D]")) - assert not all_equal(np.timedelta64(999, "D"), np.array([999, 998], dtype="timedelta64[D]")) - - # all_equal(None, *) always returns False - assert not all_equal(None, np.array([None, None])) - assert not all_equal(None, np.array([None, 10])) - - -def test_json_dumps_numpy_dtype(): - assert json_dumps(np.int64(0)) == json_dumps(0) - assert json_dumps(np.float32(0)) == json_dumps(float(0)) - # Check that we raise the error of the superclass for unsupported object - with pytest.raises(TypeError): - json_dumps(Array) - - -def test_constant_map(): - val = object() - m = ConstantMap(keys=[1, 2], constant=val) - assert len(m) == 2 - assert m[1] is val - assert m[2] is val - assert 1 in m - assert 0 not in m - with pytest.raises(KeyError): - m[0] - assert repr(m) == repr({1: val, 2: val}) diff --git a/tests/v2/util.py b/tests/v2/util.py deleted file mode 100644 index 12c5e379f6..0000000000 --- a/tests/v2/util.py +++ /dev/null @@ -1,91 +0,0 @@ -import collections -import os -import tempfile -from typing import Any, Mapping, Sequence -from zarr.v2.context import Context - -from zarr.v2.storage import Store - -import pytest - - -class CountingDict(Store): - def __init__(self): - self.wrapped = dict() - self.counter = collections.Counter() - - def __len__(self): - self.counter["__len__"] += 1 - return len(self.wrapped) - - def keys(self): - self.counter["keys"] += 1 - return self.wrapped.keys() - - def __iter__(self): - self.counter["__iter__"] += 1 - return iter(self.wrapped) - - def __contains__(self, item): - self.counter["__contains__", item] += 1 - return item in self.wrapped - - def __getitem__(self, item): - self.counter["__getitem__", item] += 1 - return self.wrapped[item] - - def __setitem__(self, key, value): - self.counter["__setitem__", key] += 1 - self.wrapped[key] = value - - def __delitem__(self, key): - self.counter["__delitem__", key] += 1 - del self.wrapped[key] - - def getitems( - self, keys: Sequence[str], *, contexts: Mapping[str, Context] - ) -> Mapping[str, Any]: - for key in keys: - self.counter["__getitem__", key] += 1 - return {k: self.wrapped[k] for k in keys if k in self.wrapped} - - -def skip_test_env_var(name): - """Checks for environment variables indicating whether tests requiring services should be run""" - value = os.environ.get(name, "0") - return pytest.mark.skipif(value == "0", reason="Tests not enabled via environment variable") - - -try: - import fsspec # noqa: F401 - - have_fsspec = True -except ImportError: # pragma: no cover - have_fsspec = False - - -def abs_container(): - from 
diff --git a/tests/v2/util.py b/tests/v2/util.py
deleted file mode 100644
index 12c5e379f6..0000000000
--- a/tests/v2/util.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import collections
-import os
-import tempfile
-from typing import Any, Mapping, Sequence
-from zarr.v2.context import Context
-
-from zarr.v2.storage import Store
-
-import pytest
-
-
-class CountingDict(Store):
-    def __init__(self):
-        self.wrapped = dict()
-        self.counter = collections.Counter()
-
-    def __len__(self):
-        self.counter["__len__"] += 1
-        return len(self.wrapped)
-
-    def keys(self):
-        self.counter["keys"] += 1
-        return self.wrapped.keys()
-
-    def __iter__(self):
-        self.counter["__iter__"] += 1
-        return iter(self.wrapped)
-
-    def __contains__(self, item):
-        self.counter["__contains__", item] += 1
-        return item in self.wrapped
-
-    def __getitem__(self, item):
-        self.counter["__getitem__", item] += 1
-        return self.wrapped[item]
-
-    def __setitem__(self, key, value):
-        self.counter["__setitem__", key] += 1
-        self.wrapped[key] = value
-
-    def __delitem__(self, key):
-        self.counter["__delitem__", key] += 1
-        del self.wrapped[key]
-
-    def getitems(
-        self, keys: Sequence[str], *, contexts: Mapping[str, Context]
-    ) -> Mapping[str, Any]:
-        for key in keys:
-            self.counter["__getitem__", key] += 1
-        return {k: self.wrapped[k] for k in keys if k in self.wrapped}
-
-
-def skip_test_env_var(name):
-    """Checks for environment variables indicating whether tests requiring services should be run"""
-    value = os.environ.get(name, "0")
-    return pytest.mark.skipif(value == "0", reason="Tests not enabled via environment variable")
-
-
-try:
-    import fsspec  # noqa: F401
-
-    have_fsspec = True
-except ImportError:  # pragma: no cover
-    have_fsspec = False
-
-
-def abs_container():
-    from azure.core.exceptions import ResourceExistsError
-    import azure.storage.blob as asb
-
-    URL = "http://127.0.0.1:10000"
-    ACCOUNT_NAME = "devstoreaccount1"
-    KEY = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
-    CONN_STR = (
-        f"DefaultEndpointsProtocol=http;AccountName={ACCOUNT_NAME};"
-        f"AccountKey={KEY};BlobEndpoint={URL}/{ACCOUNT_NAME};"
-    )
-
-    blob_service_client = asb.BlobServiceClient.from_connection_string(CONN_STR)
-    try:
-        container_client = blob_service_client.create_container("test")
-    except ResourceExistsError:
-        container_client = blob_service_client.get_container_client("test")
-
-    return container_client
-
-
-def mktemp(**kwargs):
-    f = tempfile.NamedTemporaryFile(**kwargs)
-    f.close()
-    return f.name
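The removed `CountingDict` wraps a plain dict so tests can assert exactly how often zarr touched each store key; the counter is keyed by `(method, key)` tuples. Hypothetical usage (the key `"foo"` is invented for illustration):

store = CountingDict()
store["foo"] = b"bar"   # tallied under ("__setitem__", "foo")
assert store.counter["__setitem__", "foo"] == 1
_ = store["foo"]        # tallied under ("__getitem__", "foo")
assert store.counter["__getitem__", "foo"] == 1
assert "foo" in store   # tallied under ("__contains__", "foo")
assert store.counter["__contains__", "foo"] == 1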
diff --git a/tests/v3/test_codecs/test_codecs.py b/tests/v3/test_codecs/test_codecs.py
index 57103d17c2..8e98cf20f5 100644
--- a/tests/v3/test_codecs/test_codecs.py
+++ b/tests/v3/test_codecs/test_codecs.py
@@ -7,7 +7,6 @@
 import numpy as np
 import pytest
 
-import zarr.v2.creation
 from zarr import Array, AsyncArray, config
 from zarr.codecs import (
     BytesCodec,
@@ -18,7 +17,6 @@
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.indexing import Selection, morton_order_iter
 from zarr.store import StorePath
-from zarr.testing.utils import assert_bytes_equal
 
 if TYPE_CHECKING:
     from zarr.abc.codec import Codec
@@ -118,21 +116,6 @@ async def test_order(
     assert not read_data.flags["F_CONTIGUOUS"]
     assert read_data.flags["C_CONTIGUOUS"]
 
-    if not with_sharding:
-        # Compare with zarr-python
-        z = zarr.v2.creation.create(
-            shape=data.shape,
-            chunks=(32, 8),
-            dtype="
 None:
     assert await store.get(f"{path}/c0/0", prototype=default_buffer_prototype()) is None
 
 
-@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
-async def test_zarr_compat(store: Store) -> None:
-    data = np.zeros((16, 18), dtype="uint16")
-    path = "zarr_compat3"
-    spath = StorePath(store, path)
-    a = await AsyncArray.create(
-        spath,
-        shape=data.shape,
-        chunk_shape=(10, 10),
-        dtype=data.dtype,
-        chunk_key_encoding=("v2", "."),
-        fill_value=1,
-    )
-
-    z2 = zarr.v2.creation.create(
-        shape=data.shape,
-        chunks=(10, 10),
-        dtype=data.dtype,
-        compressor=None,
-        fill_value=1,
-    )
-
-    await _AsyncArrayProxy(a)[:16, :18].set(data)
-    z2[:16, :18] = data
-    assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get())
-    assert np.array_equal(data, z2[:16, :18])
-
-    assert_bytes_equal(
-        z2._store["0.0"], await store.get(f"{path}/0.0", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["0.1"], await store.get(f"{path}/0.1", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["1.0"], await store.get(f"{path}/1.0", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["1.1"], await store.get(f"{path}/1.1", prototype=default_buffer_prototype())
-    )
-
-
-@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
-async def test_zarr_compat_F(store: Store) -> None:
-    data = np.zeros((16, 18), dtype="uint16", order="F")
-    path = "zarr_compatF3"
-    spath = StorePath(store, path)
-    a = await AsyncArray.create(
-        spath,
-        shape=data.shape,
-        chunk_shape=(10, 10),
-        dtype=data.dtype,
-        chunk_key_encoding=("v2", "."),
-        fill_value=1,
-        codecs=[TransposeCodec(order=order_from_dim("F", data.ndim)), BytesCodec()],
-    )
-
-    z2 = zarr.v2.creation.create(
-        shape=data.shape,
-        chunks=(10, 10),
-        dtype=data.dtype,
-        compressor=None,
-        order="F",
-        fill_value=1,
-    )
-
-    await _AsyncArrayProxy(a)[:16, :18].set(data)
-    z2[:16, :18] = data
-    assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get())
-    assert np.array_equal(data, z2[:16, :18])
-
-    assert_bytes_equal(
-        z2._store["0.0"], await store.get(f"{path}/0.0", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["0.1"], await store.get(f"{path}/0.1", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["1.0"], await store.get(f"{path}/1.0", prototype=default_buffer_prototype())
-    )
-    assert_bytes_equal(
-        z2._store["1.1"], await store.get(f"{path}/1.1", prototype=default_buffer_prototype())
-    )
-
-
 @pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
 async def test_dimension_names(store: Store) -> None:
     data = np.arange(0, 256, dtype="uint16").reshape((16, 16))
diff --git a/tests/v3/test_codecs/test_endian.py b/tests/v3/test_codecs/test_endian.py
index 3c36c90b81..5b5b2eb899 100644
--- a/tests/v3/test_codecs/test_endian.py
+++ b/tests/v3/test_codecs/test_endian.py
@@ -3,13 +3,10 @@
 import numpy as np
 import pytest
 
-import zarr.v2.creation
 from zarr import AsyncArray
 from zarr.abc.store import Store
 from zarr.codecs import BytesCodec
-from zarr.core.buffer import default_buffer_prototype
 from zarr.store.common import StorePath
-from zarr.testing.utils import assert_bytes_equal
 
 from .test_codecs import _AsyncArrayProxy
@@ -34,19 +31,6 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None:
     readback_data = await _AsyncArrayProxy(a)[:, :].get()
     assert np.array_equal(data, readback_data)
 
-    # Compare with v2
-    z = zarr.v2.creation.create(
-        shape=data.shape,
-        chunks=(16, 16),
-        dtype=">u2" if endian == "big" else "<u2", "