From 4b205013528455579c4d9bb79408ec4436d712f8 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 4 Apr 2024 13:14:41 +0200 Subject: [PATCH 01/22] chore: add deprecation warnings to v3 classes / functions --- src/zarr/_storage/absstore.py | 4 ++ src/zarr/_storage/store.py | 20 +++++++++ src/zarr/_storage/v3.py | 43 ++++++++++++++++++++ src/zarr/_storage/v3_storage_transformers.py | 4 ++ src/zarr/meta.py | 4 ++ 5 files changed, 75 insertions(+) diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py index f62529f096..f8382714cb 100644 --- a/src/zarr/_storage/absstore.py +++ b/src/zarr/_storage/absstore.py @@ -1,5 +1,6 @@ """This module contains storage classes related to Azure Blob Storage (ABS)""" +from typing_extensions import deprecated import warnings from numcodecs.compat import ensure_bytes from zarr.util import normalize_storage_path @@ -224,6 +225,9 @@ def clear(self): self.rmdir() +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class ABSStoreV3(ABSStore, StoreV3): def list(self): return list(self.keys()) diff --git a/src/zarr/_storage/store.py b/src/zarr/_storage/store.py index 8daedae48f..cacb265bfd 100644 --- a/src/zarr/_storage/store.py +++ b/src/zarr/_storage/store.py @@ -10,6 +10,8 @@ from zarr.util import normalize_storage_path from zarr.context import Context +from typing_extensions import deprecated + # v2 store keys array_meta_key = ".zarray" group_meta_key = ".zgroup" @@ -182,6 +184,9 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys(self, path) +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class StoreV3(BaseStore): _store_version = 3 _metadata_class = Metadata3 @@ -405,6 +410,9 @@ def _ensure_store(store): ) +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class StorageTransformer(MutableMapping, abc.ABC): """Base class for storage transformers. 
The methods simply pass on the data as-is and should be overwritten by sub-classes.""" @@ -560,6 +568,9 @@ def _path_to_prefix(path: Optional[str]) -> str: return prefix +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) def _get_hierarchy_metadata(store: StoreV3) -> Mapping[str, Any]: version = getattr(store, "_store_version", 2) if version < 3: @@ -569,12 +580,18 @@ def _get_hierarchy_metadata(store: StoreV3) -> Mapping[str, Any]: return store._metadata_class.decode_hierarchy_metadata(store["zarr.json"]) +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) def _get_metadata_suffix(store: StoreV3) -> str: if "zarr.json" in store: return _get_hierarchy_metadata(store)["metadata_key_suffix"] return ".json" +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) def _rename_metadata_v3(store: StoreV3, src_path: str, dst_path: str) -> bool: """Rename source or group metadata file associated with src_path.""" any_renamed = False @@ -628,6 +645,9 @@ def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: del store[key] +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: meta_dir = meta_root + path diff --git a/src/zarr/_storage/v3.py b/src/zarr/_storage/v3.py index 8ab54984b7..1d909df792 100644 --- a/src/zarr/_storage/v3.py +++ b/src/zarr/_storage/v3.py @@ -4,6 +4,7 @@ from collections.abc import MutableMapping from threading import Lock from typing import Union, Dict, Any +from typing_extensions import deprecated from zarr.errors import ( MetadataError, @@ -71,6 +72,9 @@ StoreLike = Union[BaseStore, MutableMapping] +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class RmdirV3: """Mixin class that can be used to ensure override of any existing v2 rmdir class.""" @@ -79,6 +83,9 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys_v3(self, path) # type: ignore +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class KVStoreV3(RmdirV3, KVStore, StoreV3): def list(self): return list(self._mutable_mapping.keys()) @@ -117,6 +124,9 @@ def _get_files_and_dirs_from_path(store, path): return files, dirs +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class FSStoreV3(FSStore, StoreV3): # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) @@ -224,6 +234,9 @@ def get_partial_values(self, key_ranges): return results +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class MemoryStoreV3(MemoryStore, StoreV3): def __init__(self, root=None, cls=dict, dimension_separator=None): if root is None: @@ -306,6 +319,9 @@ def rmdir(self, path: Path = None): MemoryStoreV3.__doc__ = MemoryStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class DirectoryStoreV3(DirectoryStore, StoreV3): def list(self): return list(self.keys()) @@ -369,6 +385,9 @@ def rmdir(self, path=None): DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class ZipStoreV3(ZipStore, StoreV3): def list(self): return 
list(self.keys()) @@ -407,6 +426,9 @@ def getsize(self, path=None): ZipStoreV3.__doc__ = ZipStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class RedisStoreV3(RmdirV3, RedisStore, StoreV3): def list(self): return list(self.keys()) @@ -419,6 +441,9 @@ def __setitem__(self, key, value): RedisStoreV3.__doc__ = RedisStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class MongoDBStoreV3(RmdirV3, MongoDBStore, StoreV3): def list(self): return list(self.keys()) @@ -431,6 +456,9 @@ def __setitem__(self, key, value): MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class DBMStoreV3(RmdirV3, DBMStore, StoreV3): def list(self): return list(self.keys()) @@ -443,6 +471,9 @@ def __setitem__(self, key, value): DBMStoreV3.__doc__ = DBMStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class LMDBStoreV3(RmdirV3, LMDBStore, StoreV3): def list(self): return list(self.keys()) @@ -455,6 +486,9 @@ def __setitem__(self, key, value): LMDBStoreV3.__doc__ = LMDBStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class SQLiteStoreV3(SQLiteStore, StoreV3): def list(self): return list(self.keys()) @@ -503,6 +537,9 @@ def rmdir(self, path=None): SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class LRUStoreCacheV3(RmdirV3, LRUStoreCache, StoreV3): def __init__(self, store, max_size: int): self._store = StoreV3._ensure_store(store) @@ -526,6 +563,9 @@ def __setitem__(self, key, value): LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class ConsolidatedMetadataStoreV3(ConsolidatedMetadataStore, StoreV3): """A layer over other storage, where the metadata has been consolidated into a single key. 
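For context, a minimal sketch (not part of the patch) of how the typing_extensions.deprecated decorator applied throughout this commit behaves at runtime: instantiating a decorated class emits a DeprecationWarning carrying the supplied message, in addition to flagging usages for static type checkers. LegacyStoreV3 below is a hypothetical stand-in for the decorated store classes, and the sketch assumes a typing_extensions version recent enough (>= 4.5) to provide the decorator.

import warnings
from typing_extensions import deprecated

@deprecated(
    "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0"
)
class LegacyStoreV3:  # hypothetical stand-in for the V3 store classes decorated above
    pass

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LegacyStoreV3()  # instantiation triggers the runtime warning

assert any(issubclass(w.category, DeprecationWarning) for w in caught)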
@@ -580,6 +620,9 @@ def rmdir(self, key): raise ReadOnlyError() +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) def _normalize_store_arg_v3(store: Any, storage_options=None, mode="r") -> BaseStore: # default to v2 store for backward compatibility zarr_version = getattr(store, "_store_version", 3) diff --git a/src/zarr/_storage/v3_storage_transformers.py b/src/zarr/_storage/v3_storage_transformers.py index 3090aea28c..dd49b8de35 100644 --- a/src/zarr/_storage/v3_storage_transformers.py +++ b/src/zarr/_storage/v3_storage_transformers.py @@ -2,6 +2,7 @@ import itertools import os from typing import NamedTuple, Tuple, Optional, Union, Iterator +from typing_extensions import deprecated from numcodecs.compat import ensure_bytes import numpy as np @@ -97,6 +98,9 @@ def __init__(self, _type, test_value) -> None: self.test_value = test_value +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class ShardingStorageTransformer(StorageTransformer): # lgtm[py/missing-equals] """Implements sharding as a storage transformer, as described in the spec: https://zarr-specs.readthedocs.io/en/latest/extensions/storage-transformers/sharding/v1.0.html diff --git a/src/zarr/meta.py b/src/zarr/meta.py index bd1f4ee037..939d882f4e 100644 --- a/src/zarr/meta.py +++ b/src/zarr/meta.py @@ -1,6 +1,7 @@ import base64 import itertools from collections.abc import Mapping +from typing_extensions import deprecated import numcodecs import numpy as np @@ -302,6 +303,9 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return v +@deprecated( + "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" +) class Metadata3(Metadata2): ZARR_FORMAT = ZARR_FORMAT_v3 From eab9fd271bb9d323a403f94967987def22e9cb31 Mon Sep 17 00:00:00 2001 From: "Daniel Jahn (dahn)" Date: Sat, 6 Apr 2024 10:48:13 +0200 Subject: [PATCH 02/22] Resolve Mypy erorrs in `v3` branch (#1692) * refactor(v3): Using appropriate types * fix(v3): Typing fixes + minor code fixes * fix(v3): _sync_iter works with coroutines * docs(v3/store/core.py): clearer comment * fix(metadata.py): Use Any outside TYPE_CHECKING for Pydantic * fix(zarr/v3): correct zarr format + remove unused method * fix(v3/store/core.py): Potential suggestion on handling str store_like * refactor(zarr/v3): Add more typing * ci(.pre-commit-config.yaml): zarr v3 mypy checks turned on in pre-commit --- .pre-commit-config.yaml | 1 - src/zarr/v3/abc/metadata.py | 3 +- src/zarr/v3/array.py | 2 +- src/zarr/v3/chunk_grids.py | 2 +- src/zarr/v3/chunk_key_encodings.py | 6 ++-- src/zarr/v3/codecs/transpose.py | 8 +++--- src/zarr/v3/group.py | 44 ++++++++++++++++++------------ src/zarr/v3/metadata.py | 6 ++-- src/zarr/v3/store/core.py | 15 ++-------- src/zarr/v3/store/local.py | 2 +- src/zarr/v3/sync.py | 8 ++---- 11 files changed, 47 insertions(+), 50 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 79344604a5..10aff8b4c6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,6 @@ repos: hooks: - id: mypy files: src - exclude: ^src/zarr/v3 args: [] additional_dependencies: - types-redis diff --git a/src/zarr/v3/abc/metadata.py b/src/zarr/v3/abc/metadata.py index bdd2f86d59..4fcabf72a1 100644 --- a/src/zarr/v3/abc/metadata.py +++ b/src/zarr/v3/abc/metadata.py @@ -5,11 +5,12 @@ from typing import Dict from typing_extensions import Self -from dataclasses import fields +from 
dataclasses import fields, dataclass from zarr.v3.common import JSON +@dataclass(frozen=True) class Metadata: def to_dict(self) -> JSON: """ diff --git a/src/zarr/v3/array.py b/src/zarr/v3/array.py index 632f7d8ec7..c0a00a624e 100644 --- a/src/zarr/v3/array.py +++ b/src/zarr/v3/array.py @@ -182,7 +182,7 @@ def shape(self) -> ChunkCoords: @property def size(self) -> int: - return np.prod(self.metadata.shape) + return np.prod(self.metadata.shape).item() @property def dtype(self) -> np.dtype: diff --git a/src/zarr/v3/chunk_grids.py b/src/zarr/v3/chunk_grids.py index 6c48323798..b0a2a7bb36 100644 --- a/src/zarr/v3/chunk_grids.py +++ b/src/zarr/v3/chunk_grids.py @@ -20,7 +20,7 @@ class ChunkGrid(Metadata): @classmethod def from_dict(cls, data: Dict[str, JSON]) -> ChunkGrid: if isinstance(data, ChunkGrid): - return data # type: ignore + return data name_parsed, _ = parse_named_configuration(data) if name_parsed == "regular": diff --git a/src/zarr/v3/chunk_key_encodings.py b/src/zarr/v3/chunk_key_encodings.py index e4339240e3..9889a2f04a 100644 --- a/src/zarr/v3/chunk_key_encodings.py +++ b/src/zarr/v3/chunk_key_encodings.py @@ -1,6 +1,6 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Dict, Literal +from typing import TYPE_CHECKING, Dict, Literal, cast from dataclasses import dataclass from zarr.v3.abc.metadata import Metadata @@ -19,7 +19,7 @@ def parse_separator(data: JSON) -> SeparatorLiteral: if data not in (".", "/"): raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.") - return data # type: ignore + return cast(SeparatorLiteral, data) @dataclass(frozen=True) @@ -35,7 +35,7 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: @classmethod def from_dict(cls, data: Dict[str, JSON]) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): - return data # type: ignore + return data name_parsed, configuration_parsed = parse_named_configuration(data) if name_parsed == "default": diff --git a/src/zarr/v3/codecs/transpose.py b/src/zarr/v3/codecs/transpose.py index f214d1e7f1..b663230e35 100644 --- a/src/zarr/v3/codecs/transpose.py +++ b/src/zarr/v3/codecs/transpose.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Dict, Iterable +from typing import TYPE_CHECKING, Dict, Iterable, Union, cast from dataclasses import dataclass, replace @@ -16,12 +16,12 @@ from zarr.v3.codecs.registry import register_codec -def parse_transpose_order(data: JSON) -> Tuple[int]: +def parse_transpose_order(data: Union[JSON, Iterable[int]]) -> Tuple[int, ...]: if not isinstance(data, Iterable): raise TypeError(f"Expected an iterable. Got {data} instead.") if not all(isinstance(a, int) for a in data): raise TypeError(f"Expected an iterable of integers. Got {data} instead.") - return tuple(data) # type: ignore[return-value] + return tuple(cast(Iterable[int], data)) @dataclass(frozen=True) @@ -31,7 +31,7 @@ class TransposeCodec(ArrayArrayCodec): order: Tuple[int, ...] 
def __init__(self, *, order: ChunkCoordsLike) -> None: - order_parsed = parse_transpose_order(order) # type: ignore[arg-type] + order_parsed = parse_transpose_order(order) object.__setattr__(self, "order", order_parsed) diff --git a/src/zarr/v3/group.py b/src/zarr/v3/group.py index acd5ca0d62..0012a77a81 100644 --- a/src/zarr/v3/group.py +++ b/src/zarr/v3/group.py @@ -4,7 +4,7 @@ import asyncio import json import logging -from typing import Any, Dict, Literal, Optional, Union, AsyncIterator, Iterator, List +from typing import Any, Dict, Literal, Optional, Union, AsyncIterator, List from zarr.v3.abc.metadata import Metadata from zarr.v3.array import AsyncArray, Array @@ -46,11 +46,11 @@ def to_bytes(self) -> Dict[str, bytes]: return {ZARR_JSON: json.dumps(self.to_dict()).encode()} else: return { - ZGROUP_JSON: self.zarr_format, + ZGROUP_JSON: json.dumps({"zarr_format": 2}).encode(), ZATTRS_JSON: json.dumps(self.attributes).encode(), } - def __init__(self, attributes: Dict[str, Any] = None, zarr_format: Literal[2, 3] = 3): + def __init__(self, attributes: Optional[Dict[str, Any]] = None, zarr_format: Literal[2, 3] = 3): attributes_parsed = parse_attributes(attributes) zarr_format_parsed = parse_zarr_format(zarr_format) @@ -104,7 +104,7 @@ async def open( zarr_format: Literal[2, 3] = 3, ) -> AsyncGroup: store_path = make_store_path(store) - zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + zarr_json_bytes = await (store_path / ZARR_JSON).get() assert zarr_json_bytes is not None # TODO: consider trying to autodiscover the zarr-format here @@ -139,7 +139,7 @@ def from_dict( store_path: StorePath, data: Dict[str, Any], runtime_configuration: RuntimeConfiguration, - ) -> Group: + ) -> AsyncGroup: group = cls( metadata=GroupMetadata.from_dict(data), store_path=store_path, @@ -168,10 +168,12 @@ async def getitem( zarr_json = json.loads(zarr_json_bytes) if zarr_json["node_type"] == "group": return type(self).from_dict(store_path, zarr_json, self.runtime_configuration) - if zarr_json["node_type"] == "array": + elif zarr_json["node_type"] == "array": return AsyncArray.from_dict( store_path, zarr_json, runtime_configuration=self.runtime_configuration ) + else: + raise ValueError(f"unexpected node_type: {zarr_json['node_type']}") elif self.metadata.zarr_format == 2: # Q: how do we like optimistically fetching .zgroup, .zarray, and .zattrs? 
# This guarantees that we will always make at least one extra request to the store @@ -271,7 +273,7 @@ def __repr__(self): async def nchildren(self) -> int: raise NotImplementedError - async def children(self) -> AsyncIterator[AsyncArray, AsyncGroup]: + async def children(self) -> AsyncIterator[Union[AsyncArray, AsyncGroup]]: raise NotImplementedError async def contains(self, child: str) -> bool: @@ -381,8 +383,12 @@ async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group new_metadata = replace(self.metadata, attributes=new_attributes) # Write new metadata - await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) - return replace(self, metadata=new_metadata) + to_save = new_metadata.to_bytes() + awaitables = [(self.store_path / key).set(value) for key, value in to_save.items()] + await asyncio.gather(*awaitables) + + async_group = replace(self._async_group, metadata=new_metadata) + return replace(self, _async_group=async_group) @property def metadata(self) -> GroupMetadata: @@ -396,34 +402,38 @@ def attrs(self) -> Attributes: def info(self): return self._async_group.info + @property + def store_path(self) -> StorePath: + return self._async_group.store_path + def update_attributes(self, new_attributes: Dict[str, Any]): self._sync(self._async_group.update_attributes(new_attributes)) return self @property def nchildren(self) -> int: - return self._sync(self._async_group.nchildren) + return self._sync(self._async_group.nchildren()) @property - def children(self) -> List[Array, Group]: - _children = self._sync_iter(self._async_group.children) + def children(self) -> List[Union[Array, Group]]: + _children = self._sync_iter(self._async_group.children()) return [Array(obj) if isinstance(obj, AsyncArray) else Group(obj) for obj in _children] def __contains__(self, child) -> bool: return self._sync(self._async_group.contains(child)) - def group_keys(self) -> Iterator[str]: - return self._sync_iter(self._async_group.group_keys) + def group_keys(self) -> List[str]: + return self._sync_iter(self._async_group.group_keys()) def groups(self) -> List[Group]: # TODO: in v2 this was a generator that return key: Group - return [Group(obj) for obj in self._sync_iter(self._async_group.groups)] + return [Group(obj) for obj in self._sync_iter(self._async_group.groups())] def array_keys(self) -> List[str]: - return self._sync_iter(self._async_group.array_keys) + return self._sync_iter(self._async_group.array_keys()) def arrays(self) -> List[Array]: - return [Array(obj) for obj in self._sync_iter(self._async_group.arrays)] + return [Array(obj) for obj in self._sync_iter(self._async_group.arrays())] def tree(self, expand=False, level=None) -> Any: return self._sync(self._async_group.tree(expand=expand, level=level)) diff --git a/src/zarr/v3/metadata.py b/src/zarr/v3/metadata.py index de3055abdc..a5e8927311 100644 --- a/src/zarr/v3/metadata.py +++ b/src/zarr/v3/metadata.py @@ -1,6 +1,6 @@ from __future__ import annotations from enum import Enum -from typing import TYPE_CHECKING, cast, Dict, Iterable +from typing import TYPE_CHECKING, cast, Dict, Iterable, Any from dataclasses import dataclass, field import json import numpy as np @@ -10,7 +10,7 @@ if TYPE_CHECKING: - from typing import Any, Literal, Union, List, Optional, Tuple + from typing import Literal, Union, List, Optional, Tuple from zarr.v3.codecs.pipeline import CodecPipeline @@ -244,7 +244,7 @@ class ArrayV2Metadata(Metadata): filters: Optional[List[Dict[str, Any]]] = None dimension_separator: Literal[".", "/"] = "." 
compressor: Optional[Dict[str, Any]] = None - attributes: Optional[Dict[str, Any]] = field(default_factory=dict) + attributes: Optional[Dict[str, Any]] = cast(Dict[str, Any], field(default_factory=dict)) zarr_format: Literal[2] = field(init=False, default=2) def __init__( diff --git a/src/zarr/v3/store/core.py b/src/zarr/v3/store/core.py index 0ef1c8569e..16714d9e30 100644 --- a/src/zarr/v3/store/core.py +++ b/src/zarr/v3/store/core.py @@ -5,6 +5,7 @@ from zarr.v3.common import BytesLike from zarr.v3.abc.store import Store +from zarr.v3.store.local import LocalStore def _dereference_path(root: str, path: str) -> str: @@ -24,10 +25,6 @@ def __init__(self, store: Store, path: Optional[str] = None): self.store = store self.path = path or "" - @classmethod - def from_path(cls, pth: Path) -> StorePath: - return cls(Store.from_path(pth)) - async def get( self, byte_range: Optional[Tuple[int, Optional[int]]] = None ) -> Optional[BytesLike]: @@ -70,14 +67,6 @@ def make_store_path(store_like: StoreLike) -> StorePath: return store_like elif isinstance(store_like, Store): return StorePath(store_like) - # elif isinstance(store_like, Path): - # return StorePath(Store.from_path(store_like)) elif isinstance(store_like, str): - try: - from upath import UPath - - return StorePath(Store.from_path(UPath(store_like))) - except ImportError as e: - raise e - # return StorePath(LocalStore(Path(store_like))) + return StorePath(LocalStore(Path(store_like))) raise TypeError diff --git a/src/zarr/v3/store/local.py b/src/zarr/v3/store/local.py index a62eea20f7..c3da110450 100644 --- a/src/zarr/v3/store/local.py +++ b/src/zarr/v3/store/local.py @@ -146,7 +146,7 @@ async def list_prefix(self, prefix: str) -> List[str]: """ def _list_prefix(root: Path, prefix: str) -> List[str]: - files = [p for p in (root / prefix).rglob("*") if p.is_file()] + files = [str(p) for p in (root / prefix).rglob("*") if p.is_file()] return files return await to_thread(_list_prefix, self.root, prefix) diff --git a/src/zarr/v3/sync.py b/src/zarr/v3/sync.py index f0996c019e..fcc8e7b275 100644 --- a/src/zarr/v3/sync.py +++ b/src/zarr/v3/sync.py @@ -5,7 +5,6 @@ from typing import ( Any, AsyncIterator, - Callable, Coroutine, List, Optional, @@ -112,11 +111,10 @@ def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: # this should allow us to better type the sync wrapper return sync(coroutine, loop=self._sync_configuration.asyncio_loop) - def _sync_iter( - self, func: Callable[P, AsyncIterator[T]], *args: P.args, **kwargs: P.kwargs - ) -> List[T]: + def _sync_iter(self, coroutine: Coroutine[Any, Any, AsyncIterator[T]]) -> List[T]: async def iter_to_list() -> List[T]: # TODO: replace with generators so we don't materialize the entire iterator at once - return [item async for item in func(*args, **kwargs)] + async_iterator = await coroutine + return [item async for item in async_iterator] return self._sync(iter_to_list()) From 48c14d3e6621dd62476356b7effe29f521f4c80b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:29:26 -0400 Subject: [PATCH 03/22] Specify hatch envs using GitHub actions matrix for v3 tests (#1728) * Specify v3 hatch envs using GitHub actions matrix * Update .github/workflows/test-v3.yml Co-authored-by: Joe Hamman * Update .github/workflows/test-v3.yml Co-authored-by: Joe Hamman * test on 3.12 too * no 3.12 --------- Co-authored-by: Joe Hamman Co-authored-by: Joe Hamman --- .github/workflows/test-v3.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 
deletions(-) diff --git a/.github/workflows/test-v3.yml b/.github/workflows/test-v3.yml index bdc6e99299..e767541c75 100644 --- a/.github/workflows/test-v3.yml +++ b/.github/workflows/test-v3.yml @@ -10,15 +10,22 @@ on: branches: [ v3 ] jobs: - run-tests: + test: + name: py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }} runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11'] + numpy-version: ['1.24', '1.26'] + dependency-set: ["minimal", "optional"] + steps: - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install Hatch run: | @@ -29,8 +36,8 @@ jobs: hatch env create - name: Run Tests run: | - hatch run test:run + hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run - name: Run mypy continue-on-error: true run: | - hatch run test:run-mypy \ No newline at end of file + hatch run test:run-mypy From 962b5377c025e3ea5c42a35da4a16f491e410024 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Mon, 8 Apr 2024 19:42:20 +0200 Subject: [PATCH 04/22] black -> ruff format + cleanup (#1639) * black -> ruff + cleanup * format * Preserve git blame * pre-commit fix --- .flake8 | 2 -- .git-blame-ignore-revs | 2 ++ .pre-commit-config.yaml | 17 ++++------- bench/compress_normal.py | 1 - pyproject.toml | 18 ++--------- src/zarr/_storage/absstore.py | 3 +- src/zarr/_storage/store.py | 1 - src/zarr/_storage/v3.py | 1 - src/zarr/attrs.py | 6 ---- src/zarr/convenience.py | 20 +++--------- src/zarr/core.py | 2 +- src/zarr/creation.py | 2 -- src/zarr/hierarchy.py | 46 ++++++++++++++-------------- src/zarr/indexing.py | 35 ++++----------------- src/zarr/meta.py | 1 - src/zarr/n5.py | 57 ----------------------------------- src/zarr/storage.py | 9 +----- src/zarr/util.py | 16 ++++------ src/zarr/v3/config.py | 1 - src/zarr/v3/group.py | 1 - src/zarr/v3/store/local.py | 3 +- src/zarr/v3/sync.py | 1 - tests/test_attrs.py | 6 ---- tests/test_convenience.py | 7 ----- tests/test_creation.py | 9 ------ tests/test_dim_separator.py | 1 - tests/test_filters.py | 12 -------- tests/test_group_v3.py | 2 -- tests/test_hierarchy.py | 3 -- tests/test_indexing.py | 35 --------------------- tests/test_info.py | 1 - tests/test_meta.py | 27 ++--------------- tests/test_storage.py | 1 - tests/test_sync.py | 2 -- tests/test_util.py | 2 -- tests/v3/test_metadata.py | 1 + 36 files changed, 58 insertions(+), 296 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 7da1f9608e..0000000000 --- a/.flake8 +++ /dev/null @@ -1,2 +0,0 @@ -[flake8] -max-line-length = 100 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 53bf4633f0..9e0316032f 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,4 @@ # lint codebase with black and ruff 4e348d6b80c96da461fd866576c971b8a659ba15 +# migrate from black to ruff format +22cea005629913208a85799372e045f353744add diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10aff8b4c6..d4aee4ce86 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,27 +7,22 @@ default_language_version: python: python3 repos: - repo: https://github.com/charliermarsh/ruff-pre-commit - # Ruff version. - rev: 'v0.0.224' + rev: 'v0.2.1' hooks: - id: ruff - # Respect `exclude` and `extend-exclude` settings. 
- args: ["--force-exclude"] - - repo: https://github.com/psf/black - rev: 22.12.0 - hooks: - - id: black + args: ["--fix", "--show-fixes"] + - id: ruff-format - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-yaml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.7.1 + rev: v1.8.0 hooks: - id: mypy files: src diff --git a/bench/compress_normal.py b/bench/compress_normal.py index 9f1655541c..803d54b76b 100644 --- a/bench/compress_normal.py +++ b/bench/compress_normal.py @@ -8,7 +8,6 @@ from zarr import blosc if __name__ == "__main__": - sys.path.insert(0, "..") # setup diff --git a/pyproject.toml b/pyproject.toml index 3933376b12..9f21a84aee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,7 +127,8 @@ serve = "sphinx-autobuild docs docs/_build --ignore 'docs/_autoapi/**/*' --host [tool.ruff] line-length = 100 -exclude = [ +force-exclude = true +extend-exclude = [ ".bzr", ".direnv", ".eggs", @@ -146,21 +147,6 @@ exclude = [ "docs" ] -[tool.black] -line-length = 100 -exclude = ''' -/( - \.git - | \.mypy_cache - | \.venv - | _build - | buck-out - | build - | dist - | docs -)/ -''' - [tool.mypy] python_version = "3.8" ignore_missing_imports = true diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py index f8382714cb..5fd709d02a 100644 --- a/src/zarr/_storage/absstore.py +++ b/src/zarr/_storage/absstore.py @@ -88,7 +88,7 @@ def __init__( "https://{}.blob.core.windows.net/".format(account_name), container, credential=account_key, - **blob_service_kwargs + **blob_service_kwargs, ) self.client = client @@ -244,7 +244,6 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def rmdir(self, path=None): - if not path: # Currently allowing clear to delete everything as in v2 diff --git a/src/zarr/_storage/store.py b/src/zarr/_storage/store.py index cacb265bfd..44a22ae34e 100644 --- a/src/zarr/_storage/store.py +++ b/src/zarr/_storage/store.py @@ -649,7 +649,6 @@ def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" ) def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: - meta_dir = meta_root + path meta_dir = meta_dir.rstrip("/") _rmdir_from_keys(store, meta_dir) diff --git a/src/zarr/_storage/v3.py b/src/zarr/_storage/v3.py index 1d909df792..a9dbbee743 100644 --- a/src/zarr/_storage/v3.py +++ b/src/zarr/_storage/v3.py @@ -128,7 +128,6 @@ def _get_files_and_dirs_from_path(store, path): "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" ) class FSStoreV3(FSStore, StoreV3): - # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) _META_KEYS = () diff --git a/src/zarr/attrs.py b/src/zarr/attrs.py index 01fc617b3c..e967c5b853 100644 --- a/src/zarr/attrs.py +++ b/src/zarr/attrs.py @@ -26,7 +26,6 @@ class Attributes(MutableMapping): """ def __init__(self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None): - self._version = getattr(store, "_store_version", 2) _Store = Store if self._version == 2 else StoreV3 self.store = _Store._ensure_store(store) @@ -73,7 +72,6 @@ def __getitem__(self, item): return self.asdict()[item] def _write_op(self, f, *args, **kwargs): - # guard condition if self.read_only: raise PermissionError("attributes are 
read-only") @@ -89,7 +87,6 @@ def __setitem__(self, item, value): self._write_op(self._setitem_nosync, item, value) def _setitem_nosync(self, item, value): - # load existing data d = self._get_nosync() @@ -106,7 +103,6 @@ def __delitem__(self, item): self._write_op(self._delitem_nosync, item) def _delitem_nosync(self, key): - # load existing data d = self._get_nosync() @@ -128,7 +124,6 @@ def put(self, d): self._write_op(self._put_nosync, dict(attributes=d)) def _put_nosync(self, d): - d_to_check = d if self._version == 2 else d["attributes"] if not all(isinstance(item, str) for item in d_to_check): # TODO: Raise an error for non-string keys @@ -178,7 +173,6 @@ def update(self, *args, **kwargs): self._write_op(self._update_nosync, *args, **kwargs) def _update_nosync(self, *args, **kwargs): - # load existing data d = self._get_nosync() diff --git a/src/zarr/convenience.py b/src/zarr/convenience.py index 0ee8a8d323..9c0deeea47 100644 --- a/src/zarr/convenience.py +++ b/src/zarr/convenience.py @@ -675,10 +675,8 @@ def copy_store( # setup logging with _LogWriter(log) as log: - # iterate over source keys for source_key in sorted(source.keys()): - # filter to keys under source path if source_store_version == 2: if not source_key.startswith(source_path): @@ -757,7 +755,7 @@ def copy( log=None, if_exists="raise", dry_run=False, - **create_kws + **create_kws, ): """Copy the `source` array or group into the `dest` group. @@ -878,7 +876,6 @@ def copy( # setup logging with _LogWriter(log) as log: - # do the copying n_copied, n_skipped, n_bytes_copied = _copy( log, @@ -890,7 +887,7 @@ def copy( without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) # log a final message with a summary of what happened @@ -948,12 +945,10 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ # take action if do_copy: - # log a message about what we're going to do log("copy {} {} {}".format(source.name, source.shape, source.dtype)) if not dry_run: - # clear the way if exists: del dest[name] @@ -1038,12 +1033,10 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ # take action if do_copy: - # log action log("copy {}".format(source.name)) if not dry_run: - # clear the way if exists_array: del dest[name] @@ -1056,7 +1049,6 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ grp.attrs.update(source.attrs) else: - # setup for dry run without creating any groups in the # destination if dest is not None: @@ -1076,7 +1068,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) n_copied += c n_skipped += s @@ -1099,7 +1091,7 @@ def copy_all( log=None, if_exists="raise", dry_run=False, - **create_kws + **create_kws, ): """Copy all children of the `source` group into the `dest` group. 
@@ -1189,7 +1181,6 @@ def copy_all( # setup logging with _LogWriter(log) as log: - for k in source.keys(): c, s, b = _copy( log, @@ -1201,7 +1192,7 @@ def copy_all( without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) n_copied += c n_skipped += s @@ -1262,7 +1253,6 @@ def is_zarr_key(key): return key.endswith(".zarray") or key.endswith(".zgroup") or key.endswith(".zattrs") else: - assert_zarr_v3_api_available() sfx = _get_metadata_suffix(store) # type: ignore diff --git a/src/zarr/core.py b/src/zarr/core.py index c07a31e95f..d22a9d79c3 100644 --- a/src/zarr/core.py +++ b/src/zarr/core.py @@ -2536,7 +2536,7 @@ def hexdigest(self, hashname="sha1"): checksum = binascii.hexlify(self.digest(hashname=hashname)) # This is a bytes object on Python 3 and we want a str. - if type(checksum) is not str: + if not isinstance(checksum, str): checksum = checksum.decode("utf8") return checksum diff --git a/src/zarr/creation.py b/src/zarr/creation.py index 726d0b5932..6227f90b7b 100644 --- a/src/zarr/creation.py +++ b/src/zarr/creation.py @@ -234,7 +234,6 @@ def create( def _kwargs_compat(compressor, fill_value, kwargs): - # to be compatible with h5py, as well as backwards-compatible with Zarr # 1.x, accept 'compression' and 'compression_opts' keyword arguments @@ -697,7 +696,6 @@ def open_array( def _like_args(a, kwargs): - shape, chunks = _get_shape_chunks(a) if shape is not None: kwargs.setdefault("shape", shape) diff --git a/src/zarr/hierarchy.py b/src/zarr/hierarchy.py index 3361969f08..1c9848e647 100644 --- a/src/zarr/hierarchy.py +++ b/src/zarr/hierarchy.py @@ -145,7 +145,7 @@ def __init__( synchronizer=None, zarr_version=None, *, - meta_array=None + meta_array=None, ): store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) if zarr_version is None: @@ -591,7 +591,25 @@ def groups(self): for key in sorted(listdir(self._store, self._path)): path = self._key_prefix + key if contains_group(self._store, path, explicit_only=False): - yield key, Group( + yield ( + key, + Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + ), + ) + + else: + for key in self.group_keys(): + path = self._key_prefix + key + yield ( + key, + Group( self._store, path=path, read_only=self._read_only, @@ -599,19 +617,7 @@ def groups(self): cache_attrs=self.attrs.cache, synchronizer=self._synchronizer, zarr_version=self._version, - ) - - else: - for key in self.group_keys(): - path = self._key_prefix + key - yield key, Group( - self._store, - path=path, - read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer, - zarr_version=self._version, + ), ) def array_keys(self, recurse=False): @@ -919,7 +925,6 @@ def tree(self, expand=False, level=None): return TreeViewer(self, expand=expand, level=level) def _write_op(self, f, *args, **kwargs): - # guard condition if self._read_only: raise ReadOnlyError() @@ -1094,7 +1099,6 @@ def create_dataset(self, name, **kwargs): return self._write_op(self._create_dataset_nosync, name, **kwargs) def _create_dataset_nosync(self, name, data=None, **kwargs): - assert "mode" not in kwargs path = self._item_path(name) @@ -1138,11 +1142,9 @@ def require_dataset(self, name, shape, dtype=None, exact=False, **kwargs): ) def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, **kwargs): - path = self._item_path(name) if 
contains_array(self._store, path): - # array already exists at path, validate that it is the right shape and type synchronizer = kwargs.get("synchronizer", self._synchronizer) @@ -1235,7 +1237,7 @@ def _full_nosync(self, name, fill_value, **kwargs): path=path, chunk_store=self._chunk_store, fill_value=fill_value, - **kwargs + **kwargs, ) def array(self, name, data, **kwargs): @@ -1361,7 +1363,7 @@ def group( path=None, *, zarr_version=None, - meta_array=None + meta_array=None, ): """Create a group. @@ -1452,7 +1454,7 @@ def open_group( storage_options=None, *, zarr_version=None, - meta_array=None + meta_array=None, ): """Open a group using file-mode-like semantics. diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 487cc8b9d9..b72d5a255d 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -111,7 +111,6 @@ def is_pure_orthogonal_indexing(selection, ndim): def normalize_integer_selection(dim_sel, dim_len): - # normalize type to int dim_sel = int(dim_sel) @@ -145,7 +144,6 @@ def normalize_integer_selection(dim_sel, dim_len): class IntDimIndexer: def __init__(self, dim_sel, dim_len, dim_chunk_len): - # normalize dim_sel = normalize_integer_selection(dim_sel, dim_len) @@ -169,7 +167,6 @@ def ceildiv(a, b): class SliceDimIndexer: def __init__(self, dim_sel, dim_len, dim_chunk_len): - # normalize self.start, self.stop, self.step = dim_sel.indices(dim_len) if self.step < 1: @@ -182,14 +179,12 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) def __iter__(self): - # figure out the range of chunks we need to visit dim_chunk_ix_from = self.start // self.dim_chunk_len dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) # iterate over chunks in range for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - # compute offsets for chunk within overall array dim_offset = dim_chunk_ix * self.dim_chunk_len dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) @@ -237,7 +232,6 @@ def check_selection_length(selection, shape): def replace_ellipsis(selection, shape): - selection = ensure_tuple(selection) # count number of ellipsis present @@ -330,14 +324,12 @@ def is_basic_selection(selection): # noinspection PyProtectedMember class BasicIndexer: def __init__(self, selection, array): - # handle ellipsis selection = replace_ellipsis(selection, array._shape) # setup per-dimension indexers dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -358,7 +350,6 @@ def __init__(self, selection, array): def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) out_selection = tuple( @@ -370,7 +361,6 @@ def __iter__(self): class BoolArrayDimIndexer: def __init__(self, dim_sel, dim_len, dim_chunk_len): - # check number of dimensions if not is_bool_array(dim_sel, 1): raise IndexError( @@ -380,8 +370,9 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # check shape if dim_sel.shape[0] != dim_len: raise IndexError( - "Boolean array has the wrong length for dimension; " - "expected {}, got {}".format(dim_len, dim_sel.shape[0]) + "Boolean array has the wrong length for dimension; " "expected {}, got {}".format( + dim_len, dim_sel.shape[0] + ) ) # store attributes @@ -402,10 +393,8 @@ def __init__(self, dim_sel, dim_len, 
dim_chunk_len): self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] def __iter__(self): - # iterate over chunks with at least one item for dim_chunk_ix in self.dim_chunk_ixs: - # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] @@ -472,7 +461,6 @@ def __init__( boundscheck=True, order=Order.UNKNOWN, ): - # ensure 1d array dim_sel = np.asanyarray(dim_sel) if not is_integer_array(dim_sel, 1): @@ -526,9 +514,7 @@ def __init__( self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) def __iter__(self): - for dim_chunk_ix in self.dim_chunk_ixs: - # find region in output if dim_chunk_ix == 0: start = 0 @@ -602,7 +588,6 @@ def oindex_set(a, selection, value): # noinspection PyProtectedMember class OrthogonalIndexer: def __init__(self, selection, array): - # handle ellipsis selection = replace_ellipsis(selection, array._shape) @@ -612,7 +597,6 @@ def __init__(self, selection, array): # setup per-dimension indexers dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -649,7 +633,6 @@ def __init__(self, selection, array): def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) out_selection = tuple( @@ -658,7 +641,6 @@ def __iter__(self): # handle advanced indexing arrays orthogonally if self.is_advanced: - # N.B., numpy doesn't support orthogonal indexing directly as yet, # so need to work around via np.ix_. Also np.ix_ does not support a # mixture of arrays and slices or integers, so need to convert slices @@ -692,7 +674,6 @@ def __setitem__(self, selection, value): # noinspection PyProtectedMember class BlockIndexer: def __init__(self, selection, array): - # handle ellipsis selection = replace_ellipsis(selection, array._shape) @@ -794,7 +775,6 @@ def is_mask_selection(selection, array): # noinspection PyProtectedMember class CoordinateIndexer: def __init__(self, selection, array): - # some initial normalization selection = ensure_tuple(selection) selection = tuple([i] if is_integer(i) else i for i in selection) @@ -810,7 +790,6 @@ def __init__(self, selection, array): # handle wraparound, boundscheck for dim_sel, dim_len in zip(selection, array.shape): - # handle wraparound wraparound_indices(dim_sel, dim_len) @@ -861,10 +840,8 @@ def __init__(self, selection, array): self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) def __iter__(self): - # iterate over chunks for i, chunk_rix in enumerate(self.chunk_rixs): - chunk_coords = tuple(m[i] for m in self.chunk_mixs) if chunk_rix == 0: start = 0 @@ -891,7 +868,6 @@ def __iter__(self): # noinspection PyProtectedMember class MaskIndexer(CoordinateIndexer): def __init__(self, selection, array): - # some initial normalization selection = ensure_tuple(selection) selection = replace_lists(selection) @@ -944,8 +920,9 @@ def check_fields(fields, dtype): # check type if not isinstance(fields, (str, list, tuple)): raise IndexError( - "'fields' argument must be a string or list of strings; found " - "{!r}".format(type(fields)) + "'fields' argument must be a string or list of strings; found " "{!r}".format( + type(fields) + ) ) if fields: if dtype.names is None: diff --git a/src/zarr/meta.py b/src/zarr/meta.py index 939d882f4e..34a4f33d1e 100644 --- 
a/src/zarr/meta.py +++ b/src/zarr/meta.py @@ -90,7 +90,6 @@ class Metadata2: @classmethod def parse_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - # Here we allow that a store may return an already-parsed metadata object, # or a string of JSON that we will parse here. We allow for an already-parsed # object to accommodate a consolidated metadata store, where all the metadata for diff --git a/src/zarr/n5.py b/src/zarr/n5.py index 7e73905527..44b44e69e2 100644 --- a/src/zarr/n5.py +++ b/src/zarr/n5.py @@ -72,21 +72,18 @@ class N5Store(NestedDirectoryStore): def __getitem__(self, key: str) -> bytes: if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) return json_dumps(value) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) top_level = key == zarr_array_meta_key value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) return json_dumps(value) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) value = attrs_to_zarr(self._load_n5_attrs(key_new)) @@ -104,9 +101,7 @@ def __getitem__(self, key: str) -> bytes: return super().__getitem__(key_new) def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -115,7 +110,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) top_level = key == zarr_array_meta_key n5_attrs = self._load_n5_attrs(key_new) @@ -123,7 +117,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -166,9 +159,7 @@ def __delitem__(self, key: str): super().__delitem__(key_new) def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) if key_new not in self: return False @@ -176,18 +167,15 @@ def __contains__(self, key): return "dimensions" not in self._load_n5_attrs(key_new) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) # array if attributes contain 'dimensions' return "dimensions" in self._load_n5_attrs(key_new) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) return self._contains_attrs(key_new) elif is_chunk_key(key): - key_new = invert_chunk_coords(key) else: key_new = key @@ -198,7 +186,6 @@ def __eq__(self, other): return isinstance(other, N5Store) and self.path == other.path def listdir(self, path: Optional[str] = None): - if path is not None: path = invert_chunk_coords(path) path = cast(str, path) @@ -208,7 +195,6 @@ def listdir(self, path: Optional[str] = None): children = super().listdir(path=path) if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files children.remove(n5_attrs_key) children.append(zarr_array_meta_key) @@ -234,7 +220,6 @@ def listdir(self, path: Optional[str] = None): return sorted(new_children) elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files children.remove(n5_attrs_key) children.append(zarr_group_meta_key) @@ -244,7 +229,6 @@ def listdir(self, path: Optional[str] = None): 
return sorted(children) else: - return children def _load_n5_attrs(self, path: str) -> Dict[str, Any]: @@ -255,7 +239,6 @@ def _load_n5_attrs(self, path: str) -> Dict[str, Any]: return {} def _is_group(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -265,7 +248,6 @@ def _is_group(self, path: str): return len(n5_attrs) > 0 and "dimensions" not in n5_attrs def _is_array(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -274,7 +256,6 @@ def _is_array(self, path: str): return "dimensions" in self._load_n5_attrs(attrs_key) def _contains_attrs(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -376,21 +357,18 @@ def _normalize_key(self, key: str): def __getitem__(self, key: str) -> bytes: if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) return json_dumps(value) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) top_level = key == zarr_array_meta_key value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) return json_dumps(value) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) value = attrs_to_zarr(self._load_n5_attrs(key_new)) @@ -409,7 +387,6 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: str, value: Any): if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) n5_attrs = self._load_n5_attrs(key_new) @@ -418,7 +395,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) top_level = key == zarr_array_meta_key n5_attrs = self._load_n5_attrs(key_new) @@ -427,7 +403,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -456,7 +431,6 @@ def __setitem__(self, key: str, value: Any): super().__setitem__(key_new, value) def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): key_new = key.replace(zarr_group_meta_key, self._group_meta_key) elif key.endswith(zarr_array_meta_key): @@ -471,7 +445,6 @@ def __delitem__(self, key: str): def __contains__(self, key: Any): if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) if key_new not in self: return False @@ -479,13 +452,11 @@ def __contains__(self, key: Any): return "dimensions" not in self._load_n5_attrs(key_new) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) # array if attributes contain 'dimensions' return "dimensions" in self._load_n5_attrs(key_new) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) return self._contains_attrs(key_new) @@ -508,7 +479,6 @@ def listdir(self, path: Optional[str] = None): # doesn't provide. 
children = super().listdir(path=path) if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files children.remove(self._array_meta_key) children.append(zarr_array_meta_key) @@ -532,7 +502,6 @@ def listdir(self, path: Optional[str] = None): return sorted(new_children) elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files children.remove(self._group_meta_key) children.append(zarr_group_meta_key) @@ -550,7 +519,6 @@ def _load_n5_attrs(self, path: str): return {} def _is_group(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -560,7 +528,6 @@ def _is_group(self, path: Optional[str]): return len(n5_attrs) > 0 and "dimensions" not in n5_attrs def _is_array(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -569,7 +536,6 @@ def _is_array(self, path: Optional[str]): return "dimensions" in self._load_n5_attrs(attrs_key) def _contains_attrs(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -712,7 +678,6 @@ def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: return {"type": "raw"} else: @@ -726,19 +691,16 @@ def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict n5_config = {"type": codec_id} if codec_id == "bz2": - n5_config["type"] = "bzip2" n5_config["blockSize"] = _compressor_config["level"] elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] n5_config["clevel"] = _compressor_config["clevel"] n5_config["shuffle"] = _compressor_config["shuffle"] n5_config["blocksize"] = _compressor_config["blocksize"] elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. # Note: 4 is the default, which is lzma.CHECK_CRC64. 
if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: @@ -760,50 +722,42 @@ def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict n5_config["preset"] = 6 elif codec_id == "zlib": - n5_config["type"] = "gzip" n5_config["level"] = _compressor_config["level"] n5_config["useZlib"] = True elif codec_id == "gzip": - n5_config["type"] = "gzip" n5_config["level"] = _compressor_config["level"] n5_config["useZlib"] = False else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) return n5_config def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] zarr_config = {"id": codec_id} if codec_id == "bzip2": - zarr_config["id"] = "bz2" zarr_config["level"] = compressor_config["blockSize"] elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] zarr_config["clevel"] = compressor_config["clevel"] zarr_config["shuffle"] = compressor_config["shuffle"] zarr_config["blocksize"] = compressor_config["blocksize"] elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] zarr_config["check"] = compressor_config["check"] zarr_config["preset"] = compressor_config["preset"] zarr_config["filters"] = compressor_config["filters"] elif codec_id == "xz": - zarr_config["id"] = "lzma" zarr_config["format"] = 1 # lzma.FORMAT_XZ zarr_config["check"] = -1 @@ -811,7 +765,6 @@ def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dic zarr_config["filters"] = None elif codec_id == "gzip": - if "useZlib" in compressor_config and compressor_config["useZlib"]: zarr_config["id"] = "zlib" zarr_config["level"] = compressor_config["level"] @@ -820,22 +773,18 @@ def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dic zarr_config["level"] = compressor_config["level"] elif codec_id == "raw": - return None else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) return zarr_config class N5ChunkWrapper(Codec): - codec_id = "n5_wrapper" def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) self.chunk_shape = tuple(chunk_shape) # is the dtype a little endian format? 
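As an aside, a hedged sketch of the compressor-config translation that these (otherwise formatting-only) n5.py hunks pass through: zarr.n5 exposes compressor_config_to_n5 and compressor_config_to_zarr, and for a Blosc config the two conversions should round-trip. The field names follow the surrounding diff; treat the exact behaviour as an assumption rather than a guarantee.

from zarr.n5 import compressor_config_to_n5, compressor_config_to_zarr

zarr_cfg = {"id": "blosc", "cname": "lz4", "clevel": 5, "shuffle": 1, "blocksize": 0}
n5_cfg = compressor_config_to_n5(zarr_cfg)  # expected: {"type": "blosc", "cname": "lz4", ...}
assert compressor_config_to_zarr(n5_cfg) == zarr_cfg  # round-trips back to the zarr-style dict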
@@ -860,7 +809,6 @@ def get_config(self): return config def encode(self, chunk): - assert chunk.flags.c_contiguous header = self._create_header(chunk) @@ -872,12 +820,10 @@ def encode(self, chunk): return header + chunk.tobytes(order="A") def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) chunk = chunk[len_header:] if out is not None: - # out should only be used if we read a complete chunk assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( self.chunk_shape, chunk_shape @@ -895,7 +841,6 @@ def decode(self, chunk, out=None) -> bytes: return out else: - if self._compressor: chunk = self._compressor.decode(chunk) @@ -915,7 +860,6 @@ def decode(self, chunk, out=None) -> bytes: @staticmethod def _create_header(chunk): - mode = struct.pack(">H", 0) num_dims = struct.pack(">H", len(chunk.shape)) shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) @@ -924,7 +868,6 @@ def _create_header(chunk): @staticmethod def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] shape = tuple( struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index b36f804ebd..e7bd0c4cf4 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -482,7 +482,6 @@ def _init_array_metadata( dimension_separator=None, storage_transformers=(), ): - store_version = getattr(store, "_store_version", 2) path = normalize_storage_path(path) @@ -687,7 +686,6 @@ def _init_group_metadata( path: Optional[str] = None, chunk_store: Optional[StoreLike] = None, ): - store_version = getattr(store, "_store_version", 2) path = normalize_storage_path(path) @@ -1055,7 +1053,6 @@ class DirectoryStore(Store): """ def __init__(self, path, normalize_keys=False, dimension_separator=None): - # guard conditions path = os.path.abspath(path) if os.path.exists(path) and not os.path.isdir(path): @@ -1415,7 +1412,6 @@ def _normalize_key(self, key): def getitems( self, keys: Sequence[str], *, contexts: Mapping[str, Context] ) -> Mapping[str, Any]: - keys_transformed = [self._normalize_key(key) for key in keys] results = self.map.getitems(keys_transformed, on_error="omit") # The function calling this method may not recognize the transformed keys @@ -1768,7 +1764,6 @@ def __init__( mode="a", dimension_separator=None, ): - # store properties path = os.path.abspath(path) self.path = path @@ -2707,9 +2702,7 @@ def listdir(self, path=None): SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m FROM zarr WHERE k LIKE (? 
|| "{sep}%") ) ORDER BY l ASC - """.format( - sep=sep - ), + """.format(sep=sep), (path, path), ) keys = list(map(operator.itemgetter(0), keys)) diff --git a/src/zarr/util.py b/src/zarr/util.py index ea0dd9fcec..270a444524 100644 --- a/src/zarr/util.py +++ b/src/zarr/util.py @@ -180,7 +180,6 @@ def normalize_chunks(chunks: Any, shape: Tuple[int, ...], typesize: int) -> Tupl def normalize_dtype(dtype: Union[str, np.dtype], object_codec) -> Tuple[np.dtype, Any]: - # convenience API for object arrays if inspect.isclass(dtype): dtype = dtype.__name__ # type: ignore @@ -245,7 +244,6 @@ def is_total_slice(item, shape: Tuple[int]) -> bool: def normalize_resize_args(old_shape, *args): - # normalize new shape argument if len(args) == 1: new_shape = args[0] @@ -294,7 +292,6 @@ def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]: def normalize_fill_value(fill_value, dtype: np.dtype): - if fill_value is None or dtype.hasobject: # no fill value pass @@ -309,8 +306,9 @@ def normalize_fill_value(fill_value, dtype: np.dtype): if not isinstance(fill_value, str): raise ValueError( - "fill_value {!r} is not valid for dtype {}; must be a " - "unicode string".format(fill_value, dtype) + "fill_value {!r} is not valid for dtype {}; must be a " "unicode string".format( + fill_value, dtype + ) ) else: @@ -324,15 +322,15 @@ def normalize_fill_value(fill_value, dtype: np.dtype): except Exception as e: # re-raise with our own error message to be helpful raise ValueError( - "fill_value {!r} is not valid for dtype {}; nested " - "exception: {}".format(fill_value, dtype, e) + "fill_value {!r} is not valid for dtype {}; nested " "exception: {}".format( + fill_value, dtype, e + ) ) return fill_value def normalize_storage_path(path: Union[str, bytes, None]) -> str: - # handle bytes if isinstance(path, bytes): path = str(path, "ascii") @@ -342,7 +340,6 @@ def normalize_storage_path(path: Union[str, bytes, None]) -> str: path = str(path) if path: - # convert backslash to forward slash path = path.replace("\\", "/") @@ -506,7 +503,6 @@ def tree_widget(group, expand, level): class TreeViewer: def __init__(self, group, expand=False, level=None): - self.group = group self.expand = expand self.level = level diff --git a/src/zarr/v3/config.py b/src/zarr/v3/config.py index 98a25994c4..cebe5c1b09 100644 --- a/src/zarr/v3/config.py +++ b/src/zarr/v3/config.py @@ -43,7 +43,6 @@ def __init__( concurrency: Optional[int] = None, asyncio_loop: Optional[AbstractEventLoop] = None, ): - order_parsed = parse_indexing_order(order) concurrency_parsed = parse_concurrency(concurrency) asyncio_loop_parsed = parse_asyncio_loop(asyncio_loop) diff --git a/src/zarr/v3/group.py b/src/zarr/v3/group.py index 0012a77a81..fcd2fea215 100644 --- a/src/zarr/v3/group.py +++ b/src/zarr/v3/group.py @@ -151,7 +151,6 @@ async def getitem( self, key: str, ) -> Union[AsyncArray, AsyncGroup]: - store_path = self.store_path / key if self.metadata.zarr_format == 3: diff --git a/src/zarr/v3/store/local.py b/src/zarr/v3/store/local.py index c3da110450..5d22b30e9a 100644 --- a/src/zarr/v3/store/local.py +++ b/src/zarr/v3/store/local.py @@ -46,7 +46,6 @@ def _put( class LocalStore(Store): - supports_writes: bool = True supports_partial_writes: bool = True supports_listing: bool = True @@ -126,6 +125,7 @@ async def list(self) -> List[str]: ------- list[str] """ + # Q: do we want to return strings or Paths? 
def _list(root: Path) -> List[str]: files = [str(p) for p in root.rglob("") if p.is_file()] @@ -166,7 +166,6 @@ async def list_dir(self, prefix: str) -> List[str]: """ def _list_dir(root: Path, prefix: str) -> List[str]: - base = root / prefix to_strip = str(base) + "/" try: diff --git a/src/zarr/v3/sync.py b/src/zarr/v3/sync.py index fcc8e7b275..2e94a815cc 100644 --- a/src/zarr/v3/sync.py +++ b/src/zarr/v3/sync.py @@ -103,7 +103,6 @@ def _get_loop(): class SyncMixin: - _sync_configuration: SyncConfiguration def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: diff --git a/tests/test_attrs.py b/tests/test_attrs.py index a5ce4bac89..7e3377f664 100644 --- a/tests/test_attrs.py +++ b/tests/test_attrs.py @@ -30,7 +30,6 @@ def init_attributes(self, store, read_only=False, cache=True, zarr_version=2): return Attributes(store, key=root + "attrs", read_only=read_only, cache=cache) def test_storage(self, zarr_version): - store = _init_store(zarr_version) root = ".z" if zarr_version == 2 else meta_root attrs_key = root + "attrs" @@ -50,7 +49,6 @@ def test_storage(self, zarr_version): assert dict(foo="bar", baz=42) == d def test_utf8_encoding(self, zarr_version): - project_root = pathlib.Path(zarr.__file__).resolve().parent.parent fixdir = project_root / "fixture" testdir = fixdir / "utf8attrs" @@ -67,7 +65,6 @@ def test_utf8_encoding(self, zarr_version): assert fixture["utf8attrs"].attrs.asdict() == dict(foo="た") def test_get_set_del_contains(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert "foo" not in a @@ -84,7 +81,6 @@ def test_get_set_del_contains(self, zarr_version): a["foo"] def test_update_put(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert "foo" not in a @@ -102,7 +98,6 @@ def test_update_put(self, zarr_version): assert "baz" not in a def test_iterators(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert 0 == len(a) @@ -232,7 +227,6 @@ def test_caching_on(self, zarr_version): assert get_cnt == store.counter["__getitem__", attrs_key] def test_caching_off(self, zarr_version): - # setup store store = CountingDict() if zarr_version == 2 else CountingDictV3() attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" diff --git a/tests/test_convenience.py b/tests/test_convenience.py index 0970a9e1aa..7cb4db7a35 100644 --- a/tests/test_convenience.py +++ b/tests/test_convenience.py @@ -57,7 +57,6 @@ def _init_creation_kwargs(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) def test_open_array(path_type, zarr_version): - store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) @@ -86,7 +85,6 @@ def test_open_array(path_type, zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) def test_open_group(path_type, zarr_version): - store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) @@ -210,7 +208,6 @@ def test_tree(zarr_version): def test_consolidate_metadata( with_chunk_store, zarr_version, listable, monkeypatch, stores_from_path ): - # setup initial data if stores_from_path: store = tempfile.mkdtemp() @@ -399,7 +396,6 @@ def test_save_array_separator(tmpdir, options): class TestCopyStore(unittest.TestCase): - _version = 2 def setUp(self): @@ -536,7 +532,6 @@ def test_if_exists(self): @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") class 
TestCopyStoreV3(TestCopyStore): - _version = 3 def setUp(self): @@ -557,7 +552,6 @@ def test_mismatched_store_versions(self): def check_copied_array(original, copied, without_attrs=False, expect_props=None): - # setup source_h5py = original.__module__.startswith("h5py.") dest_h5py = copied.__module__.startswith("h5py.") @@ -621,7 +615,6 @@ def check_copied_array(original, copied, without_attrs=False, expect_props=None) def check_copied_group(original, copied, without_attrs=False, expect_props=None, shallow=False): - # setup if expect_props is None: expect_props = dict() diff --git a/tests/test_creation.py b/tests/test_creation.py index 9307b81b52..27ce00bc8a 100644 --- a/tests/test_creation.py +++ b/tests/test_creation.py @@ -74,7 +74,6 @@ def _init_creation_kwargs(zarr_version, at_root=True): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_array(zarr_version, at_root): - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version kwargs = _init_creation_kwargs(zarr_version, at_root) @@ -213,7 +212,6 @@ def test_full_additional_dtypes(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_open_array(zarr_version, at_root, dimension_separator): - store = "data/array.zarr" kwargs = _init_creation_kwargs(zarr_version, at_root) @@ -329,7 +327,6 @@ def test_open_array(zarr_version, at_root, dimension_separator): def test_open_array_none(): - # open with both store and zarr_version = None z = open_array(mode="w", shape=100, chunks=10) assert isinstance(z, Array) @@ -339,7 +336,6 @@ def test_open_array_none(): @pytest.mark.parametrize("dimension_separator", [".", "/", None]) @pytest.mark.parametrize("zarr_version", _VERSIONS2) def test_open_array_infer_separator_from_store(zarr_version, dimension_separator): - if zarr_version == 3: StoreClass = DirectoryStoreV3 path = "data" @@ -370,7 +366,6 @@ def test_open_array_infer_separator_from_store(zarr_version, dimension_separator # TODO: N5 support for v3 @pytest.mark.parametrize("zarr_version", [None, 2]) def test_open_array_n5(zarr_version): - store = "data/array.zarr" kwargs = _init_creation_kwargs(zarr_version) @@ -409,7 +404,6 @@ def test_open_array_n5(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_open_array_dict_store(zarr_version, at_root): - # dict will become a KVStore store = dict() kwargs = _init_creation_kwargs(zarr_version, at_root) @@ -503,7 +497,6 @@ def test_empty_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_zeros_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version @@ -529,7 +522,6 @@ def test_zeros_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_ones_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version @@ -556,7 +548,6 @@ def test_ones_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_full_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) 
expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version diff --git a/tests/test_dim_separator.py b/tests/test_dim_separator.py index 83f4d3b5b9..4276d1829d 100644 --- a/tests/test_dim_separator.py +++ b/tests/test_dim_separator.py @@ -46,7 +46,6 @@ def dataset(tmpdir, request): static = project_root / "fixture" / suffix if not static.exists(): # pragma: no cover - if "nested" in which: # No way to reproduce the nested_legacy file via code generator = NestedDirectoryStore diff --git a/tests/test_filters.py b/tests/test_filters.py index d55be9145f..fc63cdca8d 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -30,7 +30,6 @@ def test_array_with_delta_filter(): - # setup astype = "u1" dtype = "i8" @@ -38,7 +37,6 @@ def test_array_with_delta_filter(): data = np.arange(100, dtype=dtype) for compressor in compressors: - a = array(data, chunks=10, compressor=compressor, filters=filters) # check round-trip @@ -57,7 +55,6 @@ def test_array_with_delta_filter(): def test_array_with_astype_filter(): - # setup encode_dtype = "i1" decode_dtype = "i8" @@ -68,7 +65,6 @@ def test_array_with_astype_filter(): data = np.arange(shape, dtype=decode_dtype) for compressor in compressors: - a = array(data, chunks=chunks, compressor=compressor, filters=filters) # check round-trip @@ -88,7 +84,6 @@ def test_array_with_astype_filter(): def test_array_with_scaleoffset_filter(): - # setup astype = "u1" dtype = "f8" @@ -97,7 +92,6 @@ def test_array_with_scaleoffset_filter(): data = np.linspace(1000, 1001, 34, dtype="f8") for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -116,7 +110,6 @@ def test_array_with_scaleoffset_filter(): def test_array_with_quantize_filter(): - # setup dtype = "f8" digits = 3 @@ -125,7 +118,6 @@ def test_array_with_quantize_filter(): data = np.linspace(0, 1, 34, dtype=dtype) for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -144,14 +136,12 @@ def test_array_with_quantize_filter(): def test_array_with_packbits_filter(): - # setup flt = PackBits() filters = [flt] data = np.random.randint(0, 2, size=100, dtype=bool) for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -170,14 +160,12 @@ def test_array_with_packbits_filter(): def test_array_with_categorize_filter(): - # setup data = np.random.choice(["foo", "bar", "baz"], size=100) flt = Categorize(dtype=data.dtype, labels=["foo", "bar", "baz"]) filters = [flt] for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip diff --git a/tests/test_group_v3.py b/tests/test_group_v3.py index 1498d6779b..f5b5dde86d 100644 --- a/tests/test_group_v3.py +++ b/tests/test_group_v3.py @@ -14,7 +14,6 @@ def store_path(tmpdir): def test_group(store_path) -> None: - agroup = AsyncGroup( metadata=GroupMetadata(), store_path=store_path, @@ -57,7 +56,6 @@ def test_group(store_path) -> None: def test_group_sync_constructor(store_path) -> None: - group = Group.create( store=store_path, attributes={"title": "test 123"}, diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 3eaa4743dd..6d4b1ff54c 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -1085,7 +1085,6 @@ def test_paths(self): g1.store.close() def test_pickle(self): - # setup group g = self.create_group() d = g.create_dataset("foo/bar", shape=100, chunks=10) @@ -1113,7 
+1112,6 @@ def test_pickle(self): g2.store.close() def test_context_manager(self): - with self.create_group() as g: d = g.create_dataset("foo/bar", shape=100, chunks=10) d[:] = np.arange(100) @@ -1375,7 +1373,6 @@ def create_store(): return store, None def test_context_manager(self): - with self.create_group() as g: store = g.store d = g.create_dataset("foo/bar", shape=100, chunks=10) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 1835206819..d441f3b8fa 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -17,7 +17,6 @@ def test_normalize_integer_selection(): - assert 1 == normalize_integer_selection(1, 100) assert 99 == normalize_integer_selection(-1, 100) with pytest.raises(IndexError): @@ -29,7 +28,6 @@ def test_normalize_integer_selection(): def test_replace_ellipsis(): - # 1D, single item assert (0,) == replace_ellipsis(0, (100,)) @@ -68,7 +66,6 @@ def test_replace_ellipsis(): def test_get_basic_selection_0d(): - # setup a = np.array(42) z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) @@ -191,7 +188,6 @@ def _test_get_basic_selection(a, z, selection): # noinspection PyStatementEffect def test_get_basic_selection_1d(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -264,7 +260,6 @@ def test_get_basic_selection_1d(): # noinspection PyStatementEffect def test_get_basic_selection_2d(): - # setup a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -423,7 +418,6 @@ def test_fancy_indexing_doesnt_mix_with_implicit_slicing(): def test_set_basic_selection_0d(): - # setup v = np.array(42) a = np.zeros_like(v) @@ -479,7 +473,6 @@ def _test_get_orthogonal_selection(a, z, selection): # noinspection PyStatementEffect def test_get_orthogonal_selection_1d_bool(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -502,7 +495,6 @@ def test_get_orthogonal_selection_1d_bool(): # noinspection PyStatementEffect def test_get_orthogonal_selection_1d_int(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -561,7 +553,6 @@ def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): # noinspection PyStatementEffect def test_get_orthogonal_selection_2d(): - # setup a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -570,7 +561,6 @@ def test_get_orthogonal_selection_2d(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) @@ -641,7 +631,6 @@ def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): def test_get_orthogonal_selection_3d(): - # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) @@ -650,7 +639,6 @@ def test_get_orthogonal_selection_3d(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) @@ -673,7 +661,6 @@ def test_get_orthogonal_selection_3d(): def test_orthogonal_indexing_edge_cases(): - a = np.arange(6).reshape(1, 2, 3) z = zarr.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) z[:] = a @@ -706,7 +693,6 @@ def 
_test_set_orthogonal_selection(v, a, z, selection): def test_set_orthogonal_selection_1d(): - # setup v = np.arange(1050, dtype=int) a = np.empty(v.shape, dtype=int) @@ -715,7 +701,6 @@ def test_set_orthogonal_selection_1d(): # test with different degrees of sparseness np.random.seed(42) for p in 0.5, 0.1, 0.01: - # boolean arrays ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) _test_set_orthogonal_selection(v, a, z, ix) @@ -734,7 +719,6 @@ def test_set_orthogonal_selection_1d(): def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): - selections = [ # index both axes with array (ix0, ix1), @@ -749,7 +733,6 @@ def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): def test_set_orthogonal_selection_2d(): - # setup v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) @@ -758,7 +741,6 @@ def test_set_orthogonal_selection_2d(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) @@ -780,7 +762,6 @@ def test_set_orthogonal_selection_2d(): def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): - selections = ( # single value (84, 42, 4), @@ -807,7 +788,6 @@ def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): def test_set_orthogonal_selection_3d(): - # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) a = np.empty_like(v) @@ -816,7 +796,6 @@ def test_set_orthogonal_selection_3d(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) @@ -888,7 +867,6 @@ def _test_get_coordinate_selection(a, z, selection): # noinspection PyStatementEffect def test_get_coordinate_selection_1d(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -932,7 +910,6 @@ def test_get_coordinate_selection_1d(): def test_get_coordinate_selection_2d(): - # setup a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -1027,7 +1004,6 @@ def test_set_coordinate_selection_1d(): def test_set_coordinate_selection_2d(): - # setup v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) @@ -1258,7 +1234,6 @@ def _test_get_mask_selection(a, z, selection): # noinspection PyStatementEffect def test_get_mask_selection_1d(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -1285,7 +1260,6 @@ def test_get_mask_selection_1d(): # noinspection PyStatementEffect def test_get_mask_selection_2d(): - # setup a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -1318,7 +1292,6 @@ def _test_set_mask_selection(v, a, z, selection): def test_set_mask_selection_1d(): - # setup v = np.arange(1050, dtype=int) a = np.empty_like(v) @@ -1338,7 +1311,6 @@ def test_set_mask_selection_1d(): def test_set_mask_selection_2d(): - # setup v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) @@ -1352,7 +1324,6 @@ def test_set_mask_selection_2d(): def test_get_selection_out(): - # basic selections a = np.arange(1050) z = zarr.create(shape=1050, chunks=100, dtype=a.dtype) @@ -1426,7 +1397,6 @@ def test_get_selection_out(): def test_get_selections_with_fields(): - a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), 
("ccc", 3, 12.6)] a = np.array(a, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None) @@ -1444,7 +1414,6 @@ def test_get_selections_with_fields(): ] for fields in fields_fixture: - # total selection expect = a[fields] actual = z.get_basic_selection(Ellipsis, fields=fields) @@ -1534,7 +1503,6 @@ def test_get_selections_with_fields(): def test_set_selections_with_fields(): - v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) a = np.empty_like(v) @@ -1553,7 +1521,6 @@ def test_set_selections_with_fields(): ] for fields in fields_fixture: - # currently multi-field assignment is not supported in numpy, so we won't support # it either if isinstance(fields, list) and len(fields) > 1: @@ -1567,7 +1534,6 @@ def test_set_selections_with_fields(): z.set_mask_selection([True, False, True], v, fields=fields) else: - if isinstance(fields, list) and len(fields) == 1: # work around numpy does not support multi-field assignment even if there # is only one field @@ -1752,7 +1718,6 @@ def test_accessed_chunks(shape, chunks, ops): z = zarr.create(shape=shape, chunks=chunks, store=store) for ii, (optype, slices) in enumerate(ops): - # Resolve the slices into the accessed chunks for each dimension chunks_per_dim = [] for N, C, sl in zip(shape, chunks, slices): diff --git a/tests/test_info.py b/tests/test_info.py index 7fb6feb11b..96eae999f4 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize("array_size", [10, 15000]) def test_info(array_size): - # setup g = zarr.group(store=dict(), chunk_store=dict(), synchronizer=zarr.ThreadSynchronizer()) g.create_group("foo") diff --git a/tests/test_meta.py b/tests/test_meta.py index db50560c8e..50f51929ef 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -34,7 +34,6 @@ def assert_json_equal(expect, actual): def test_encode_decode_array_1(): - meta = dict( shape=(100,), chunks=(10,), @@ -76,7 +75,6 @@ def test_encode_decode_array_1(): def test_encode_decode_array_2(): - # some variations df = Delta(astype=" Date: Mon, 8 Apr 2024 13:43:18 -0400 Subject: [PATCH 05/22] Remove outdated dev install docs from installation.rst and link to contributing.rst (#1643) Co-authored-by: Joe Hamman --- docs/installation.rst | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index 8553d451cb..3d4ac41072 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -19,13 +19,4 @@ latest GitHub main:: $ pip install git+https://github.com/zarr-developers/zarr-python.git -To work with Zarr source code in development, install from GitHub:: - - $ git clone --recursive https://github.com/zarr-developers/zarr-python.git - $ cd zarr-python - $ python -m pip install -e . - -To verify that Zarr has been fully installed, run the test suite:: - - $ pip install pytest - $ python -m pytest -v --pyargs zarr +To work with Zarr source code in development, see `Contributing `_. 
\ No newline at end of file From bbede29084e0577c766fdf50782bd8341826977f Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 10 Apr 2024 17:28:50 +0200 Subject: [PATCH 06/22] chore: remove old v3 implementation --- src/zarr/__init__.py | 16 - src/zarr/_storage/absstore.py | 59 +- src/zarr/_storage/store.py | 494 +------------- src/zarr/_storage/v3.py | 668 ------------------- src/zarr/_storage/v3_storage_transformers.py | 386 ----------- src/zarr/attrs.py | 67 +- src/zarr/convenience.py | 108 +-- src/zarr/core.py | 172 ++--- src/zarr/creation.py | 54 +- src/zarr/hierarchy.py | 235 +------ src/zarr/meta.py | 283 +------- src/zarr/storage.py | 209 +----- tests/test_attrs.py | 125 ++-- tests/test_convenience.py | 347 ++-------- tests/test_core.py | 666 +----------------- tests/test_creation.py | 215 ++---- tests/test_hierarchy.py | 638 +++--------------- tests/test_meta.py | 142 ---- tests/test_sync.py | 12 +- tests/util.py | 5 - 20 files changed, 423 insertions(+), 4478 deletions(-) delete mode 100644 src/zarr/_storage/v3.py delete mode 100644 src/zarr/_storage/v3_storage_transformers.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index b3c1e05b7e..601b1295ab 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -31,7 +31,6 @@ from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group from zarr.n5 import N5Store, N5FSStore -from zarr._storage.store import v3_api_available from zarr.storage import ( ABSStore, DBMStore, @@ -53,18 +52,3 @@ # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") - -if v3_api_available: - from zarr._storage.v3 import ( - ABSStoreV3, - DBMStoreV3, - KVStoreV3, - DirectoryStoreV3, - LMDBStoreV3, - LRUStoreCacheV3, - MemoryStoreV3, - MongoDBStoreV3, - RedisStoreV3, - SQLiteStoreV3, - ZipStoreV3, - ) diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py index 5fd709d02a..d8e292535c 100644 --- a/src/zarr/_storage/absstore.py +++ b/src/zarr/_storage/absstore.py @@ -1,10 +1,9 @@ """This module contains storage classes related to Azure Blob Storage (ABS)""" -from typing_extensions import deprecated import warnings from numcodecs.compat import ensure_bytes from zarr.util import normalize_storage_path -from zarr._storage.store import _get_metadata_suffix, data_root, meta_root, Store, StoreV3 +from zarr._storage.store import Store __doctest_requires__ = { ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], @@ -223,59 +222,3 @@ def getsize(self, path=None): def clear(self): self.rmdir() - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class ABSStoreV3(ABSStore, StoreV3): - def list(self): - return list(self.keys()) - - def __eq__(self, other): - return ( - isinstance(other, ABSStoreV3) - and self.client == other.client - and self.prefix == other.prefix - ) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def rmdir(self, path=None): - if not path: - # Currently allowing clear to delete everything as in v2 - - # If we disallow an empty path then we will need to modify - # TestABSStoreV3 to have the create_store method use a prefix. 
- ABSStore.rmdir(self, "") - return - - meta_dir = meta_root + path - meta_dir = meta_dir.rstrip("/") - ABSStore.rmdir(self, meta_dir) - - # remove data folder - data_dir = data_root + path - data_dir = data_dir.rstrip("/") - ABSStore.rmdir(self, data_dir) - - # remove metadata files - sfx = _get_metadata_suffix(self) - array_meta_file = meta_dir + ".array" + sfx - if array_meta_file in self: - del self[array_meta_file] - group_meta_file = meta_dir + ".group" + sfx - if group_meta_file in self: - del self[group_meta_file] - - # TODO: adapt the v2 getsize method to work for v3 - # For now, calling the generic keys-based _getsize - def getsize(self, path=None): - from zarr.storage import _getsize # avoid circular import - - return _getsize(self, path) - - -ABSStoreV3.__doc__ = ABSStore.__doc__ diff --git a/src/zarr/_storage/store.py b/src/zarr/_storage/store.py index 44a22ae34e..f10d65be57 100644 --- a/src/zarr/_storage/store.py +++ b/src/zarr/_storage/store.py @@ -1,38 +1,18 @@ -import abc -import os -from collections import defaultdict from collections.abc import MutableMapping -from copy import copy -from string import ascii_letters, digits -from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, List, Mapping, Optional, Sequence, Union -from zarr.meta import Metadata2, Metadata3 +from zarr.meta import Metadata2 from zarr.util import normalize_storage_path from zarr.context import Context -from typing_extensions import deprecated # v2 store keys array_meta_key = ".zarray" group_meta_key = ".zgroup" attrs_key = ".zattrs" -# v3 paths -meta_root = "meta/root/" -data_root = "data/root/" - DEFAULT_ZARR_VERSION = 2 -v3_api_available = os.environ.get("ZARR_V3_EXPERIMENTAL_API", "0").lower() not in ["0", "false"] - - -def assert_zarr_v3_api_available(): - if not v3_api_available: - raise NotImplementedError( - "# V3 reading and writing is experimental! To enable support, set:\n" - "ZARR_V3_EXPERIMENTAL_API=1" - ) # pragma: no cover - class BaseStore(MutableMapping): """Abstract base class for store implementations. @@ -184,377 +164,6 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys(self, path) -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class StoreV3(BaseStore): - _store_version = 3 - _metadata_class = Metadata3 - _valid_key_characters = set(ascii_letters + digits + "/.-_") - - def _valid_key(self, key: str) -> bool: - """ - Verify that a key conforms to the specification. - - A key is any string containing only character in the range a-z, A-Z, - 0-9, or in the set /.-_ it will return True if that's the case, False - otherwise. - """ - if not isinstance(key, str) or not key.isascii(): - return False - if set(key) - self._valid_key_characters: - return False - return True - - def _validate_key(self, key: str): - """ - Verify that a key conforms to the v3 specification. - - A key is any string containing only character in the range a-z, A-Z, - 0-9, or in the set /.-_ it will return True if that's the case, False - otherwise. - - In spec v3, keys can only start with the prefix meta/, data/ or be - exactly zarr.json and should not end with /. This should not be exposed - to the user, and is a store implementation detail, so this method will - raise a ValueError in that case. 
- """ - if not self._valid_key(key): - raise ValueError( - f"Keys must be ascii strings and may only contain the " - f"characters {''.join(sorted(self._valid_key_characters))}" - ) - - if ( - not key.startswith("data/") - and (not key.startswith("meta/")) - and (not key == "zarr.json") - # TODO: Possibly allow key == ".zmetadata" too if we write a - # consolidated metadata spec corresponding to this? - ): - raise ValueError("keys starts with unexpected value: `{}`".format(key)) - - if key.endswith("/"): - raise ValueError("keys may not end in /") - - def list_prefix(self, prefix): - if prefix.startswith("/"): - raise ValueError("prefix must not begin with /") - # TODO: force prefix to end with /? - return [k for k in self.list() if k.startswith(prefix)] - - def erase(self, key): - self.__delitem__(key) - - def erase_prefix(self, prefix): - assert prefix.endswith("/") - - if prefix == "/": - all_keys = self.list() - else: - all_keys = self.list_prefix(prefix) - for key in all_keys: - self.erase(key) - - def list_dir(self, prefix): - """ - TODO: carefully test this with trailing/leading slashes - """ - if prefix: # allow prefix = "" ? - assert prefix.endswith("/") - - all_keys = self.list_prefix(prefix) - len_prefix = len(prefix) - keys = [] - prefixes = [] - for k in all_keys: - trail = k[len_prefix:] - if "/" not in trail: - keys.append(prefix + trail) - else: - prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") - return keys, list(set(prefixes)) - - def list(self): - return list(self.keys()) - - def __contains__(self, key): - return key in self.list() - - @abc.abstractmethod - def __setitem__(self, key, value): - """Set a value.""" - - @abc.abstractmethod - def __getitem__(self, key): - """Get a value.""" - - @abc.abstractmethod - def rmdir(self, path=None): - """Remove a data path and all its subkeys and related metadata. - Expects a path without the data or meta root prefix.""" - - @property - def supports_efficient_get_partial_values(self): - return False - - def get_partial_values( - self, key_ranges: Sequence[Tuple[str, Tuple[int, Optional[int]]]] - ) -> List[Union[bytes, memoryview, bytearray]]: - """Get multiple partial values. - key_ranges can be an iterable of key, range pairs, - where a range specifies two integers range_start and range_length - as a tuple, (range_start, range_length). - range_length may be None to indicate to read until the end. - range_start may be negative to start reading range_start bytes - from the end of the file. - A key may occur multiple times with different ranges. - Inserts None for missing keys into the returned list.""" - results: List[Union[bytes, memoryview, bytearray]] = [None] * len(key_ranges) # type: ignore[list-item] # noqa: E501 - indexed_ranges_by_key: Dict[str, List[Tuple[int, Tuple[int, Optional[int]]]]] = defaultdict( - list - ) - for i, (key, range_) in enumerate(key_ranges): - indexed_ranges_by_key[key].append((i, range_)) - for key, indexed_ranges in indexed_ranges_by_key.items(): - try: - value = self[key] - except KeyError: # pragma: no cover - continue - for i, (range_from, range_length) in indexed_ranges: - if range_length is None: - results[i] = value[range_from:] - else: - results[i] = value[range_from : range_from + range_length] - return results - - def supports_efficient_set_partial_values(self): - return False - - def set_partial_values(self, key_start_values): - """Set multiple partial values. 
- key_start_values can be an iterable of key, start and value triplets - as tuples, (key, start, value), where start defines the offset in bytes. - A key may occur multiple times with different starts and non-overlapping values. - Also, start may only be beyond the current value if other values fill the gap. - start may be negative to start writing start bytes from the current - end of the file, ending the file with the new value.""" - unique_keys = set(next(zip(*key_start_values))) - values = {} - for key in unique_keys: - old_value = self.get(key) - values[key] = None if old_value is None else bytearray(old_value) - for key, start, value in key_start_values: - if values[key] is None: - assert start == 0 - values[key] = value - else: - if start > len(values[key]): # pragma: no cover - raise ValueError( - f"Cannot set value at start {start}, " - + f"since it is beyond the data at key {key}, " - + f"having length {len(values[key])}." - ) - if start < 0: - values[key][start:] = value - else: - values[key][start : start + len(value)] = value - for key, value in values.items(): - self[key] = value - - def clear(self): - """Remove all items from store.""" - self.erase_prefix("/") - - def __eq__(self, other): - return NotImplemented - - @staticmethod - def _ensure_store(store): - """ - We want to make sure internally that zarr stores are always a class - with a specific interface derived from ``Store``, which is slightly - different than ``MutableMapping``. - - We'll do this conversion in a few places automatically - """ - from zarr._storage.v3 import KVStoreV3 # avoid circular import - - if store is None: - return None - elif isinstance(store, StoreV3): - return store - elif isinstance(store, Store): - raise ValueError(f"cannot initialize a v3 store with a v{store._store_version} store") - elif isinstance(store, MutableMapping): - return KVStoreV3(store) - else: - for attr in [ - "keys", - "values", - "get", - "__setitem__", - "__getitem__", - "__delitem__", - "__contains__", - ]: - if not hasattr(store, attr): - break - else: - return KVStoreV3(store) - - raise ValueError( - "v3 stores must be subclasses of StoreV3, " - "if your store exposes the MutableMapping interface wrap it in " - f"Zarr.storage.KVStoreV3. Got {store}" - ) - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class StorageTransformer(MutableMapping, abc.ABC): - """Base class for storage transformers. The methods simply pass on the data as-is - and should be overwritten by sub-classes.""" - - _store_version = 3 - _metadata_class = Metadata3 - - def __init__(self, _type) -> None: - if _type not in self.valid_types: # pragma: no cover - raise ValueError( - f"Storage transformer cannot be initialized with type {_type}, " - + f"must be one of {list(self.valid_types)}." - ) - self.type = _type - self._inner_store = None - - def _copy_for_array(self, array, inner_store): - transformer_copy = copy(self) - transformer_copy._inner_store = inner_store - return transformer_copy - - @abc.abstractproperty - def extension_uri(self): - pass # pragma: no cover - - @abc.abstractproperty - def valid_types(self): - pass # pragma: no cover - - def get_config(self): - """Return a dictionary holding configuration parameters for this - storage transformer. All values must be compatible with JSON encoding.""" - # Override in sub-class if need special encoding of config values. - # By default, assume all non-private members are configuration - # parameters except for type . 
- return {k: v for k, v in self.__dict__.items() if not k.startswith("_") and k != "type"} - - @classmethod - def from_config(cls, _type, config): - """Instantiate storage transformer from a configuration object.""" - # override in sub-class if need special decoding of config values - - # by default, assume constructor accepts configuration parameters as - # keyword arguments without any special decoding - return cls(_type, **config) - - @property - def inner_store(self) -> Union["StorageTransformer", StoreV3]: - assert ( - self._inner_store is not None - ), "inner_store is not initialized, first get a copy via _copy_for_array." - return self._inner_store - - # The following implementations are usually fine to keep as-is: - - def __eq__(self, other): - return ( - type(self) == type(other) - and self._inner_store == other._inner_store - and self.get_config() == other.get_config() - ) - - def erase(self, key): - self.__delitem__(key) - - def list(self): - return list(self.keys()) - - def list_dir(self, prefix): - return StoreV3.list_dir(self, prefix) - - def is_readable(self): - return self.inner_store.is_readable() - - def is_writeable(self): - return self.inner_store.is_writeable() - - def is_listable(self): - return self.inner_store.is_listable() - - def is_erasable(self): - return self.inner_store.is_erasable() - - def clear(self): - return self.inner_store.clear() - - def __enter__(self): - return self.inner_store.__enter__() - - def __exit__(self, exc_type, exc_value, traceback): - return self.inner_store.__exit__(exc_type, exc_value, traceback) - - def close(self) -> None: - return self.inner_store.close() - - # The following implementations might need to be re-implemented - # by subclasses implementing storage transformers: - - def rename(self, src_path: str, dst_path: str) -> None: - return self.inner_store.rename(src_path, dst_path) - - def list_prefix(self, prefix): - return self.inner_store.list_prefix(prefix) - - def erase_prefix(self, prefix): - return self.inner_store.erase_prefix(prefix) - - def rmdir(self, path=None): - return self.inner_store.rmdir(path) - - def __contains__(self, key): - return self.inner_store.__contains__(key) - - def __setitem__(self, key, value): - return self.inner_store.__setitem__(key, value) - - def __getitem__(self, key): - return self.inner_store.__getitem__(key) - - def __delitem__(self, key): - return self.inner_store.__delitem__(key) - - def __iter__(self): - return self.inner_store.__iter__() - - def __len__(self): - return self.inner_store.__len__() - - @property - def supports_efficient_get_partial_values(self): - return self.inner_store.supports_efficient_get_partial_values - - def get_partial_values(self, key_ranges): - return self.inner_store.get_partial_values(key_ranges) - - def supports_efficient_set_partial_values(self): - return self.inner_store.supports_efficient_set_partial_values() - - def set_partial_values(self, key_start_values): - return self.inner_store.set_partial_values(key_start_values) - - # allow MutableMapping for backwards compatibility StoreLike = Union[BaseStore, MutableMapping] @@ -568,49 +177,6 @@ def _path_to_prefix(path: Optional[str]) -> str: return prefix -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -def _get_hierarchy_metadata(store: StoreV3) -> Mapping[str, Any]: - version = getattr(store, "_store_version", 2) - if version < 3: - raise ValueError("zarr.json hierarchy metadata not stored for " f"zarr v{version} stores") - if "zarr.json" not in 
store: - raise ValueError("zarr.json metadata not found in store") - return store._metadata_class.decode_hierarchy_metadata(store["zarr.json"]) - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -def _get_metadata_suffix(store: StoreV3) -> str: - if "zarr.json" in store: - return _get_hierarchy_metadata(store)["metadata_key_suffix"] - return ".json" - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -def _rename_metadata_v3(store: StoreV3, src_path: str, dst_path: str) -> bool: - """Rename source or group metadata file associated with src_path.""" - any_renamed = False - sfx = _get_metadata_suffix(store) - src_path = src_path.rstrip("/") - dst_path = dst_path.rstrip("/") - _src_array_json = meta_root + src_path + ".array" + sfx - if _src_array_json in store: - new_key = meta_root + dst_path + ".array" + sfx - store[new_key] = store.pop(_src_array_json) - any_renamed = True - _src_group_json = meta_root + src_path + ".group" + sfx - if _src_group_json in store: - new_key = meta_root + dst_path + ".group" + sfx - store[new_key] = store.pop(_src_group_json) - any_renamed = True - return any_renamed - - def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: # assume path already normalized src_prefix = _path_to_prefix(src_path) @@ -622,19 +188,7 @@ def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: new_key = dst_prefix + key.lstrip(src_prefix) store[new_key] = store.pop(key) else: - any_renamed = False - for root_prefix in [meta_root, data_root]: - _src_prefix = root_prefix + src_prefix - _dst_prefix = root_prefix + dst_prefix - for key in store.list_prefix(_src_prefix): # type: ignore - new_key = _dst_prefix + key[len(_src_prefix) :] - store[new_key] = store.pop(key) - any_renamed = True - any_meta_renamed = _rename_metadata_v3(store, src_path, dst_path) # type: ignore - any_renamed = any_meta_renamed or any_renamed - - if not any_renamed: - raise ValueError(f"no item {src_path} found to rename") + raise NotImplementedError("This function only supports Zarr version 2.") def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: @@ -645,29 +199,6 @@ def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: del store[key] -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: - meta_dir = meta_root + path - meta_dir = meta_dir.rstrip("/") - _rmdir_from_keys(store, meta_dir) - - # remove data folder - data_dir = data_root + path - data_dir = data_dir.rstrip("/") - _rmdir_from_keys(store, data_dir) - - # remove metadata files - sfx = _get_metadata_suffix(store) - array_meta_file = meta_dir + ".array" + sfx - if array_meta_file in store: - store.erase(array_meta_file) # type: ignore - group_meta_file = meta_dir + ".group" + sfx - if group_meta_file in store: - store.erase(group_meta_file) # type: ignore - - def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str]: # assume path already normalized prefix = _path_to_prefix(path) @@ -682,11 +213,7 @@ def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: if getattr(store, "_store_version", 2) == 3: - sfx = _get_metadata_suffix(store) # type: ignore - if prefix: - key = meta_root + prefix.rstrip("/") + ".array" + 
sfx - else: - key = meta_root[:-1] + ".array" + sfx + raise NotImplementedError("This function only supports Zarr version 2.") else: key = prefix + array_meta_key return key @@ -694,11 +221,7 @@ def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: if getattr(store, "_store_version", 2) == 3: - sfx = _get_metadata_suffix(store) # type: ignore - if prefix: - key = meta_root + prefix.rstrip("/") + ".group" + sfx - else: - key = meta_root[:-1] + ".group" + sfx + raise NotImplementedError("This function only supports Zarr version 2.") else: key = prefix + group_meta_key return key @@ -706,12 +229,7 @@ def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: if getattr(store, "_store_version", 2) == 3: - # for v3, attributes are stored in the array metadata - sfx = _get_metadata_suffix(store) # type: ignore - if prefix: - key = meta_root + prefix.rstrip("/") + ".array" + sfx - else: - key = meta_root[:-1] + ".array" + sfx + raise NotImplementedError("This function only supports Zarr version 2.") else: key = prefix + attrs_key return key diff --git a/src/zarr/_storage/v3.py b/src/zarr/_storage/v3.py deleted file mode 100644 index a9dbbee743..0000000000 --- a/src/zarr/_storage/v3.py +++ /dev/null @@ -1,668 +0,0 @@ -import os -import shutil -from collections import OrderedDict -from collections.abc import MutableMapping -from threading import Lock -from typing import Union, Dict, Any -from typing_extensions import deprecated - -from zarr.errors import ( - MetadataError, - ReadOnlyError, -) -from zarr.util import buffer_size, json_loads, normalize_storage_path - -from zarr._storage.absstore import ABSStoreV3 # noqa: F401 -from zarr._storage.store import ( # noqa: F401 - _get_hierarchy_metadata, - _get_metadata_suffix, - _listdir_from_keys, - _rename_from_keys, - _rename_metadata_v3, - _rmdir_from_keys, - _rmdir_from_keys_v3, - _path_to_prefix, - _prefix_to_array_key, - _prefix_to_group_key, - array_meta_key, - attrs_key, - data_root, - group_meta_key, - meta_root, - BaseStore, - Store, - StoreV3, -) -from zarr.storage import ( - DBMStore, - ConsolidatedMetadataStore, - DirectoryStore, - FSStore, - KVStore, - LMDBStore, - LRUStoreCache, - MemoryStore, - MongoDBStore, - RedisStore, - SQLiteStore, - ZipStore, - _getsize, -) - -__doctest_requires__ = { - ("RedisStore", "RedisStore.*"): ["redis"], - ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], - ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], -} - - -try: - # noinspection PyUnresolvedReferences - from zarr.codecs import Blosc - - default_compressor = Blosc() -except ImportError: # pragma: no cover - from zarr.codecs import Zlib - - default_compressor = Zlib() - - -Path = Union[str, bytes, None] -# allow MutableMapping for backwards compatibility -StoreLike = Union[BaseStore, MutableMapping] - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class RmdirV3: - """Mixin class that can be used to ensure override of any existing v2 rmdir class.""" - - def rmdir(self, path: str = "") -> None: - path = normalize_storage_path(path) - _rmdir_from_keys_v3(self, path) # type: ignore - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class KVStoreV3(RmdirV3, KVStore, StoreV3): - def list(self): - return list(self._mutable_mapping.keys()) - - def __setitem__(self, key, value): - 
self._validate_key(key) - super().__setitem__(key, value) - - def __eq__(self, other): - return isinstance(other, KVStoreV3) and self._mutable_mapping == other._mutable_mapping - - -KVStoreV3.__doc__ = KVStore.__doc__ - - -def _get_files_and_dirs_from_path(store, path): - path = normalize_storage_path(path) - - files = [] - # add array metadata file if present - array_key = _prefix_to_array_key(store, path) - if array_key in store: - files.append(os.path.join(store.path, array_key)) - - # add group metadata file if present - group_key = _prefix_to_group_key(store, path) - if group_key in store: - files.append(os.path.join(store.path, group_key)) - - dirs = [] - # add array and group folders if present - for d in [data_root + path, meta_root + path]: - dir_path = os.path.join(store.path, d) - if os.path.exists(dir_path): - dirs.append(dir_path) - return files, dirs - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class FSStoreV3(FSStore, StoreV3): - # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) - _META_KEYS = () - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def _default_key_separator(self): - if self.key_separator is None: - self.key_separator = "/" - - def list(self): - return list(self.keys()) - - def _normalize_key(self, key): - key = normalize_storage_path(key).lstrip("/") - return key.lower() if self.normalize_keys else key - - def getsize(self, path=None): - size = 0 - if path is None or path == "": - # size of both the data and meta subdirs - dirs = [] - for d in ["data/root", "meta/root"]: - dir_path = os.path.join(self.path, d) - if os.path.exists(dir_path): - dirs.append(dir_path) - elif path in self: - # access individual element by full path - return buffer_size(self[path]) - else: - files, dirs = _get_files_and_dirs_from_path(self, path) - for file in files: - size += os.path.getsize(file) - for d in dirs: - size += self.fs.du(d, total=True, maxdepth=None) - return size - - def setitems(self, values): - if self.mode == "r": - raise ReadOnlyError() - values = {self._normalize_key(key): val for key, val in values.items()} - - # initialize the /data/root/... folder corresponding to the array! - # Note: tests.test_core_v3.TestArrayWithFSStoreV3PartialRead fails - # without this explicit creation of directories - subdirectories = set(os.path.dirname(v) for v in values.keys()) - for subdirectory in subdirectories: - data_dir = os.path.join(self.path, subdirectory) - if not self.fs.exists(data_dir): - self.fs.mkdir(data_dir) - - self.map.setitems(values) - - def rmdir(self, path=None): - if self.mode == "r": - raise ReadOnlyError() - if path: - for base in [meta_root, data_root]: - store_path = self.dir_path(base + path) - if self.fs.isdir(store_path): - self.fs.rm(store_path, recursive=True) - - # remove any associated metadata files - sfx = _get_metadata_suffix(self) - meta_dir = (meta_root + path).rstrip("/") - array_meta_file = meta_dir + ".array" + sfx - self.pop(array_meta_file, None) - group_meta_file = meta_dir + ".group" + sfx - self.pop(group_meta_file, None) - else: - store_path = self.dir_path(path) - if self.fs.isdir(store_path): - self.fs.rm(store_path, recursive=True) - - @property - def supports_efficient_get_partial_values(self): - return True - - def get_partial_values(self, key_ranges): - """Get multiple partial values. 
- key_ranges can be an iterable of key, range pairs, - where a range specifies two integers range_start and range_length - as a tuple, (range_start, range_length). - range_length may be None to indicate to read until the end. - range_start may be negative to start reading range_start bytes - from the end of the file. - A key may occur multiple times with different ranges. - Inserts None for missing keys into the returned list.""" - results = [] - for key, (range_start, range_length) in key_ranges: - key = self._normalize_key(key) - path = self.dir_path(key) - try: - if range_start is None or range_length is None: - end = None - else: - end = range_start + range_length - result = self.fs.cat_file(path, start=range_start, end=end) - except self.map.missing_exceptions: - result = None - results.append(result) - return results - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class MemoryStoreV3(MemoryStore, StoreV3): - def __init__(self, root=None, cls=dict, dimension_separator=None): - if root is None: - self.root = cls() - else: - self.root = root - self.cls = cls - self.write_mutex = Lock() - self._dimension_separator = dimension_separator # TODO: modify for v3? - - def __eq__(self, other): - return ( - isinstance(other, MemoryStoreV3) and self.root == other.root and self.cls == other.cls - ) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def list(self): - return list(self.keys()) - - def getsize(self, path: Path = None): - return _getsize(self, path) - - def rename(self, src_path: Path, dst_path: Path): - src_path = normalize_storage_path(src_path) - dst_path = normalize_storage_path(dst_path) - - any_renamed = False - for base in [meta_root, data_root]: - if self.list_prefix(base + src_path): - src_parent, src_key = self._get_parent(base + src_path) - dst_parent, dst_key = self._require_parent(base + dst_path) - - if src_key in src_parent: - dst_parent[dst_key] = src_parent.pop(src_key) - - if base == meta_root: - # check for and move corresponding metadata - sfx = _get_metadata_suffix(self) - src_meta = src_key + ".array" + sfx - if src_meta in src_parent: - dst_meta = dst_key + ".array" + sfx - dst_parent[dst_meta] = src_parent.pop(src_meta) - src_meta = src_key + ".group" + sfx - if src_meta in src_parent: - dst_meta = dst_key + ".group" + sfx - dst_parent[dst_meta] = src_parent.pop(src_meta) - any_renamed = True - any_renamed = _rename_metadata_v3(self, src_path, dst_path) or any_renamed - if not any_renamed: - raise ValueError(f"no item {src_path} found to rename") - - def rmdir(self, path: Path = None): - path = normalize_storage_path(path) - if path: - for base in [meta_root, data_root]: - try: - parent, key = self._get_parent(base + path) - value = parent[key] - except KeyError: - continue - else: - if isinstance(value, self.cls): - del parent[key] - - # remove any associated metadata files - sfx = _get_metadata_suffix(self) - meta_dir = (meta_root + path).rstrip("/") - array_meta_file = meta_dir + ".array" + sfx - self.pop(array_meta_file, None) - group_meta_file = meta_dir + ".group" + sfx - self.pop(group_meta_file, None) - else: - # clear out root - self.root = self.cls() - - -MemoryStoreV3.__doc__ = MemoryStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class DirectoryStoreV3(DirectoryStore, StoreV3): - def list(self): - return list(self.keys()) - - def __eq__(self, other): - 
return isinstance(other, DirectoryStoreV3) and self.path == other.path - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def getsize(self, path: Path = None): - return _getsize(self, path) - - def rename(self, src_path, dst_path, metadata_key_suffix=".json"): - store_src_path = normalize_storage_path(src_path) - store_dst_path = normalize_storage_path(dst_path) - - dir_path = self.path - any_existed = False - for root_prefix in ["meta", "data"]: - src_path = os.path.join(dir_path, root_prefix, "root", store_src_path) - if os.path.exists(src_path): - any_existed = True - dst_path = os.path.join(dir_path, root_prefix, "root", store_dst_path) - os.renames(src_path, dst_path) - - for suffix in [".array" + metadata_key_suffix, ".group" + metadata_key_suffix]: - src_meta = os.path.join(dir_path, "meta", "root", store_src_path + suffix) - if os.path.exists(src_meta): - any_existed = True - dst_meta = os.path.join(dir_path, "meta", "root", store_dst_path + suffix) - dst_dir = os.path.dirname(dst_meta) - if not os.path.exists(dst_dir): - os.makedirs(dst_dir) - os.rename(src_meta, dst_meta) - if not any_existed: - raise FileNotFoundError("nothing found at src_path") - - def rmdir(self, path=None): - store_path = normalize_storage_path(path) - dir_path = self.path - if store_path: - for base in [meta_root, data_root]: - dir_path = os.path.join(dir_path, base + store_path) - if os.path.isdir(dir_path): - shutil.rmtree(dir_path) - - # remove any associated metadata files - sfx = _get_metadata_suffix(self) - meta_dir = (meta_root + path).rstrip("/") - array_meta_file = meta_dir + ".array" + sfx - self.pop(array_meta_file, None) - group_meta_file = meta_dir + ".group" + sfx - self.pop(group_meta_file, None) - - elif os.path.isdir(dir_path): - shutil.rmtree(dir_path) - - -DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class ZipStoreV3(ZipStore, StoreV3): - def list(self): - return list(self.keys()) - - def __eq__(self, other): - return ( - isinstance(other, ZipStore) - and self.path == other.path - and self.compression == other.compression - and self.allowZip64 == other.allowZip64 - ) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def getsize(self, path=None): - path = normalize_storage_path(path) - with self.mutex: - children = self.list_prefix(data_root + path) - children += self.list_prefix(meta_root + path) - print(f"path={path}, children={children}") - if children: - size = 0 - for name in children: - info = self.zf.getinfo(name) - size += info.compress_size - return size - elif path in self: - info = self.zf.getinfo(path) - return info.compress_size - else: - return 0 - - -ZipStoreV3.__doc__ = ZipStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class RedisStoreV3(RmdirV3, RedisStore, StoreV3): - def list(self): - return list(self.keys()) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - -RedisStoreV3.__doc__ = RedisStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class MongoDBStoreV3(RmdirV3, MongoDBStore, StoreV3): - def list(self): - return list(self.keys()) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - 
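The store subclasses removed in this stretch of the diff (KVStoreV3, RedisStoreV3, MongoDBStoreV3, DBMStoreV3, LMDBStoreV3, and so on) were thin wrappers that mixed a v2 store with the v3 base classes, differing mainly in list() and in key validation on write. A minimal, self-contained sketch of that shared shape, using hypothetical stand-in classes since the real StoreV3/RmdirV3 bases are deleted by this commit:

class _ExampleV2Store(dict):
    # stand-in for a v2 store such as KVStore, RedisStore, or DBMStore
    pass


class _ExampleV3Mixin:
    # stand-in for the key validation the removed StoreV3 base performed
    def _validate_key(self, key):
        if not isinstance(key, str) or not key.isascii():
            raise ValueError(f"invalid v3 key: {key!r}")


class ExampleStoreV3(_ExampleV3Mixin, _ExampleV2Store):
    # the pattern repeated by each *StoreV3 wrapper removed in this file
    def list(self):
        return list(self.keys())

    def __setitem__(self, key, value):
        self._validate_key(key)  # validate before delegating to the v2 store
        super().__setitem__(key, value)


store = ExampleStoreV3()
store["meta/root/demo.array.json"] = b"{}"
assert store.list() == ["meta/root/demo.array.json"]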
-MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class DBMStoreV3(RmdirV3, DBMStore, StoreV3): - def list(self): - return list(self.keys()) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - -DBMStoreV3.__doc__ = DBMStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class LMDBStoreV3(RmdirV3, LMDBStore, StoreV3): - def list(self): - return list(self.keys()) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - -LMDBStoreV3.__doc__ = LMDBStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class SQLiteStoreV3(SQLiteStore, StoreV3): - def list(self): - return list(self.keys()) - - def getsize(self, path=None): - # TODO: why does the query below not work in this case? - # For now fall back to the default _getsize implementation - # size = 0 - # for _path in [data_root + path, meta_root + path]: - # c = self.cursor.execute( - # ''' - # SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - # WHERE k LIKE (? || "%") AND - # 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") - # ''', - # (_path, _path) - # ) - # for item_size, in c: - # size += item_size - # return size - - # fallback to default implementation for now - return _getsize(self, path) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - def rmdir(self, path=None): - path = normalize_storage_path(path) - if path: - for base in [meta_root, data_root]: - with self.lock: - self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (base + path,)) - # remove any associated metadata files - sfx = _get_metadata_suffix(self) - meta_dir = (meta_root + path).rstrip("/") - array_meta_file = meta_dir + ".array" + sfx - self.pop(array_meta_file, None) - group_meta_file = meta_dir + ".group" + sfx - self.pop(group_meta_file, None) - else: - self.clear() - - -SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class LRUStoreCacheV3(RmdirV3, LRUStoreCache, StoreV3): - def __init__(self, store, max_size: int): - self._store = StoreV3._ensure_store(store) - self._max_size = max_size - self._current_size = 0 - self._keys_cache = None - self._contains_cache = {} - self._listdir_cache: Dict[Path, Any] = dict() - self._values_cache: Dict[Path, Any] = OrderedDict() - self._mutex = Lock() - self.hits = self.misses = 0 - - def list(self): - return list(self.keys()) - - def __setitem__(self, key, value): - self._validate_key(key) - super().__setitem__(key, value) - - -LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class ConsolidatedMetadataStoreV3(ConsolidatedMetadataStore, StoreV3): - """A layer over other storage, where the metadata has been consolidated into - a single key. - - The purpose of this class, is to be able to get all of the metadata for - a given array in a single read operation from the underlying storage. - See :func:`zarr.convenience.consolidate_metadata` for how to create this - single metadata key. 
- - This class loads from the one key, and stores the data in a dict, so that - accessing the keys no longer requires operations on the backend store. - - This class is read-only, and attempts to change the array metadata will - fail, but changing the data is possible. If the backend storage is changed - directly, then the metadata stored here could become obsolete, and - :func:`zarr.convenience.consolidate_metadata` should be called again and the class - re-invoked. The use case is for write once, read many times. - - .. note:: This is an experimental feature. - - Parameters - ---------- - store: Store - Containing the zarr array. - metadata_key: str - The target in the store where all of the metadata are stored. We - assume JSON encoding. - - See Also - -------- - zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated - - """ - - def __init__(self, store: StoreLike, metadata_key=meta_root + "consolidated/.zmetadata"): - self.store = StoreV3._ensure_store(store) - - # retrieve consolidated metadata - meta = json_loads(self.store[metadata_key]) - - # check format of consolidated metadata - consolidated_format = meta.get("zarr_consolidated_format", None) - if consolidated_format != 1: - raise MetadataError( - "unsupported zarr consolidated metadata format: %s" % consolidated_format - ) - - # decode metadata - self.meta_store: Store = KVStoreV3(meta["metadata"]) - - def rmdir(self, key): - raise ReadOnlyError() - - -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -def _normalize_store_arg_v3(store: Any, storage_options=None, mode="r") -> BaseStore: - # default to v2 store for backward compatibility - zarr_version = getattr(store, "_store_version", 3) - if zarr_version != 3: - raise ValueError("store must be a version 3 store") - if store is None: - store = KVStoreV3(dict()) - # add default zarr.json metadata - store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) - return store - if isinstance(store, os.PathLike): - store = os.fspath(store) - if FSStore._fsspec_installed(): - import fsspec - - if isinstance(store, fsspec.FSMap): - return FSStoreV3( - store.root, - fs=store.fs, - mode=mode, - check=store.check, - create=store.create, - missing_exceptions=store.missing_exceptions, - **(storage_options or {}), - ) - if isinstance(store, str): - if "://" in store or "::" in store: - store = FSStoreV3(store, mode=mode, **(storage_options or {})) - elif storage_options: - raise ValueError("storage_options passed with non-fsspec path") - elif store.endswith(".zip"): - store = ZipStoreV3(store, mode=mode) - elif store.endswith(".n5"): - raise NotImplementedError("N5Store not yet implemented for V3") - # return N5StoreV3(store) - else: - store = DirectoryStoreV3(store) - else: - store = StoreV3._ensure_store(store) - - if "zarr.json" not in store: - # add default zarr.json metadata - store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) - return store diff --git a/src/zarr/_storage/v3_storage_transformers.py b/src/zarr/_storage/v3_storage_transformers.py deleted file mode 100644 index dd49b8de35..0000000000 --- a/src/zarr/_storage/v3_storage_transformers.py +++ /dev/null @@ -1,386 +0,0 @@ -import functools -import itertools -import os -from typing import NamedTuple, Tuple, Optional, Union, Iterator -from typing_extensions import deprecated - -from numcodecs.compat import ensure_bytes -import numpy as np - -from zarr._storage.store import StorageTransformer, StoreV3, 
_rmdir_from_keys_v3
-from zarr.util import normalize_storage_path
-
-
-MAX_UINT_64 = 2**64 - 1
-
-
-v3_sharding_available = os.environ.get("ZARR_V3_SHARDING", "0").lower() not in ["0", "false"]
-
-
-def assert_zarr_v3_sharding_available():
-    if not v3_sharding_available:
-        raise NotImplementedError(
-            "Using V3 sharding is experimental and not yet finalized! To enable support, set:\n"
-            "ZARR_V3_SHARDING=1"
-        )  # pragma: no cover
-
-
-class _ShardIndex(NamedTuple):
-    store: "ShardingStorageTransformer"
-    # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2)
-    offsets_and_lengths: np.ndarray
-
-    def __localize_chunk__(self, chunk: Tuple[int, ...]) -> Tuple[int, ...]:
-        return tuple(
-            chunk_i % shard_i for chunk_i, shard_i in zip(chunk, self.store.chunks_per_shard)
-        )
-
-    def is_all_empty(self) -> bool:
-        return np.array_equiv(self.offsets_and_lengths, MAX_UINT_64)
-
-    def get_chunk_slice(self, chunk: Tuple[int, ...]) -> Optional[slice]:
-        localized_chunk = self.__localize_chunk__(chunk)
-        chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk]
-        if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64):
-            return None
-        else:
-            return slice(int(chunk_start), int(chunk_start + chunk_len))
-
-    def set_chunk_slice(self, chunk: Tuple[int, ...], chunk_slice: Optional[slice]) -> None:
-        localized_chunk = self.__localize_chunk__(chunk)
-        if chunk_slice is None:
-            self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64)
-        else:
-            self.offsets_and_lengths[localized_chunk] = (
-                chunk_slice.start,
-                chunk_slice.stop - chunk_slice.start,
-            )
-
-    def to_bytes(self) -> bytes:
-        return self.offsets_and_lengths.tobytes(order="C")
-
-    @classmethod
-    def from_bytes(
-        cls, buffer: Union[bytes, bytearray], store: "ShardingStorageTransformer"
-    ) -> "_ShardIndex":
-        try:
-            return cls(
-                store=store,
-                offsets_and_lengths=np.frombuffer(bytearray(buffer), dtype="<u8").reshape(
-                    *store.chunks_per_shard, 2, order="C"
-                ),
-            )
-        except ValueError as e:  # pragma: no cover
-            raise RuntimeError from e
-
-    @classmethod
-    def create_empty(cls, store: "ShardingStorageTransformer"):
-        # reserving 2*64bit per chunk for offset and length:
-        return cls(
-            store=store,
-            offsets_and_lengths=np.full((*store.chunks_per_shard, 2), MAX_UINT_64, dtype="uint64"),
-        )
-
-
-class DummyStorageTransfomer(StorageTransformer):
-    TEST_CONSTANT = "test1234"
-
-    extension_uri = "https://purl.org/zarr/spec/storage_transformers/dummy/1.0"
-    valid_types = ["dummy_type"]
-
-    def __init__(self, _type, test_value) -> None:
-        super().__init__(_type)
-        assert test_value == self.TEST_CONSTANT
-        self.test_value = test_value
-
-
-@deprecated(
-    "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0"
-)
-class ShardingStorageTransformer(StorageTransformer):  # lgtm[py/missing-equals]
-    """Implements sharding as a storage transformer, as described in the spec:
-    https://zarr-specs.readthedocs.io/en/latest/extensions/storage-transformers/sharding/v1.0.html
-    https://purl.org/zarr/spec/storage_transformers/sharding/1.0
-    """
-
-    extension_uri = "https://purl.org/zarr/spec/storage_transformers/sharding/1.0"
-    valid_types = ["indexed"]
-
-    def __init__(self, _type, chunks_per_shard) -> None:
-        assert_zarr_v3_sharding_available()
-        super().__init__(_type)
-        if isinstance(chunks_per_shard, int):
-            chunks_per_shard = (chunks_per_shard,)
-        else:
-            chunks_per_shard = tuple(int(i) for i in chunks_per_shard)
-        if chunks_per_shard == ():
-            chunks_per_shard = (1,)
-        self.chunks_per_shard = chunks_per_shard
-        self._num_chunks_per_shard = functools.reduce(lambda x, y: x * y, chunks_per_shard, 1)
-        self._dimension_separator = None
-        self._data_key_prefix = None
-
-    def _copy_for_array(self, array, inner_store):
-        transformer_copy = super()._copy_for_array(array, inner_store)
-        transformer_copy._dimension_separator = array._dimension_separator
-        transformer_copy._data_key_prefix = array._data_key_prefix
-        if len(array._shape) > len(self.chunks_per_shard):
-            # The array shape might be longer when initialized with subdtypes.
- # subdtypes dimensions come last, therefore padding chunks_per_shard - # with ones, effectively disabling sharding on the unlisted dimensions. - transformer_copy.chunks_per_shard += (1,) * ( - len(array._shape) - len(self.chunks_per_shard) - ) - return transformer_copy - - @property - def dimension_separator(self) -> str: - assert ( - self._dimension_separator is not None - ), "dimension_separator is not initialized, first get a copy via _copy_for_array." - return self._dimension_separator - - def _is_data_key(self, key: str) -> bool: - assert ( - self._data_key_prefix is not None - ), "data_key_prefix is not initialized, first get a copy via _copy_for_array." - return key.startswith(self._data_key_prefix) - - def _key_to_shard(self, chunk_key: str) -> Tuple[str, Tuple[int, ...]]: - prefix, _, chunk_string = chunk_key.rpartition("c") - chunk_subkeys = ( - tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) - ) - shard_key_tuple = ( - subkey // shard_i for subkey, shard_i in zip(chunk_subkeys, self.chunks_per_shard) - ) - shard_key = prefix + "c" + self.dimension_separator.join(map(str, shard_key_tuple)) - return shard_key, chunk_subkeys - - def _get_index_from_store(self, shard_key: str) -> _ShardIndex: - # At the end of each shard 2*64bit per chunk for offset and length define the index: - index_bytes = self.inner_store.get_partial_values( - [(shard_key, (-16 * self._num_chunks_per_shard, None))] - )[0] - if index_bytes is None: - raise KeyError(shard_key) - return _ShardIndex.from_bytes( - index_bytes, - self, - ) - - def _get_index_from_buffer(self, buffer: Union[bytes, bytearray]) -> _ShardIndex: - # At the end of each shard 2*64bit per chunk for offset and length define the index: - return _ShardIndex.from_bytes(buffer[-16 * self._num_chunks_per_shard :], self) - - def _get_chunks_in_shard(self, shard_key: str) -> Iterator[Tuple[int, ...]]: - _, _, chunk_string = shard_key.rpartition("c") - shard_key_tuple = ( - tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) - ) - for chunk_offset in itertools.product(*(range(i) for i in self.chunks_per_shard)): - yield tuple( - shard_key_i * shards_i + offset_i - for shard_key_i, offset_i, shards_i in zip( - shard_key_tuple, chunk_offset, self.chunks_per_shard - ) - ) - - def __getitem__(self, key): - if self._is_data_key(key): - if self.supports_efficient_get_partial_values: - # Use the partial implementation, which fetches the index separately - value = self.get_partial_values([(key, (0, None))])[0] - if value is None: - raise KeyError(key) - else: - return value - shard_key, chunk_subkey = self._key_to_shard(key) - try: - full_shard_value = self.inner_store[shard_key] - except KeyError: - raise KeyError(key) - index = self._get_index_from_buffer(full_shard_value) - chunk_slice = index.get_chunk_slice(chunk_subkey) - if chunk_slice is not None: - return full_shard_value[chunk_slice] - else: - raise KeyError(key) - else: - return self.inner_store.__getitem__(key) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - if self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - chunks_to_read = set(self._get_chunks_in_shard(shard_key)) - chunks_to_read.remove(chunk_subkey) - new_content = {chunk_subkey: value} - try: - if self.supports_efficient_get_partial_values: - index = self._get_index_from_store(shard_key) - full_shard_value = None - else: - full_shard_value = self.inner_store[shard_key] - index = 
self._get_index_from_buffer(full_shard_value) - except KeyError: - index = _ShardIndex.create_empty(self) - else: - chunk_slices = [ - (chunk_to_read, index.get_chunk_slice(chunk_to_read)) - for chunk_to_read in chunks_to_read - ] - valid_chunk_slices = [ - (chunk_to_read, chunk_slice) - for chunk_to_read, chunk_slice in chunk_slices - if chunk_slice is not None - ] - # use get_partial_values if less than half of the available chunks must be read: - # (This can be changed when set_partial_values can be used efficiently.) - use_partial_get = ( - self.supports_efficient_get_partial_values - and len(valid_chunk_slices) < len(chunk_slices) / 2 - ) - - if use_partial_get: - chunk_values = self.inner_store.get_partial_values( - [ - ( - shard_key, - ( - chunk_slice.start, - chunk_slice.stop - chunk_slice.start, - ), - ) - for _, chunk_slice in valid_chunk_slices - ] - ) - for chunk_value, (chunk_to_read, _) in zip(chunk_values, valid_chunk_slices): - new_content[chunk_to_read] = chunk_value - else: - if full_shard_value is None: - full_shard_value = self.inner_store[shard_key] - for chunk_to_read, chunk_slice in valid_chunk_slices: - if chunk_slice is not None: - new_content[chunk_to_read] = full_shard_value[chunk_slice] - - shard_content = b"" - for chunk_subkey, chunk_content in new_content.items(): - chunk_slice = slice(len(shard_content), len(shard_content) + len(chunk_content)) - index.set_chunk_slice(chunk_subkey, chunk_slice) - shard_content += chunk_content - # Appending the index at the end of the shard: - shard_content += index.to_bytes() - self.inner_store[shard_key] = shard_content - else: # pragma: no cover - self.inner_store[key] = value - - def __delitem__(self, key): - if self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - try: - index = self._get_index_from_store(shard_key) - except KeyError: - raise KeyError(key) - - index.set_chunk_slice(chunk_subkey, None) - - if index.is_all_empty(): - del self.inner_store[shard_key] - else: - index_bytes = index.to_bytes() - self.inner_store.set_partial_values([(shard_key, -len(index_bytes), index_bytes)]) - else: # pragma: no cover - del self.inner_store[key] - - def _shard_key_to_original_keys(self, key: str) -> Iterator[str]: - if self._is_data_key(key): - index = self._get_index_from_store(key) - prefix, _, _ = key.rpartition("c") - for chunk_tuple in self._get_chunks_in_shard(key): - if index.get_chunk_slice(chunk_tuple) is not None: - yield prefix + "c" + self.dimension_separator.join(map(str, chunk_tuple)) - else: - yield key - - def __iter__(self) -> Iterator[str]: - for key in self.inner_store: - yield from self._shard_key_to_original_keys(key) - - def __len__(self): - return sum(1 for _ in self.keys()) - - def get_partial_values(self, key_ranges): - if self.supports_efficient_get_partial_values: - transformed_key_ranges = [] - cached_indices = {} - none_indices = [] - for i, (key, range_) in enumerate(key_ranges): - if self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - try: - index = cached_indices[shard_key] - except KeyError: - try: - index = self._get_index_from_store(shard_key) - except KeyError: - none_indices.append(i) - continue - cached_indices[shard_key] = index - chunk_slice = index.get_chunk_slice(chunk_subkey) - if chunk_slice is None: - none_indices.append(i) - continue - range_start, range_length = range_ - if range_length is None: - range_length = chunk_slice.stop - chunk_slice.start - transformed_key_ranges.append( - (shard_key, (range_start + chunk_slice.start, 
range_length)) - ) - else: # pragma: no cover - transformed_key_ranges.append((key, range_)) - values = self.inner_store.get_partial_values(transformed_key_ranges) - for i in none_indices: - values.insert(i, None) - return values - else: - return StoreV3.get_partial_values(self, key_ranges) - - def supports_efficient_set_partial_values(self): - return False - - def set_partial_values(self, key_start_values): - # This does not yet implement efficient set_partial_values - StoreV3.set_partial_values(self, key_start_values) - - def rename(self, src_path: str, dst_path: str) -> None: - StoreV3.rename(self, src_path, dst_path) # type: ignore[arg-type] - - def list_prefix(self, prefix): - return StoreV3.list_prefix(self, prefix) - - def erase_prefix(self, prefix): - if self._is_data_key(prefix): - StoreV3.erase_prefix(self, prefix) - else: - self.inner_store.erase_prefix(prefix) - - def rmdir(self, path=None): - path = normalize_storage_path(path) - _rmdir_from_keys_v3(self, path) # type: ignore - - def __contains__(self, key): - if self._is_data_key(key): - shard_key, chunk_subkeys = self._key_to_shard(key) - try: - index = self._get_index_from_store(shard_key) - except KeyError: - return False - chunk_slice = index.get_chunk_slice(chunk_subkeys) - return chunk_slice is not None - else: - return self._inner_store.__contains__(key) diff --git a/src/zarr/attrs.py b/src/zarr/attrs.py index e967c5b853..b3749acff9 100644 --- a/src/zarr/attrs.py +++ b/src/zarr/attrs.py @@ -1,7 +1,7 @@ import warnings from collections.abc import MutableMapping -from zarr._storage.store import Store, StoreV3 +from zarr._storage.store import Store from zarr.util import json_dumps @@ -26,8 +26,7 @@ class Attributes(MutableMapping): """ def __init__(self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None): - self._version = getattr(store, "_store_version", 2) - _Store = Store if self._version == 2 else StoreV3 + _Store = Store self.store = _Store._ensure_store(store) self.key = key self.read_only = read_only @@ -40,8 +39,6 @@ def _get_nosync(self): data = self.store[self.key] except KeyError: d = dict() - if self._version > 2: - d["attributes"] = {} else: d = self.store._metadata_class.parse_metadata(data) return d @@ -51,8 +48,6 @@ def asdict(self): if self.cache and self._cached_asdict is not None: return self._cached_asdict d = self._get_nosync() - if self._version == 3: - d = d["attributes"] if self.cache: self._cached_asdict = d return d @@ -60,10 +55,7 @@ def asdict(self): def refresh(self): """Refresh cached attributes from the store.""" if self.cache: - if self._version == 2: - self._cached_asdict = self._get_nosync() - else: - self._cached_asdict = self._get_nosync()["attributes"] + self._cached_asdict = self._get_nosync() def __contains__(self, x): return x in self.asdict() @@ -91,10 +83,8 @@ def _setitem_nosync(self, item, value): d = self._get_nosync() # set key value - if self._version == 2: - d[item] = value - else: - d["attributes"][item] = value + + d[item] = value # _put modified data self._put_nosync(d) @@ -107,10 +97,7 @@ def _delitem_nosync(self, key): d = self._get_nosync() # delete key value - if self._version == 2: - del d[key] - else: - del d["attributes"][key] + del d[key] # _put modified data self._put_nosync(d) @@ -118,13 +105,10 @@ def _delitem_nosync(self, key): def put(self, d): """Overwrite all attributes with the key/value pairs in the provided dictionary `d` in a single operation.""" - if self._version == 2: - self._write_op(self._put_nosync, d) - else: - 
self._write_op(self._put_nosync, dict(attributes=d)) + self._write_op(self._put_nosync, d) def _put_nosync(self, d): - d_to_check = d if self._version == 2 else d["attributes"] + d_to_check = d if not all(isinstance(item, str) for item in d_to_check): # TODO: Raise an error for non-string keys # raise TypeError("attribute keys must be strings") @@ -139,33 +123,11 @@ def _put_nosync(self, d): except TypeError as ex: # pragma: no cover raise TypeError("attribute keys can not be stringified") from ex - if self._version == 2: - d = d_to_check - else: - d["attributes"] = d_to_check + d = d_to_check - if self._version == 2: - self.store[self.key] = json_dumps(d) - if self.cache: - self._cached_asdict = d - else: - if self.key in self.store: - # Cannot write the attributes directly to JSON, but have to - # store it within the pre-existing attributes key of the v3 - # metadata. - - # Note: this changes the store.counter result in test_caching_on! - - meta = self.store._metadata_class.parse_metadata(self.store[self.key]) - if "attributes" in meta and "filters" in meta["attributes"]: - # need to preserve any existing "filters" attribute - d["attributes"]["filters"] = meta["attributes"]["filters"] - meta["attributes"] = d["attributes"] - else: - meta = d - self.store[self.key] = json_dumps(meta) - if self.cache: - self._cached_asdict = d["attributes"] + self.store[self.key] = json_dumps(d) + if self.cache: + self._cached_asdict = d # noinspection PyMethodOverriding def update(self, *args, **kwargs): @@ -177,10 +139,7 @@ def _update_nosync(self, *args, **kwargs): d = self._get_nosync() # update - if self._version == 2: - d.update(*args, **kwargs) - else: - d["attributes"].update(*args, **kwargs) + d.update(*args, **kwargs) # _put modified data self._put_nosync(d) diff --git a/src/zarr/convenience.py b/src/zarr/convenience.py index 9c0deeea47..615a019dc3 100644 --- a/src/zarr/convenience.py +++ b/src/zarr/convenience.py @@ -3,8 +3,6 @@ import os import re from collections.abc import Mapping, MutableMapping - -from zarr._storage.store import data_root, meta_root, assert_zarr_v3_api_available from zarr.core import Array from zarr.creation import array as _create_array from zarr.creation import open_array @@ -14,14 +12,12 @@ from zarr.hierarchy import open_group from zarr.meta import json_dumps, json_loads from zarr.storage import ( - _get_metadata_suffix, contains_array, contains_group, normalize_store_arg, BaseStore, ConsolidatedMetadataStore, ) -from zarr._storage.v3 import ConsolidatedMetadataStoreV3 from zarr.util import TreeViewer, buffer_size, normalize_storage_path from typing import Union @@ -38,7 +34,7 @@ def _check_and_update_path(store: BaseStore, path): # noinspection PyShadowingBuiltins -def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=None, **kwargs): +def open(store: StoreLike = None, mode: str = "a", *, path=None, **kwargs): """Convenience function to open a group or array using file-mode-like semantics. Parameters @@ -50,10 +46,6 @@ def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=No read/write (must exist); 'a' means read/write (create if doesn't exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). - zarr_version : {2, 3, None}, optional - The zarr protocol version to use. The default value of None will attempt - to infer the version from `store` if possible, otherwise it will fall - back to 2. path : str or None, optional The path within the store to open. 
**kwargs @@ -101,10 +93,7 @@ def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=No # we pass storage options explicitly, since normalize_store_arg might construct # a store if the input is a fsspec-compatible URL _store: BaseStore = normalize_store_arg( - store, - storage_options=kwargs.pop("storage_options", {}), - mode=mode, - zarr_version=zarr_version, + store, storage_options=kwargs.pop("storage_options", {}), mode=mode ) # path = _check_and_update_path(_store, path) path = normalize_storage_path(path) @@ -135,7 +124,7 @@ def _might_close(path): return isinstance(path, (str, os.PathLike)) -def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs): +def save_array(store: StoreLike, arr, *, path=None, **kwargs): """Convenience function to save a NumPy array to the local file system, following a similar API to the NumPy save() function. @@ -145,10 +134,6 @@ def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs) Store or path to directory in file system or name of zip file. arr : ndarray NumPy array with data to save. - zarr_version : {2, 3, None}, optional - The zarr protocol version to use when saving. The default value of None - will attempt to infer the version from `store` if possible, otherwise - it will fall back to 2. path : str or None, optional The path within the store where the array will be saved. kwargs @@ -173,19 +158,17 @@ def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs) """ may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + _store: BaseStore = normalize_store_arg(store, mode="w") path = _check_and_update_path(_store, path) try: - _create_array( - arr, store=_store, overwrite=True, zarr_version=zarr_version, path=path, **kwargs - ) + _create_array(arr, store=_store, overwrite=True, path=path, **kwargs) finally: if may_need_closing: # needed to ensure zip file records are written _store.close() -def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): +def save_group(store: StoreLike, *args, path=None, **kwargs): """Convenience function to save several NumPy arrays to the local file system, following a similar API to the NumPy savez()/savez_compressed() functions. @@ -195,10 +178,6 @@ def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): Store or path to directory in file system or name of zip file. args : ndarray NumPy arrays with data to save. - zarr_version : {2, 3, None}, optional - The zarr protocol version to use when saving. The default value of None - will attempt to infer the version from `store` if possible, otherwise - it will fall back to 2. path : str or None, optional Path within the store where the group will be saved. 
kwargs @@ -253,22 +232,22 @@ def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): raise ValueError("at least one array must be provided") # handle polymorphic store arg may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + _store: BaseStore = normalize_store_arg(store, mode="w") path = _check_and_update_path(_store, path) try: - grp = _create_group(_store, path=path, overwrite=True, zarr_version=zarr_version) + grp = _create_group(_store, path=path, overwrite=True) for i, arr in enumerate(args): k = "arr_{}".format(i) - grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) + grp.create_dataset(k, data=arr, overwrite=True) for k, arr in kwargs.items(): - grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) + grp.create_dataset(k, data=arr, overwrite=True) finally: if may_need_closing: # needed to ensure zip file records are written _store.close() -def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): +def save(store: StoreLike, *args, path=None, **kwargs): """Convenience function to save an array or group of arrays to the local file system. Parameters @@ -277,10 +256,6 @@ def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): Store or path to directory in file system or name of zip file. args : ndarray NumPy arrays with data to save. - zarr_version : {2, 3, None}, optional - The zarr protocol version to use when saving. The default value of None - will attempt to infer the version from `store` if possible, otherwise - it will fall back to 2. path : str or None, optional The path within the group where the arrays will be saved. kwargs @@ -349,9 +324,9 @@ def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): if len(args) == 0 and len(kwargs) == 0: raise ValueError("at least one array must be provided") if len(args) == 1 and len(kwargs) == 0: - save_array(store, args[0], zarr_version=zarr_version, path=path) + save_array(store, args[0], path=path) else: - save_group(store, *args, zarr_version=zarr_version, path=path, **kwargs) + save_group(store, *args, path=path, **kwargs) class LazyLoader(Mapping): @@ -383,17 +358,13 @@ def __repr__(self): return r -def load(store: StoreLike, zarr_version=None, path=None): +def load(store: StoreLike, path=None): """Load data from an array or group into memory. Parameters ---------- store : MutableMapping or string Store or path to directory in file system or name of zip file. - zarr_version : {2, 3, None}, optional - The zarr protocol version to use when loading. The default value of - None will attempt to infer the version from `store` if possible, - otherwise it will fall back to 2. path : str or None, optional The path within the store from which to load. @@ -415,7 +386,7 @@ def load(store: StoreLike, zarr_version=None, path=None): """ # handle polymorphic store arg - _store = normalize_store_arg(store, zarr_version=zarr_version) + _store = normalize_store_arg(store) path = _check_and_update_path(_store, path) if contains_array(_store, path=path): return Array(store=_store, path=path)[...] 
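For reference, the v2-only convenience workflow that remains after the changes above can be exercised roughly as follows. This is an illustrative sketch rather than part of the patch; the "example.zarr" path and the array values are made up, and the arr_0/arr_1 member names follow the save_group naming shown in the hunk above.

    import numpy as np
    import zarr

    a = np.arange(10)
    b = np.ones((5, 5))

    # save several arrays as one group in a directory store
    zarr.save_group("example.zarr", a, b, path="results")

    # load() on a group returns a lazy mapping of member name -> ndarray
    loaded = zarr.load("example.zarr", path="results")
    print(loaded["arr_0"].shape, loaded["arr_1"].shape)
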
@@ -669,9 +640,7 @@ def copy_store( raise ValueError("zarr stores must share the same protocol version") if source_store_version > 2: - nchar_root = len(meta_root) - # code below assumes len(meta_root) === len(data_root) - assert len(data_root) == nchar_root + raise NotImplementedError("This function only supports Zarr version 2.") # setup logging with _LogWriter(log) as log: @@ -682,10 +651,7 @@ def copy_store( if not source_key.startswith(source_path): continue elif source_store_version == 3: - # skip 'meta/root/' or 'data/root/' at start of source_key - if not source_key[nchar_root:].startswith(source_path): - continue - + raise NotImplementedError("This function only supports Zarr version 2.") # process excludes and includes exclude = False for prog in excludes: @@ -705,10 +671,7 @@ def copy_store( key_suffix = source_key[len(source_path) :] dest_key = dest_path + key_suffix elif source_store_version == 3: - # nchar_root is length of 'meta/root/' or 'data/root/' - key_suffix = source_key[nchar_root + len(source_path) :] - dest_key = source_key[:nchar_root] + dest_path + key_suffix - + raise NotImplementedError("This function only supports Zarr version 2.") # create a descriptive label for this operation descr = source_key if dest_key != source_key: @@ -1177,8 +1140,6 @@ def copy_all( # setup counting variables n_copied = n_skipped = n_bytes_copied = 0 - zarr_version = getattr(source, "_version", 2) - # setup logging with _LogWriter(log) as log: for k in source.keys(): @@ -1197,8 +1158,8 @@ def copy_all( n_copied += c n_skipped += s n_bytes_copied += b - if zarr_version == 2: - dest.attrs.update(**source.attrs) + + dest.attrs.update(**source.attrs) # log a final message with a summary of what happened _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) @@ -1253,23 +1214,7 @@ def is_zarr_key(key): return key.endswith(".zarray") or key.endswith(".zgroup") or key.endswith(".zattrs") else: - assert_zarr_v3_api_available() - - sfx = _get_metadata_suffix(store) # type: ignore - - def is_zarr_key(key): - return ( - key.endswith(".array" + sfx) or key.endswith(".group" + sfx) or key == "zarr.json" - ) - - # cannot create a group without a path in v3 - # so create /meta/root/consolidated group to store the metadata - if "consolidated" not in store: - _create_group(store, path="consolidated") - if not metadata_key.startswith("meta/root/"): - metadata_key = "meta/root/consolidated/" + metadata_key - # path = 'consolidated' - + raise NotImplementedError("This function only supports Zarr version 2.") out = { "zarr_consolidated_format": 1, "metadata": {key: json_loads(store[key]) for key in store if is_zarr_key(key)}, @@ -1321,10 +1266,7 @@ def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", ** """ # normalize parameters - zarr_version = kwargs.get("zarr_version") - store = normalize_store_arg( - store, storage_options=kwargs.get("storage_options"), mode=mode, zarr_version=zarr_version - ) + store = normalize_store_arg(store, storage_options=kwargs.get("storage_options"), mode=mode) if mode not in {"r", "r+"}: raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}".format(mode)) @@ -1332,11 +1274,7 @@ def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", ** if store._store_version == 2: ConsolidatedStoreClass = ConsolidatedMetadataStore else: - assert_zarr_v3_api_available() - ConsolidatedStoreClass = ConsolidatedMetadataStoreV3 - # default is to store within 'consolidated' group on v3 - if not 
metadata_key.startswith("meta/root/"): - metadata_key = "meta/root/consolidated/" + metadata_key + raise NotImplementedError("This function only supports Zarr version 2.") # setup metadata store meta_store = ConsolidatedStoreClass(store, metadata_key=metadata_key) diff --git a/src/zarr/core.py b/src/zarr/core.py index d22a9d79c3..06dcb32063 100644 --- a/src/zarr/core.py +++ b/src/zarr/core.py @@ -10,7 +10,7 @@ import numpy as np from numcodecs.compat import ensure_bytes -from zarr._storage.store import _prefix_to_attrs_key, assert_zarr_v3_api_available +from zarr._storage.store import _prefix_to_attrs_key from zarr.attrs import Attributes from zarr.codecs import AsType, get_codec from zarr.context import Context @@ -36,7 +36,6 @@ pop_fields, ) from zarr.storage import ( - _get_hierarchy_metadata, _prefix_to_array_key, KVStore, getsize, @@ -45,6 +44,7 @@ ) from zarr.util import ( ConstantMap, + UncompressedPartialReadBufferV3, all_equal, InfoReporter, check_array_shape, @@ -56,7 +56,6 @@ normalize_shape, normalize_storage_path, PartialReadBuffer, - UncompressedPartialReadBufferV3, ensure_ndarray_like, ) @@ -125,21 +124,14 @@ def __init__( cache_attrs=True, partial_decompress=False, write_empty_chunks=True, - zarr_version=None, meta_array=None, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized - - store = normalize_store_arg(store, zarr_version=zarr_version) - if zarr_version is None: - zarr_version = store._store_version - - if zarr_version != 2: - assert_zarr_v3_api_available() + store = normalize_store_arg(store) if chunk_store is not None: - chunk_store = normalize_store_arg(chunk_store, zarr_version=zarr_version) + chunk_store = normalize_store_arg(chunk_store) self._store = store self._chunk_store = chunk_store @@ -159,12 +151,6 @@ def __init__( self._meta_array = np.empty_like(meta_array, shape=()) else: self._meta_array = np.empty(()) - self._version = zarr_version - if self._version == 3: - self._data_key_prefix = "data/root/" + self._key_prefix - self._data_path = "data/root/" + self._path - self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) - self._metadata_key_suffix = self._hierarchy_metadata["metadata_key_suffix"] # initialize metadata self._load_metadata() @@ -205,26 +191,19 @@ def _load_metadata_nosync(self): self._shape = meta["shape"] self._fill_value = meta["fill_value"] dimension_separator = meta.get("dimension_separator", None) - if self._version == 2: - self._chunks = meta["chunks"] - self._dtype = meta["dtype"] - self._order = meta["order"] - if dimension_separator is None: - try: - dimension_separator = self._store._dimension_separator - except (AttributeError, KeyError): - pass - - # Fallback for any stores which do not choose a default - if dimension_separator is None: - dimension_separator = "." - else: - self._chunks = meta["chunk_grid"]["chunk_shape"] - self._dtype = meta["data_type"] - self._order = meta["chunk_memory_layout"] - chunk_separator = meta["chunk_grid"]["separator"] + + self._chunks = meta["chunks"] + self._dtype = meta["dtype"] + self._order = meta["order"] + if dimension_separator is None: + try: + dimension_separator = self._store._dimension_separator + except (AttributeError, KeyError): + pass + + # Fallback for any stores which do not choose a default if dimension_separator is None: - dimension_separator = meta.get("dimension_separator", chunk_separator) + dimension_separator = "." 
self._dimension_separator = dimension_separator @@ -232,32 +211,17 @@ def _load_metadata_nosync(self): compressor = meta.get("compressor", None) if compressor is None: self._compressor = None - elif self._version == 2: - self._compressor = get_codec(compressor) else: - self._compressor = compressor + self._compressor = get_codec(compressor) # setup filters - if self._version == 2: - filters = meta.get("filters", []) - else: - # TODO: storing filters under attributes for now since the v3 - # array metadata does not have a 'filters' attribute. - filters = meta["attributes"].get("filters", []) + + filters = meta.get("filters", []) + if filters: filters = [get_codec(config) for config in filters] self._filters = filters - if self._version == 3: - storage_transformers = meta.get("storage_transformers", []) - if storage_transformers: - transformed_store = self._chunk_store or self._store - for storage_transformer in storage_transformers[::-1]: - transformed_store = storage_transformer._copy_for_array( - self, transformed_store - ) - self._transformed_chunk_store = transformed_store - def _refresh_metadata(self): if not self._cache_metadata: self._load_metadata() @@ -278,35 +242,22 @@ def _flush_metadata_nosync(self): filters_config = [f.get_config() for f in self._filters] else: filters_config = None - _compressor = compressor_config if self._version == 2 else self._compressor + _compressor = compressor_config meta = dict( shape=self._shape, compressor=_compressor, fill_value=self._fill_value, filters=filters_config, ) - if getattr(self._store, "_store_version", 2) == 2: - meta.update( - dict( - chunks=self._chunks, - dtype=self._dtype, - order=self._order, - dimension_separator=self._dimension_separator, - ) - ) - else: - meta.update( - dict( - chunk_grid=dict( - type="regular", - chunk_shape=self._chunks, - separator=self._dimension_separator, - ), - data_type=self._dtype, - chunk_memory_layout=self._order, - attributes=self.attrs.asdict(), - ) + + meta.update( + dict( + chunks=self._chunks, + dtype=self._dtype, + order=self._order, + dimension_separator=self._dimension_separator, ) + ) mkey = _prefix_to_array_key(self._store, self._key_prefix) self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) @@ -496,28 +447,11 @@ def nchunks(self): def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" - # count chunk keys - if self._version == 3: - # # key pattern for chunk keys - # prog = re.compile(r'\.'.join([r'c\d+'] * min(1, self.ndim))) - # # get chunk keys, excluding the prefix - # members = self.chunk_store.list_prefix(self._data_path) - # members = [k.split(self._data_key_prefix)[1] for k in members] - # # count the chunk keys - # return sum(1 for k in members if prog.match(k)) - - # key pattern for chunk keys - prog = re.compile(self._data_key_prefix + r"c\d+") # TODO: ndim == 0 case? 
- # get chunk keys, excluding the prefix - members = self.chunk_store.list_prefix(self._data_path) - # count the chunk keys - return sum(1 for k in members if prog.match(k)) - else: - # key pattern for chunk keys - prog = re.compile(r"\.".join([r"\d+"] * min(1, self.ndim))) + # key pattern for chunk keys + prog = re.compile(r"\.".join([r"\d+"] * min(1, self.ndim))) - # count chunk keys - return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) + # count chunk keys + return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) # backwards compatibility initialized = nchunks_initialized @@ -2044,8 +1978,6 @@ def _process_chunk( cdata = cdata.read_full() self._compressor.decode(cdata, dest) else: - if isinstance(cdata, UncompressedPartialReadBufferV3): - cdata = cdata.read_full() chunk = ensure_ndarray_like(cdata).view(self._dtype) chunk = chunk.reshape(self._chunks, order=self._order) np.copyto(dest, chunk) @@ -2065,21 +1997,13 @@ def _process_chunk( else dim for i, dim in enumerate(self.chunks) ] - if isinstance(cdata, UncompressedPartialReadBufferV3): - chunk_partial = self._decode_chunk( - cdata.read_part(start, nitems), - start=start, - nitems=nitems, - expected_shape=expected_shape, - ) - else: - cdata.read_part(start, nitems) - chunk_partial = self._decode_chunk( - cdata.buff, - start=start, - nitems=nitems, - expected_shape=expected_shape, - ) + cdata.read_part(start, nitems) + chunk_partial = self._decode_chunk( + cdata.buff, + start=start, + nitems=nitems, + expected_shape=expected_shape, + ) tmp[partial_out_selection] = chunk_partial out[out_selection] = tmp[chunk_selection] return @@ -2318,19 +2242,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): return chunk def _chunk_key(self, chunk_coords): - if self._version == 3: - # _chunk_key() corresponds to data_key(P, i, j, ...) example in the spec - # where P = self._key_prefix, i, j, ... = chunk_coords - # e.g. c0/2/3 for 3d array with chunk index (0, 2, 3) - # https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/core/v3.0.html#regular-grids - return ( - "data/root/" - + self._key_prefix - + "c" - + self._dimension_separator.join(map(str, chunk_coords)) - ) - else: - return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) + return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): # decompress @@ -2552,7 +2464,6 @@ def __getstate__(self): "cache_attrs": self._attrs.cache, "partial_decompress": self._partial_decompress, "write_empty_chunks": self._write_empty_chunks, - "zarr_version": self._version, "meta_array": self._meta_array, } @@ -2860,7 +2771,6 @@ def view( read_only=read_only, synchronizer=synchronizer, cache_metadata=True, - zarr_version=self._version, ) a._is_view = True diff --git a/src/zarr/creation.py b/src/zarr/creation.py index 6227f90b7b..c93178c0e7 100644 --- a/src/zarr/creation.py +++ b/src/zarr/creation.py @@ -4,7 +4,6 @@ import numpy as np from numcodecs.registry import codec_registry -from zarr._storage.store import DEFAULT_ZARR_VERSION from zarr.core import Array from zarr.errors import ( ArrayNotFoundError, @@ -42,9 +41,7 @@ def create( dimension_separator=None, write_empty_chunks=True, *, - zarr_version=None, meta_array=None, - storage_transformers=(), **kwargs, ): """Create an array. @@ -109,21 +106,6 @@ def create( .. 
versionadded:: 2.11 - storage_transformers : sequence of StorageTransformers, optional - Setting storage transformers, changes the storage structure and behaviour - of data coming from the underlying store. The transformers are applied in the - order of the given sequence. Supplying an empty sequence is the same as omitting - the argument or setting it to None. May only be set when using zarr_version 3. - - .. versionadded:: 2.13 - - zarr_version : {None, 2, 3}, optional - The zarr protocol version of the created array. If None, it will be - inferred from ``store`` or ``chunk_store`` if they are provided, - otherwise defaulting to 2. - - .. versionadded:: 2.12 - meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. @@ -173,12 +155,9 @@ def create( """ - if zarr_version is None and store is None: - zarr_version = getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) # handle polymorphic store arg - store = normalize_store_arg(store, zarr_version=zarr_version, mode="w") - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + store = normalize_store_arg(store, mode="w") # API compatibility with h5py compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) @@ -196,9 +175,6 @@ def create( ) dimension_separator = normalize_dimension_separator(dimension_separator) - if zarr_version > 2 and path is None: - path = "/" - # initialize array metadata init_array( store, @@ -214,7 +190,6 @@ def create( filters=filters, object_codec=object_codec, dimension_separator=dimension_separator, - storage_transformers=storage_transformers, ) # instantiate array @@ -463,7 +438,6 @@ def open_array( partial_decompress=False, write_empty_chunks=True, *, - zarr_version=None, dimension_separator=None, meta_array=None, **kwargs, @@ -531,15 +505,10 @@ def open_array( .. versionadded:: 2.11 - zarr_version : {None, 2, 3}, optional - The zarr protocol version of the array to be opened. If None, it will - be inferred from ``store`` or ``chunk_store`` if they are provided, - otherwise defaulting to 2. dimension_separator : {None, '.', '/'}, optional Can be used to specify whether the array is in a flat ('.') or nested ('/') format. If None, the appropriate value will be read from `store` - when present. Otherwise, defaults to '.' when ``zarr_version == 2`` - and `/` otherwise. + when present. Otherwise, defaults to '.'. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. 
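As a point of reference for the simplified create() signature above, a plain v2 array creation now looks roughly like the following. This is an illustrative sketch rather than part of the patch; the store path, shape, and chunk sizes are made up.

    import zarr

    # create a chunked array backed by a directory store; with no
    # zarr_version argument the v2 layout is always used
    z = zarr.create(
        shape=(100, 100),
        chunks=(10, 10),
        dtype="f8",
        store="example_array.zarr",
        overwrite=True,
    )
    z[:] = 0.0
    print(z.info)
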
@@ -579,28 +548,18 @@ def open_array( # w- or x : create, fail if exists # a : read/write if exists, create otherwise (default) - if zarr_version is None and store is None: - zarr_version = getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) - # handle polymorphic store arg - store = normalize_store_arg( - store, storage_options=storage_options, mode=mode, zarr_version=zarr_version - ) - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + store = normalize_store_arg(store, storage_options=storage_options, mode=mode) + if chunk_store is not None: - chunk_store = normalize_store_arg( - chunk_store, storage_options=storage_options, mode=mode, zarr_version=zarr_version - ) + chunk_store = normalize_store_arg(chunk_store, storage_options=storage_options, mode=mode) # respect the dimension separator specified in a store, if present if dimension_separator is None: if hasattr(store, "_dimension_separator"): dimension_separator = store._dimension_separator else: - dimension_separator = "." if zarr_version == 2 else "/" - - if zarr_version == 3 and path is None: - path = "array" # TODO: raise ValueError instead? + dimension_separator = "." path = normalize_storage_path(path) @@ -709,7 +668,6 @@ def _like_args(a, kwargs): kwargs.setdefault("compressor", a.compressor) kwargs.setdefault("order", a.order) kwargs.setdefault("filters", a.filters) - kwargs.setdefault("zarr_version", a._version) else: kwargs.setdefault("compressor", "default") kwargs.setdefault("order", "C") diff --git a/src/zarr/hierarchy.py b/src/zarr/hierarchy.py index 1c9848e647..e30d2d7996 100644 --- a/src/zarr/hierarchy.py +++ b/src/zarr/hierarchy.py @@ -3,13 +3,6 @@ import numpy as np -from zarr._storage.store import ( - _get_metadata_suffix, - data_root, - meta_root, - DEFAULT_ZARR_VERSION, - assert_zarr_v3_api_available, -) from zarr.attrs import Attributes from zarr.core import Array from zarr.creation import ( @@ -31,21 +24,20 @@ ReadOnlyError, ) from zarr.storage import ( - _get_hierarchy_metadata, _prefix_to_group_key, BaseStore, MemoryStore, + group_meta_key, attrs_key, contains_array, contains_group, - group_meta_key, init_group, listdir, normalize_store_arg, rename, rmdir, ) -from zarr._storage.v3 import MemoryStoreV3 + from zarr.util import ( InfoReporter, TreeViewer, @@ -143,19 +135,12 @@ def __init__( chunk_store=None, cache_attrs=True, synchronizer=None, - zarr_version=None, *, meta_array=None, ): - store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) - if zarr_version is None: - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) - - if zarr_version != 2: - assert_zarr_v3_api_available() - + store: BaseStore = _normalize_store_arg(store) if chunk_store is not None: - chunk_store: BaseStore = _normalize_store_arg(chunk_store, zarr_version=zarr_version) + chunk_store: BaseStore = _normalize_store_arg(chunk_store) self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) @@ -169,12 +154,6 @@ def __init__( self._meta_array = np.empty_like(meta_array, shape=()) else: self._meta_array = np.empty(()) - self._version = zarr_version - if self._version == 3: - self._data_key_prefix = data_root + self._key_prefix - self._data_path = data_root + self._path - self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) - self._metadata_key_suffix = _get_metadata_suffix(store=self._store) # guard conditions if contains_array(store, path=self._path): @@ -187,25 +166,13 @@ def __init__( assert not 
mkey.endswith("root/.group") meta_bytes = store[mkey] except KeyError: - if self._version == 2: - raise GroupNotFoundError(path) - else: - implicit_prefix = meta_root + self._key_prefix - if self._store.list_prefix(implicit_prefix): - # implicit group does not have any metadata - self._meta = None - else: - raise GroupNotFoundError(path) + raise GroupNotFoundError(path) else: self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) # setup attributes - if self._version == 2: - akey = self._key_prefix + attrs_key - else: - # Note: mkey doesn't actually exist for implicit groups, but the - # object can still be created. - akey = mkey + akey = self._key_prefix + attrs_key + self._attrs = Attributes( store, key=akey, read_only=read_only, cache=cache_attrs, synchronizer=synchronizer ) @@ -304,35 +271,11 @@ def __iter__(self): quux """ - if getattr(self._store, "_store_version", 2) == 2: - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_array(self._store, path) or contains_group(self._store, path): - yield key - else: - # TODO: Should this iterate over data folders and/or metadata - # folders and/or metadata files - - dir_path = meta_root + self._key_prefix - name_start = len(dir_path) - keys, prefixes = self._store.list_dir(dir_path) - - # yield any groups or arrays - sfx = self._metadata_key_suffix - for key in keys: - len_suffix = len(".group") + len(sfx) # same for .array - if key.endswith((".group" + sfx, ".array" + sfx)): - yield key[name_start:-len_suffix] - - # also yield any implicit groups - for prefix in prefixes: - prefix = prefix.rstrip("/") - # only implicit if there is no .group.sfx file - if prefix + ".group" + sfx not in self._store: - yield prefix[name_start:] - - # Note: omit data/root/ to avoid duplicate listings - # any group in data/root/ must has an entry in meta/root/ + + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path) or contains_group(self._store, path): + yield key def __len__(self): """Number of members.""" @@ -400,7 +343,6 @@ def __getstate__(self): "chunk_store": self._chunk_store, "cache_attrs": self._attrs.cache, "synchronizer": self._synchronizer, - "zarr_version": self._version, "meta_array": self._meta_array, } @@ -466,7 +408,6 @@ def __getitem__(self, item): chunk_store=self._chunk_store, synchronizer=self._synchronizer, cache_attrs=self.attrs.cache, - zarr_version=self._version, meta_array=self._meta_array, ) elif contains_group(self._store, path, explicit_only=True): @@ -477,25 +418,8 @@ def __getitem__(self, item): chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, synchronizer=self._synchronizer, - zarr_version=self._version, meta_array=self._meta_array, ) - elif self._version == 3: - implicit_group = meta_root + path + "/" - # non-empty folder in the metadata path implies an implicit group - if self._store.list_prefix(implicit_group): - return Group( - self._store, - read_only=self._read_only, - path=path, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer, - zarr_version=self._version, - meta_array=self._meta_array, - ) - else: - raise KeyError(item) else: raise KeyError(item) @@ -546,29 +470,11 @@ def group_keys(self): ['bar', 'foo'] """ - if self._version == 2: - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path): - yield key - else: - dir_name = meta_root + self._path - group_sfx 
= ".group" + self._metadata_key_suffix - # The fact that we call sorted means this can't be a streaming generator. - # The keys are already in memory. - all_keys = sorted(listdir(self._store, dir_name)) - for key in all_keys: - if key.endswith(group_sfx): - key = key[: -len(group_sfx)] - if key in all_keys: - # otherwise we will double count this group - continue - path = self._key_prefix + key - if path.endswith(".array" + self._metadata_key_suffix): - # skip array keys - continue - if contains_group(self._store, path, explicit_only=False): - yield key + + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key def groups(self): """Return an iterator over (name, value) pairs for groups only. @@ -587,26 +493,10 @@ def groups(self): foo """ - if self._version == 2: - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path, explicit_only=False): - yield ( - key, - Group( - self._store, - path=path, - read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer, - zarr_version=self._version, - ), - ) - else: - for key in self.group_keys(): - path = self._key_prefix + key + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path, explicit_only=False): yield ( key, Group( @@ -616,7 +506,6 @@ def groups(self): chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, synchronizer=self._synchronizer, - zarr_version=self._version, ), ) @@ -671,34 +560,14 @@ def arrays(self, recurse=False): return self._array_iter(keys_only=False, method="arrays", recurse=recurse) def _array_iter(self, keys_only, method, recurse): - if self._version == 2: - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_array(self._store, path): - _key = key.rstrip("/") - yield _key if keys_only else (_key, self[key]) - elif recurse and contains_group(self._store, path): - group = self[key] - yield from getattr(group, method)(recurse=recurse) - else: - dir_name = meta_root + self._path - array_sfx = ".array" + self._metadata_key_suffix - group_sfx = ".group" + self._metadata_key_suffix - - for key in sorted(listdir(self._store, dir_name)): - if key.endswith(array_sfx): - key = key[: -len(array_sfx)] - _key = key.rstrip("/") - yield _key if keys_only else (_key, self[key]) - - path = self._key_prefix + key - assert not path.startswith("meta/") - if key.endswith(group_sfx): - # skip group metadata keys - continue - elif recurse and contains_group(self._store, path): - group = self[key] - yield from getattr(group, method)(recurse=recurse) + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + yield from getattr(group, method)(recurse=recurse) def visitvalues(self, func): """Run ``func`` on each object. 
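The v2-only listing behaviour kept by the hunks above can be exercised roughly as follows. This is an illustrative sketch rather than part of the patch, using the in-memory default store and made-up member names.

    import zarr

    root = zarr.group()  # in-memory store by default
    root.create_group("foo")
    root.create_dataset("bar", shape=(8,), dtype="i4")

    print(list(root.group_keys()))               # ['foo']
    print([name for name, _ in root.arrays()])   # ['bar']
    print(sorted(root))                          # members of both kinds: ['bar', 'foo']
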
@@ -978,7 +847,6 @@ def _create_group_nosync(self, name, overwrite=False): chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, synchronizer=self._synchronizer, - zarr_version=self._version, ) def create_groups(self, *names, **kwargs): @@ -1028,7 +896,6 @@ def _require_group_nosync(self, name, overwrite=False): chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, synchronizer=self._synchronizer, - zarr_version=self._version, ) def require_groups(self, *names): @@ -1340,18 +1207,10 @@ def move(self, source, dest): self._write_op(self._move_nosync, source, dest) -def _normalize_store_arg(store, *, storage_options=None, mode="r", zarr_version=None): - if zarr_version is None: - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) - - if zarr_version != 2: - assert_zarr_v3_api_available() - +def _normalize_store_arg(store, *, storage_options=None, mode="r"): if store is None: - return MemoryStore() if zarr_version == 2 else MemoryStoreV3() - return normalize_store_arg( - store, storage_options=storage_options, mode=mode, zarr_version=zarr_version - ) + return MemoryStore() + return normalize_store_arg(store, storage_options=storage_options, mode=mode) def group( @@ -1362,7 +1221,6 @@ def group( synchronizer=None, path=None, *, - zarr_version=None, meta_array=None, ): """Create a group. @@ -1414,20 +1272,11 @@ def group( """ # handle polymorphic store arg - store = _normalize_store_arg(store, zarr_version=zarr_version, mode="w") - if zarr_version is None: - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) - - if zarr_version != 2: - assert_zarr_v3_api_available() + store = _normalize_store_arg(store, mode="w") path = normalize_storage_path(path) - requires_init = None - if zarr_version == 2: - requires_init = overwrite or not contains_group(store) - elif zarr_version == 3: - requires_init = overwrite or not contains_group(store, path) + requires_init = overwrite or not contains_group(store) if requires_init: init_group(store, overwrite=overwrite, chunk_store=chunk_store, path=path) @@ -1439,7 +1288,6 @@ def group( cache_attrs=cache_attrs, synchronizer=synchronizer, path=path, - zarr_version=zarr_version, meta_array=meta_array, ) @@ -1453,7 +1301,6 @@ def open_group( chunk_store=None, storage_options=None, *, - zarr_version=None, meta_array=None, ): """Open a group using file-mode-like semantics. 
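For the simplified group() and open_group() entry points above, a typical v2-only call now looks roughly like this. This is an illustrative sketch rather than part of the patch; the directory path and member names are made up.

    import zarr

    # open (or create) a persistent group; mode="a" creates it when missing
    root = zarr.open_group("example_group.zarr", mode="a")
    sub = root.require_group("analysis")
    sub.zeros("counts", shape=(4, 4), chunks=(2, 2), dtype="i8", overwrite=True)
    print(root.tree())
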
@@ -1507,21 +1354,10 @@ def open_group( """ # handle polymorphic store arg - store = _normalize_store_arg( - store, storage_options=storage_options, mode=mode, zarr_version=zarr_version - ) - if zarr_version is None: - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) - - if zarr_version != 2: - assert_zarr_v3_api_available() + store = _normalize_store_arg(store, storage_options=storage_options, mode=mode) if chunk_store is not None: - chunk_store = _normalize_store_arg( - chunk_store, storage_options=storage_options, mode=mode, zarr_version=zarr_version - ) - if getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) != zarr_version: - raise ValueError("zarr_version of store and chunk_store must match") # pragma: no cover + chunk_store = _normalize_store_arg(chunk_store, storage_options=storage_options, mode=mode) path = normalize_storage_path(path) @@ -1560,6 +1396,5 @@ def open_group( synchronizer=synchronizer, path=path, chunk_store=chunk_store, - zarr_version=zarr_version, meta_array=meta_array, ) diff --git a/src/zarr/meta.py b/src/zarr/meta.py index 34a4f33d1e..50236fc7a0 100644 --- a/src/zarr/meta.py +++ b/src/zarr/meta.py @@ -1,33 +1,22 @@ import base64 import itertools from collections.abc import Mapping -from typing_extensions import deprecated -import numcodecs import numpy as np -from numcodecs.abc import Codec from zarr.errors import MetadataError from zarr.util import json_dumps, json_loads -from typing import cast, Union, Any, List, Mapping as MappingType, Optional, TYPE_CHECKING +from typing import cast, Union, Any, List, Mapping as MappingType, TYPE_CHECKING if TYPE_CHECKING: # pragma: no cover - from zarr._storage.store import StorageTransformer + pass ZARR_FORMAT = 2 -ZARR_FORMAT_v3 = 3 # FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} -_default_entry_point_metadata_v3 = { - "zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0", - "metadata_encoding": "https://purl.org/zarr/spec/protocol/core/3.0", - "metadata_key_suffix": ".json", - "extensions": [], -} - _v3_core_types = set("".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8"))) _v3_core_types = {"bool", "i1", "u1"} | _v3_core_types @@ -302,274 +291,6 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return v -@deprecated( - "This implementation of Zarr V3 is out of date and will be supplanted in zarr-python 3.0" -) -class Metadata3(Metadata2): - ZARR_FORMAT = ZARR_FORMAT_v3 - - @classmethod - def decode_dtype(cls, d, validate=True): - if isinstance(d, dict): - # extract the type from the extension info - try: - d = d["type"] - except KeyError: - raise KeyError("Extended dtype info must provide a key named 'type'.") - d = cls._decode_dtype_descr(d) - dtype = np.dtype(d) - if validate: - if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): - # it is a core dtype of the v3 spec - pass - else: - # will raise if this is not a recognized extended dtype - get_extended_dtype_info(dtype) - return dtype - - @classmethod - def encode_dtype(cls, d): - s = d.str - if s == "|b1": - return "bool" - elif s == "|u1": - return "u1" - elif s == "|i1": - return "i1" - elif s in _v3_core_types: - return Metadata2.encode_dtype(d) - else: - # Check if this dtype corresponds to a supported extension to - # the v3 protocol. 
- return get_extended_dtype_info(np.dtype(d)) - - @classmethod - def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - meta = cls.parse_metadata(s) - # 1 / 0 - # # check metadata format version - # zarr_format = meta.get("zarr_format", None) - # if zarr_format != cls.ZARR_FORMAT: - # raise MetadataError("unsupported zarr format: %s" % zarr_format) - - assert "attributes" in meta - # meta = dict(attributes=meta['attributes']) - return meta - - # return json.loads(s) - - @classmethod - def encode_group_metadata(cls, meta=None) -> bytes: - # The ZARR_FORMAT should not be in the group metadata, but in the - # entry point metadata instead - # meta = dict(zarr_format=cls.ZARR_FORMAT) - if meta is None: - meta = {"attributes": {}} - meta = dict(attributes=meta.get("attributes", {})) - return json_dumps(meta) - - @classmethod - def encode_hierarchy_metadata(cls, meta=None) -> bytes: - if meta is None: - meta = _default_entry_point_metadata_v3 - elif set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", - }: - raise ValueError(f"Unexpected keys in metadata. meta={meta}") - return json_dumps(meta) - - @classmethod - def decode_hierarchy_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - meta = cls.parse_metadata(s) - # check metadata format - # zarr_format = meta.get("zarr_format", None) - # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": - # raise MetadataError("unsupported zarr format: %s" % zarr_format) - if set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", - }: - raise ValueError(f"Unexpected keys in metadata. meta={meta}") - return meta - - @classmethod - def _encode_codec_metadata(cls, codec: Codec) -> Optional[Mapping]: - if codec is None: - return None - - # only support gzip for now - config = codec.get_config() - del config["id"] - uri = "https://purl.org/zarr/spec/codec/" - if isinstance(codec, numcodecs.GZip): - uri = uri + "gzip/1.0" - elif isinstance(codec, numcodecs.Zlib): - uri = uri + "zlib/1.0" - elif isinstance(codec, numcodecs.Blosc): - uri = uri + "blosc/1.0" - elif isinstance(codec, numcodecs.BZ2): - uri = uri + "bz2/1.0" - elif isinstance(codec, numcodecs.LZ4): - uri = uri + "lz4/1.0" - elif isinstance(codec, numcodecs.LZMA): - uri = uri + "lzma/1.0" - meta = { - "codec": uri, - "configuration": config, - } - return meta - - @classmethod - def _decode_codec_metadata(cls, meta: Optional[Mapping]) -> Optional[Codec]: - if meta is None: - return None - - uri = "https://purl.org/zarr/spec/codec/" - conf = meta["configuration"] - if meta["codec"].startswith(uri + "gzip/"): - conf["id"] = "gzip" - elif meta["codec"].startswith(uri + "zlib/"): - conf["id"] = "zlib" - elif meta["codec"].startswith(uri + "blosc/"): - conf["id"] = "blosc" - elif meta["codec"].startswith(uri + "bz2/"): - conf["id"] = "bz2" - elif meta["codec"].startswith(uri + "lz4/"): - conf["id"] = "lz4" - elif meta["codec"].startswith(uri + "lzma/"): - conf["id"] = "lzma" - else: - raise NotImplementedError - - codec = numcodecs.get_codec(conf) - - return codec - - @classmethod - def _encode_storage_transformer_metadata( - cls, storage_transformer: "StorageTransformer" - ) -> Optional[Mapping]: - return { - "extension": storage_transformer.extension_uri, - "type": storage_transformer.type, - "configuration": storage_transformer.get_config(), - } - - @classmethod - def _decode_storage_transformer_metadata(cls, meta: Mapping) -> 
"StorageTransformer": - from zarr._storage.v3_storage_transformers import ( - ShardingStorageTransformer, - DummyStorageTransfomer, - ) - - # This might be changed to a proper registry in the future - KNOWN_STORAGE_TRANSFORMERS = [DummyStorageTransfomer, ShardingStorageTransformer] - - conf = meta.get("configuration", {}) - extension_uri = meta["extension"] - transformer_type = meta["type"] - - for StorageTransformerCls in KNOWN_STORAGE_TRANSFORMERS: - if StorageTransformerCls.extension_uri == extension_uri: - break - else: # pragma: no cover - raise NotImplementedError - - return StorageTransformerCls.from_config(transformer_type, conf) - - @classmethod - def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - meta = cls.parse_metadata(s) - - # extract array metadata fields - try: - dtype = cls.decode_dtype(meta["data_type"]) - if dtype.hasobject: - import numcodecs - - object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) - else: - object_codec = None - fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) - # TODO: remove dimension_separator? - - compressor = cls._decode_codec_metadata(meta.get("compressor", None)) - storage_transformers = meta.get("storage_transformers", ()) - storage_transformers = [ - cls._decode_storage_transformer_metadata(i) for i in storage_transformers - ] - extensions = meta.get("extensions", []) - meta = dict( - shape=tuple(meta["shape"]), - chunk_grid=dict( - type=meta["chunk_grid"]["type"], - chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), - separator=meta["chunk_grid"]["separator"], - ), - data_type=dtype, - fill_value=fill_value, - chunk_memory_layout=meta["chunk_memory_layout"], - attributes=meta["attributes"], - extensions=extensions, - ) - # compressor field should be absent when there is no compression - if compressor: - meta["compressor"] = compressor - if storage_transformers: - meta["storage_transformers"] = storage_transformers - - except Exception as e: - raise MetadataError("error decoding metadata: %s" % e) - else: - return meta - - @classmethod - def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: - dtype = meta["data_type"] - sdshape = () - if dtype.subdtype is not None: - dtype, sdshape = dtype.subdtype - dimension_separator = meta.get("dimension_separator") - if dtype.hasobject: - import numcodecs - - object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) - else: - object_codec = None - - compressor = cls._encode_codec_metadata(meta.get("compressor", None)) - storage_transformers = meta.get("storage_transformers", ()) - storage_transformers = [ - cls._encode_storage_transformer_metadata(i) for i in storage_transformers - ] - extensions = meta.get("extensions", []) - meta = dict( - shape=meta["shape"] + sdshape, - chunk_grid=dict( - type=meta["chunk_grid"]["type"], - chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), - separator=meta["chunk_grid"]["separator"], - ), - data_type=cls.encode_dtype(dtype), - fill_value=encode_fill_value(meta["fill_value"], dtype, object_codec), - chunk_memory_layout=meta["chunk_memory_layout"], - attributes=meta.get("attributes", {}), - extensions=extensions, - ) - if compressor: - meta["compressor"] = compressor - if dimension_separator: - meta["dimension_separator"] = dimension_separator - if storage_transformers: - meta["storage_transformers"] = storage_transformers - return json_dumps(meta) - - parse_metadata = Metadata2.parse_metadata decode_array_metadata = Metadata2.decode_array_metadata 
encode_array_metadata = Metadata2.encode_array_metadata diff --git a/src/zarr/storage.py b/src/zarr/storage.py index e7bd0c4cf4..a9efbe7071 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -35,7 +35,6 @@ import uuid import time -from numcodecs.abc import Codec from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context @@ -66,21 +65,15 @@ from zarr._storage.absstore import ABSStore # noqa: F401 from zarr._storage.store import ( # noqa: F401 - _get_hierarchy_metadata, - _get_metadata_suffix, _listdir_from_keys, _rename_from_keys, - _rename_metadata_v3, _rmdir_from_keys, - _rmdir_from_keys_v3, _path_to_prefix, _prefix_to_array_key, _prefix_to_group_key, array_meta_key, attrs_key, - data_root, group_meta_key, - meta_root, DEFAULT_ZARR_VERSION, BaseStore, Store, @@ -122,28 +115,10 @@ def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> b path = normalize_storage_path(path) prefix = _path_to_prefix(path) key = _prefix_to_group_key(store, prefix) - store_version = getattr(store, "_store_version", 2) - if store_version == 2 or explicit_only: - return key in store - else: - if key in store: - return True - # for v3, need to also handle implicit groups - - sfx = _get_metadata_suffix(store) # type: ignore - implicit_prefix = key.replace(".group" + sfx, "") - if not implicit_prefix.endswith("/"): - implicit_prefix += "/" - if store.list_prefix(implicit_prefix): # type: ignore - return True - return False + return key in store -def _normalize_store_arg_v2(store: Any, storage_options=None, mode="r") -> BaseStore: - # default to v2 store for backward compatibility - zarr_version = getattr(store, "_store_version", 2) - if zarr_version != 2: - raise ValueError("store must be a version 2 store") +def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore: if store is None: store = KVStore(dict()) return store @@ -180,38 +155,17 @@ def _normalize_store_arg_v2(store: Any, storage_options=None, mode="r") -> BaseS return store -def normalize_store_arg( - store: Any, storage_options=None, mode="r", *, zarr_version=None -) -> BaseStore: - if zarr_version is None: - # default to v2 store for backward compatibility - zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) - if zarr_version == 2: - normalize_store = _normalize_store_arg_v2 - elif zarr_version == 3: - from zarr._storage.v3 import _normalize_store_arg_v3 - - normalize_store = _normalize_store_arg_v3 - else: - raise ValueError("zarr_version must be either 2 or 3") - return normalize_store(store, storage_options, mode) - - def rmdir(store: StoreLike, path: Path = None): """Remove all items under the given path. 
If `store` provides a `rmdir` method, this will be called, otherwise will fall back to implementation via the `Store` interface.""" path = normalize_storage_path(path) - store_version = getattr(store, "_store_version", 2) if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore # pass through store.rmdir(path) # type: ignore else: # slow version, delete one key at a time - if store_version == 2: - _rmdir_from_keys(store, path) - else: - _rmdir_from_keys_v3(store, path) # type: ignore + _rmdir_from_keys(store, path) def rename(store: Store, src_path: Path, dst_path: Path): @@ -254,21 +208,10 @@ def _getsize(store: BaseStore, path: Path = None) -> int: else: path = "" if path is None else normalize_storage_path(path) size = 0 - store_version = getattr(store, "_store_version", 2) - if store_version == 3: - if path == "": - # have to list the root folders without trailing / in this case - members = store.list_prefix(data_root.rstrip("/")) # type: ignore - members += store.list_prefix(meta_root.rstrip("/")) # type: ignore - else: - members = store.list_prefix(data_root + path) # type: ignore - members += store.list_prefix(meta_root + path) # type: ignore - # also include zarr.json? - # members += ['zarr.json'] - else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - members = [prefix + k for k in members] + + members = listdir(store, path) + prefix = _path_to_prefix(path) + members = [prefix + k for k in members] for k in members: try: v = store[k] @@ -437,13 +380,8 @@ def init_array( path = normalize_storage_path(path) # ensure parent group initialized - store_version = getattr(store, "_store_version", 2) - if store_version < 3: - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) - if store_version == 3 and "zarr.json" not in store: - # initialize with default zarr.json entry level metadata - store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore + _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) if not compressor: # compatibility with legacy tests using compressor=[] @@ -482,50 +420,20 @@ def _init_array_metadata( dimension_separator=None, storage_transformers=(), ): - store_version = getattr(store, "_store_version", 2) - path = normalize_storage_path(path) # guard conditions if overwrite: - if store_version == 2: - # attempt to delete any pre-existing array in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - else: - group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) - array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) - data_prefix = data_root + _path_to_prefix(path) - - # attempt to delete any pre-existing array in store - if array_meta_key in store: - store.erase(array_meta_key) # type: ignore - if group_meta_key in store: - store.erase(group_meta_key) # type: ignore - store.erase_prefix(data_prefix) # type: ignore - if chunk_store is not None: - chunk_store.erase_prefix(data_prefix) # type: ignore - - if "/" in path: - # path is a subfolder of an existing array, remove that array - parent_path = "/".join(path.split("/")[:-1]) - sfx = _get_metadata_suffix(store) # type: ignore - array_key = meta_root + parent_path + ".array" + sfx - if array_key in store: - store.erase(array_key) # type: ignore + # attempt to delete any pre-existing array in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) if not overwrite: if contains_array(store, path): raise 
ContainsArrayError(path) - elif contains_group(store, path, explicit_only=False): + if contains_group(store, path, explicit_only=False): raise ContainsGroupError(path) - elif store_version == 3: - if "/" in path: - # cannot create an array within an existing array path - parent_path = "/".join(path.split("/")[:-1]) - if contains_array(store, parent_path): - raise ContainsArrayError(path) # normalize metadata dtype, object_codec = normalize_dtype(dtype, object_codec) @@ -536,7 +444,7 @@ def _init_array_metadata( fill_value = normalize_fill_value(fill_value, dtype) # optional array metadata - if dimension_separator is None and store_version == 2: + if dimension_separator is None: dimension_separator = getattr(store, "_dimension_separator", None) dimension_separator = normalize_dimension_separator(dimension_separator) @@ -553,16 +461,10 @@ def _init_array_metadata( # obtain compressor config compressor_config = None if compressor: - if store_version == 2: - try: - compressor_config = compressor.get_config() - except AttributeError as e: - raise BadCompressorError(compressor) from e - elif not isinstance(compressor, Codec): - raise ValueError("expected a numcodecs Codec for compressor") - # TODO: alternatively, could autoconvert str to a Codec - # e.g. 'zlib' -> numcodec.Zlib object - # compressor = numcodecs.get_codec({'id': compressor}) + try: + compressor_config = compressor.get_config() + except AttributeError as e: + raise BadCompressorError(compressor) from e # obtain filters config if filters: @@ -596,33 +498,16 @@ def _init_array_metadata( filters_config = None # type: ignore # initialize metadata - # TODO: don't store redundant dimension_separator for v3? - _compressor = compressor_config if store_version == 2 else compressor + _compressor = compressor_config meta = dict( shape=shape, compressor=_compressor, fill_value=fill_value, dimension_separator=dimension_separator, ) - if store_version < 3: - meta.update(dict(chunks=chunks, dtype=dtype, order=order, filters=filters_config)) - assert not storage_transformers - else: - if dimension_separator is None: - dimension_separator = "/" - if filters_config: - attributes = {"filters": filters_config} - else: - attributes = {} - meta.update( - dict( - chunk_grid=dict(type="regular", chunk_shape=chunks, separator=dimension_separator), - chunk_memory_layout=order, - data_type=dtype, - attributes=attributes, - storage_transformers=storage_transformers, - ) - ) + + meta.update(dict(chunks=chunks, dtype=dtype, order=order, filters=filters_config)) + assert not storage_transformers key = _prefix_to_array_key(store, _path_to_prefix(path)) if hasattr(store, "_metadata_class"): @@ -661,24 +546,11 @@ def init_group( # normalize path path = normalize_storage_path(path) - store_version = getattr(store, "_store_version", 2) - if store_version < 3: - # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) - - if store_version == 3 and "zarr.json" not in store: - # initialize with default zarr.json entry level metadata - store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore + _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) # initialise metadata _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) - if store_version == 3: - # TODO: Should initializing a v3 group also create a corresponding - # empty folder under data/root/? 
I think probably not until there - # is actual data written there. - pass - def _init_group_metadata( store: StoreLike, @@ -686,50 +558,25 @@ def _init_group_metadata( path: Optional[str] = None, chunk_store: Optional[StoreLike] = None, ): - store_version = getattr(store, "_store_version", 2) path = normalize_storage_path(path) # guard conditions if overwrite: - if store_version == 2: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - else: - group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) - array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) - data_prefix = data_root + _path_to_prefix(path) - meta_prefix = meta_root + _path_to_prefix(path) - - # attempt to delete any pre-existing array in store - if array_meta_key in store: - store.erase(array_meta_key) # type: ignore - if group_meta_key in store: - store.erase(group_meta_key) # type: ignore - store.erase_prefix(data_prefix) # type: ignore - store.erase_prefix(meta_prefix) # type: ignore - if chunk_store is not None: - chunk_store.erase_prefix(data_prefix) # type: ignore + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) if not overwrite: if contains_array(store, path): raise ContainsArrayError(path) elif contains_group(store, path): raise ContainsGroupError(path) - elif store_version == 3 and "/" in path: - # cannot create a group overlapping with an existing array name - parent_path = "/".join(path.split("/")[:-1]) - if contains_array(store, parent_path): - raise ContainsArrayError(path) # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - if store_version == 3: - meta = {"attributes": {}} # type: ignore - else: - meta = {} # type: ignore + meta = {} # type: ignore key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, "_metadata_class"): store[key] = store._metadata_class.encode_group_metadata(meta) # type: ignore diff --git a/tests/test_attrs.py b/tests/test_attrs.py index 7e3377f664..2575163840 100644 --- a/tests/test_attrs.py +++ b/tests/test_attrs.py @@ -4,34 +4,24 @@ import pytest import zarr -from zarr._storage.store import meta_root from zarr.attrs import Attributes from zarr.storage import KVStore, DirectoryStore -from zarr._storage.v3 import KVStoreV3 -from .util import CountingDict, CountingDictV3 +from .util import CountingDict from zarr.hierarchy import group -@pytest.fixture(params=[2, 3]) -def zarr_version(request): - return request.param - - -def _init_store(version): - """Use a plain dict() for v2, but KVStoreV3 otherwise.""" - if version == 2: - return dict() - return KVStoreV3(dict()) +def _init_store(): + return dict() class TestAttributes: - def init_attributes(self, store, read_only=False, cache=True, zarr_version=2): - root = ".z" if zarr_version == 2 else meta_root + def init_attributes(self, store, read_only=False, cache=True): + root = ".z" return Attributes(store, key=root + "attrs", read_only=read_only, cache=cache) - def test_storage(self, zarr_version): - store = _init_store(zarr_version) - root = ".z" if zarr_version == 2 else meta_root + def test_storage(self): + store = _init_store() + root = ".z" attrs_key = root + "attrs" a = Attributes(store=store, key=attrs_key) assert isinstance(a.store, KVStore) @@ -44,11 +34,9 @@ def test_storage(self, zarr_version): assert attrs_key in store assert isinstance(store[attrs_key], bytes) d = 
json.loads(str(store[attrs_key], "utf-8")) - if zarr_version == 3: - d = d["attributes"] assert dict(foo="bar", baz=42) == d - def test_utf8_encoding(self, zarr_version): + def test_utf8_encoding(self): project_root = pathlib.Path(zarr.__file__).resolve().parent.parent fixdir = project_root / "fixture" testdir = fixdir / "utf8attrs" @@ -64,9 +52,9 @@ def test_utf8_encoding(self, zarr_version): fixture = group(store=DirectoryStore(str(fixdir))) assert fixture["utf8attrs"].attrs.asdict() == dict(foo="た") - def test_get_set_del_contains(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, zarr_version=zarr_version) + def test_get_set_del_contains(self): + store = _init_store() + a = self.init_attributes(store) assert "foo" not in a a["foo"] = "bar" a["baz"] = 42 @@ -80,9 +68,9 @@ def test_get_set_del_contains(self, zarr_version): # noinspection PyStatementEffect a["foo"] - def test_update_put(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, zarr_version=zarr_version) + def test_update_put(self): + store = _init_store() + a = self.init_attributes(store) assert "foo" not in a assert "bar" not in a assert "baz" not in a @@ -97,9 +85,9 @@ def test_update_put(self, zarr_version): assert a["bar"] == 84 assert "baz" not in a - def test_iterators(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, zarr_version=zarr_version) + def test_iterators(self): + store = _init_store() + a = self.init_attributes(store) assert 0 == len(a) assert set() == set(a) assert set() == set(a.keys()) @@ -115,15 +103,10 @@ def test_iterators(self, zarr_version): assert {"bar", 42} == set(a.values()) assert {("foo", "bar"), ("baz", 42)} == set(a.items()) - def test_read_only(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, read_only=True, zarr_version=zarr_version) - if zarr_version == 2: - store[".zattrs"] = json.dumps(dict(foo="bar", baz=42)).encode("ascii") - else: - store["meta/root/attrs"] = json.dumps(dict(attributes=dict(foo="bar", baz=42))).encode( - "ascii" - ) + def test_read_only(self): + store = _init_store() + a = self.init_attributes(store, read_only=True) + store[".zattrs"] = json.dumps(dict(foo="bar", baz=42)).encode("ascii") assert a["foo"] == "bar" assert a["baz"] == 42 with pytest.raises(PermissionError): @@ -133,9 +116,9 @@ def test_read_only(self, zarr_version): with pytest.raises(PermissionError): a.update(foo="quux") - def test_key_completions(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, zarr_version=zarr_version) + def test_key_completions(self): + store = _init_store() + a = self.init_attributes(store) d = a._ipython_key_completions_() assert "foo" not in d assert "123" not in d @@ -150,23 +133,20 @@ def test_key_completions(self, zarr_version): assert "asdf;" in d assert "baz" not in d - def test_caching_on(self, zarr_version): + def test_caching_on(self): # caching is turned on by default # setup store - store = CountingDict() if zarr_version == 2 else CountingDictV3() - attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" + store = CountingDict() + attrs_key = ".zattrs" assert 0 == store.counter["__getitem__", attrs_key] assert 0 == store.counter["__setitem__", attrs_key] - if zarr_version == 2: - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - else: - store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + 
store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") assert 0 == store.counter["__getitem__", attrs_key] assert 1 == store.counter["__setitem__", attrs_key] # setup attributes - a = self.init_attributes(store, zarr_version=zarr_version) + a = self.init_attributes(store) # test __getitem__ causes all attributes to be cached assert a["foo"] == "xxx" @@ -178,7 +158,7 @@ def test_caching_on(self, zarr_version): # test __setitem__ updates the cache a["foo"] = "yyy" - get_cnt = 2 if zarr_version == 2 else 3 + get_cnt = 2 assert get_cnt == store.counter["__getitem__", attrs_key] assert 2 == store.counter["__setitem__", attrs_key] assert a["foo"] == "yyy" @@ -187,7 +167,7 @@ def test_caching_on(self, zarr_version): # test update() updates the cache a.update(foo="zzz", bar=84) - get_cnt = 3 if zarr_version == 2 else 5 + get_cnt = 3 assert get_cnt == store.counter["__getitem__", attrs_key] assert 3 == store.counter["__setitem__", attrs_key] assert a["foo"] == "zzz" @@ -205,7 +185,7 @@ def test_caching_on(self, zarr_version): # test __delitem__ updates the cache del a["bar"] - get_cnt = 4 if zarr_version == 2 else 7 + get_cnt = 4 assert get_cnt == store.counter["__getitem__", attrs_key] assert 4 == store.counter["__setitem__", attrs_key] assert "bar" not in a @@ -213,35 +193,28 @@ def test_caching_on(self, zarr_version): assert 4 == store.counter["__setitem__", attrs_key] # test refresh() - if zarr_version == 2: - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - else: - store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") assert get_cnt == store.counter["__getitem__", attrs_key] a.refresh() - get_cnt = 5 if zarr_version == 2 else 8 + get_cnt = 5 assert get_cnt == store.counter["__getitem__", attrs_key] assert a["foo"] == "xxx" assert get_cnt == store.counter["__getitem__", attrs_key] assert a["bar"] == 42 assert get_cnt == store.counter["__getitem__", attrs_key] - def test_caching_off(self, zarr_version): + def test_caching_off(self): # setup store - store = CountingDict() if zarr_version == 2 else CountingDictV3() - attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" + store = CountingDict() + attrs_key = ".zattrs" assert 0 == store.counter["__getitem__", attrs_key] assert 0 == store.counter["__setitem__", attrs_key] - - if zarr_version == 2: - store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") - else: - store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") assert 0 == store.counter["__getitem__", attrs_key] assert 1 == store.counter["__setitem__", attrs_key] # setup attributes - a = self.init_attributes(store, cache=False, zarr_version=zarr_version) + a = self.init_attributes(store, cache=False) # test __getitem__ assert a["foo"] == "xxx" @@ -253,38 +226,38 @@ def test_caching_off(self, zarr_version): # test __setitem__ a["foo"] = "yyy" - get_cnt = 4 if zarr_version == 2 else 5 + get_cnt = 4 assert get_cnt == store.counter["__getitem__", attrs_key] assert 2 == store.counter["__setitem__", attrs_key] assert a["foo"] == "yyy" - get_cnt = 5 if zarr_version == 2 else 6 + get_cnt = 5 assert get_cnt == store.counter["__getitem__", attrs_key] assert 2 == store.counter["__setitem__", attrs_key] # test update() a.update(foo="zzz", bar=84) - get_cnt = 6 if zarr_version == 2 else 8 + get_cnt = 6 assert get_cnt == 
store.counter["__getitem__", attrs_key] assert 3 == store.counter["__setitem__", attrs_key] assert a["foo"] == "zzz" assert a["bar"] == 84 - get_cnt = 8 if zarr_version == 2 else 10 + get_cnt = 8 assert get_cnt == store.counter["__getitem__", attrs_key] assert 3 == store.counter["__setitem__", attrs_key] # test __contains__ assert "foo" in a - get_cnt = 9 if zarr_version == 2 else 11 + get_cnt = 9 assert get_cnt == store.counter["__getitem__", attrs_key] assert 3 == store.counter["__setitem__", attrs_key] assert "spam" not in a - get_cnt = 10 if zarr_version == 2 else 12 + get_cnt = 10 assert get_cnt == store.counter["__getitem__", attrs_key] assert 3 == store.counter["__setitem__", attrs_key] - def test_wrong_keys(self, zarr_version): - store = _init_store(zarr_version) - a = self.init_attributes(store, zarr_version=zarr_version) + def test_wrong_keys(self): + store = _init_store() + a = self.init_attributes(store) warning_msg = "only attribute keys of type 'string' will be allowed in the future" diff --git a/tests/test_convenience.py b/tests/test_convenience.py index 7cb4db7a35..d50533e847 100644 --- a/tests/test_convenience.py +++ b/tests/test_convenience.py @@ -27,53 +27,29 @@ from zarr.storage import ( ConsolidatedMetadataStore, FSStore, - KVStore, MemoryStore, atexit_rmtree, - data_root, - meta_root, getsize, ) -from zarr._storage.store import v3_api_available -from zarr._storage.v3 import ( - ConsolidatedMetadataStoreV3, - DirectoryStoreV3, - FSStoreV3, - KVStoreV3, - MemoryStoreV3, - SQLiteStoreV3, -) -from .util import have_fsspec - -_VERSIONS = (2, 3) if v3_api_available else (2,) - - -def _init_creation_kwargs(zarr_version): - kwargs = {"zarr_version": zarr_version} - if zarr_version == 3: - kwargs["path"] = "dataset" - return kwargs -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_open_array(path_type, zarr_version): +def test_open_array(path_type): store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) - kwargs = _init_creation_kwargs(zarr_version) # open array, create if doesn't exist - z = open(store, mode="a", shape=100, **kwargs) + z = open(store, mode="a", shape=100) assert isinstance(z, Array) assert z.shape == (100,) # open array, overwrite - z = open(store, mode="w", shape=200, **kwargs) + z = open(store, mode="w", shape=200) assert isinstance(z, Array) assert z.shape == (200,) # open array, read-only - z = open(store, mode="r", **kwargs) + z = open(store, mode="r") assert isinstance(z, Array) assert z.shape == (200,) assert z.read_only @@ -83,79 +59,46 @@ def test_open_array(path_type, zarr_version): open("doesnotexist", mode="r") -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_open_group(path_type, zarr_version): +def test_open_group(path_type): store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) - kwargs = _init_creation_kwargs(zarr_version) # open group, create if doesn't exist - g = open(store, mode="a", **kwargs) + g = open(store, mode="a") g.create_group("foo") assert isinstance(g, Group) assert "foo" in g # open group, overwrite - g = open(store, mode="w", **kwargs) + g = open(store, mode="w") assert isinstance(g, Group) assert "foo" not in g # open group, read-only - g = open(store, mode="r", **kwargs) + g = open(store, mode="r") assert isinstance(g, Group) assert g.read_only -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_save_errors(zarr_version): +def test_save_errors(): with pytest.raises(ValueError): # no arrays provided - 
save_group("data/group.zarr", zarr_version=zarr_version) + save_group("data/group.zarr") with pytest.raises(TypeError): # no array provided - save_array("data/group.zarr", zarr_version=zarr_version) + save_array("data/group.zarr") with pytest.raises(ValueError): # no arrays provided - save("data/group.zarr", zarr_version=zarr_version) - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -def test_zarr_v3_save_multiple_unnamed(): - x = np.ones(8) - y = np.zeros(8) - store = KVStoreV3(dict()) - # no path provided - save_group(store, x, y, path="dataset", zarr_version=3) - # names become arr_{i} for unnamed *args - assert data_root + "dataset/arr_0/c0" in store - assert data_root + "dataset/arr_1/c0" in store - assert meta_root + "dataset/arr_0.array.json" in store - assert meta_root + "dataset/arr_1.array.json" in store - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -def test_zarr_v3_save_errors(): - x = np.ones(8) - with pytest.raises(ValueError): - # no path provided - save_group("data/group.zr3", x, zarr_version=3) - with pytest.raises(ValueError): - # no path provided - save_array("data/group.zr3", x, zarr_version=3) - with pytest.raises(ValueError): - # no path provided - save("data/group.zr3", x, zarr_version=3) + save("data/group.zarr") -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_lazy_loader(zarr_version): +def test_lazy_loader(): foo = np.arange(100) bar = np.arange(100, 0, -1) - store = "data/group.zarr" if zarr_version == 2 else "data/group.zr3" - kwargs = _init_creation_kwargs(zarr_version) - save(store, foo=foo, bar=bar, **kwargs) - loader = load(store, **kwargs) + store = "data/group.zarr" + save(store, foo=foo, bar=bar) + loader = load(store) assert "foo" in loader assert "bar" in loader assert "baz" not in loader @@ -166,18 +109,16 @@ def test_lazy_loader(zarr_version): assert "LazyLoader: " in repr(loader) -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_load_array(zarr_version): +def test_load_array(): foo = np.arange(100) bar = np.arange(100, 0, -1) - store = "data/group.zarr" if zarr_version == 2 else "data/group.zr3" - kwargs = _init_creation_kwargs(zarr_version) - save(store, foo=foo, bar=bar, **kwargs) + store = "data/group.zarr" + save(store, foo=foo, bar=bar) # can also load arrays directly into a numpy array for array_name in ["foo", "bar"]: - array_path = "dataset/" + array_name if zarr_version == 3 else array_name - array = load(store, path=array_path, zarr_version=zarr_version) + array_path = array_name + array = load(store, path=array_path) assert isinstance(array, np.ndarray) if array_name == "foo": assert_array_equal(foo, array) @@ -185,10 +126,8 @@ def test_load_array(zarr_version): assert_array_equal(bar, array) -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_tree(zarr_version): - kwargs = _init_creation_kwargs(zarr_version) - g1 = zarr.group(**kwargs) +def test_tree(): + g1 = zarr.group() g1.create_group("foo") g3 = g1.create_group("bar") g3.create_group("baz") @@ -198,16 +137,13 @@ def test_tree(zarr_version): assert str(zarr.tree(g1)) == str(g1.tree()) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("stores_from_path", [False, True]) @pytest.mark.parametrize( "with_chunk_store,listable", [(False, True), (True, True), (False, False)], ids=["default-listable", "with_chunk_store-listable", "default-unlistable"], ) -def test_consolidate_metadata( - with_chunk_store, zarr_version, listable, monkeypatch, stores_from_path -): +def 
test_consolidate_metadata(with_chunk_store, listable, monkeypatch, stores_from_path): # setup initial data if stores_from_path: store = tempfile.mkdtemp() @@ -217,17 +153,11 @@ def test_consolidate_metadata( atexit.register(atexit_rmtree, chunk_store) else: chunk_store = None - version_kwarg = {"zarr_version": zarr_version} else: - if zarr_version == 2: - store = MemoryStore() - chunk_store = MemoryStore() if with_chunk_store else None - elif zarr_version == 3: - store = MemoryStoreV3() - chunk_store = MemoryStoreV3() if with_chunk_store else None - version_kwarg = {} - path = "dataset" if zarr_version == 3 else None - z = group(store, chunk_store=chunk_store, path=path, **version_kwarg) + store = MemoryStore() + chunk_store = MemoryStore() if with_chunk_store else None + path = None + z = group(store, chunk_store=chunk_store, path=path) # Reload the actual store implementation in case str store_to_copy = z.store @@ -248,41 +178,22 @@ def test_consolidate_metadata( else: store_class = store - if zarr_version == 3: - # error on v3 if path not provided - with pytest.raises(ValueError): - consolidate_metadata(store_class, path=None) - - with pytest.raises(ValueError): - consolidate_metadata(store_class, path="") - # perform consolidation out = consolidate_metadata(store_class, path=path) assert isinstance(out, Group) assert ["g1", "g2"] == list(out) if not stores_from_path: - if zarr_version == 2: - assert isinstance(out._store, ConsolidatedMetadataStore) - assert ".zmetadata" in store - meta_keys = [ - ".zgroup", - "g1/.zgroup", - "g2/.zgroup", - "g2/.zattrs", - "g2/arr/.zarray", - "g2/arr/.zattrs", - ] - else: - assert isinstance(out._store, ConsolidatedMetadataStoreV3) - assert "meta/root/consolidated/.zmetadata" in store - meta_keys = [ - "zarr.json", - meta_root + "dataset.group.json", - meta_root + "dataset/g1.group.json", - meta_root + "dataset/g2.group.json", - meta_root + "dataset/g2/arr.array.json", - "meta/root/consolidated.group.json", - ] + assert isinstance(out._store, ConsolidatedMetadataStore) + assert ".zmetadata" in store + meta_keys = [ + ".zgroup", + "g1/.zgroup", + "g2/.zgroup", + "g2/.zattrs", + "g2/arr/.zarray", + "g2/arr/.zattrs", + ] + for key in meta_keys: del store[key] @@ -293,11 +204,7 @@ def test_consolidate_metadata( monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir", lambda x, y: False) monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls") fs = fs_memory.MemoryFileSystem() - if zarr_version == 2: - store_to_open = FSStore("", fs=fs) - else: - store_to_open = FSStoreV3("", fs=fs) - + store_to_open = FSStore("", fs=fs) # copy original store to new unlistable store store_to_open.update(store_to_copy) @@ -305,7 +212,7 @@ def test_consolidate_metadata( store_to_open = store # open consolidated - z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path, **version_kwarg) + z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path) assert ["g1", "g2"] == list(z2) assert "world" == z2.g2.attrs["hello"] assert 1 == z2.g2.arr.attrs["data"] @@ -320,26 +227,17 @@ def test_consolidate_metadata( if stores_from_path: # path string is note a BaseStore subclass so cannot be used to # initialize a ConsolidatedMetadataStore. 
- if zarr_version == 2: - with pytest.raises(ValueError): - cmd = ConsolidatedMetadataStore(store) - elif zarr_version == 3: - with pytest.raises(ValueError): - cmd = ConsolidatedMetadataStoreV3(store) + + with pytest.raises(ValueError): + cmd = ConsolidatedMetadataStore(store) else: # tests del/write on the store - if zarr_version == 2: - cmd = ConsolidatedMetadataStore(store) - with pytest.raises(PermissionError): - del cmd[".zgroup"] - with pytest.raises(PermissionError): - cmd[".zgroup"] = None - else: - cmd = ConsolidatedMetadataStoreV3(store) - with pytest.raises(PermissionError): - del cmd[meta_root + "dataset.group.json"] - with pytest.raises(PermissionError): - cmd[meta_root + "dataset.group.json"] = None + + cmd = ConsolidatedMetadataStore(store) + with pytest.raises(PermissionError): + del cmd[".zgroup"] + with pytest.raises(PermissionError): + cmd[".zgroup"] = None # test getsize on the store assert isinstance(getsize(cmd), Integral) @@ -377,7 +275,6 @@ def test_consolidate_metadata( path=path, cache_attrs=True, synchronizer=None, - **version_kwarg, ) @@ -469,7 +366,7 @@ def test_excludes_includes(self): copy_store(source, dest, excludes=excludes) assert len(dest) == 2 - root = "" if self._version == 2 else meta_root + root = "" assert root + "foo" not in dest # multiple excludes @@ -500,7 +397,7 @@ def test_dry_run(self): def test_if_exists(self): source = self.source dest = self._get_dest_store() - root = "" if self._version == 2 else meta_root + root = "" dest[root + "bar/baz"] = b"mmm" # default ('raise') @@ -530,27 +427,6 @@ def test_if_exists(self): copy_store(source, dest, if_exists="foobar") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestCopyStoreV3(TestCopyStore): - _version = 3 - - def setUp(self): - source = KVStoreV3(dict()) - source["meta/root/foo"] = b"xxx" - source["meta/root/bar/baz"] = b"yyy" - source["meta/root/bar/qux"] = b"zzz" - self.source = source - - def _get_dest_store(self): - return KVStoreV3(dict()) - - def test_mismatched_store_versions(self): - # cannot copy between stores of mixed Zarr versions - dest = KVStore(dict()) - with pytest.raises(ValueError): - copy_store(self.source, dest) - - def check_copied_array(original, copied, without_attrs=False, expect_props=None): # setup source_h5py = original.__module__.startswith("h5py.") @@ -672,28 +548,6 @@ def test_copy_all(): assert destination_group.subgroup.attrs["info"] == "sub attrs" -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -def test_copy_all_v3(): - """ - https://github.com/zarr-developers/zarr-python/issues/269 - - copy_all used to not copy attributes as `.keys()` - - """ - original_group = zarr.group(store=MemoryStoreV3(), path="group1", overwrite=True) - original_group.create_group("subgroup") - - destination_group = zarr.group(store=MemoryStoreV3(), path="group2", overwrite=True) - - # copy from memory to directory store - copy_all( - original_group, - destination_group, - dry_run=False, - ) - assert "subgroup" in destination_group - - class TestCopy: @pytest.fixture(params=[False, True], ids=["zarr", "hdf5"]) def source(self, request, tmpdir): @@ -948,100 +802,3 @@ def test_logging(self, source, dest, tmpdir): # bad option with pytest.raises(TypeError): copy(source["foo"], dest, dry_run=True, log=True) - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestCopyV3(TestCopy): - @pytest.fixture(params=["zarr", "hdf5"]) - def source(self, request, tmpdir): - def prep_source(source): - foo = 
source.create_group("foo") - foo.attrs["experiment"] = "weird science" - baz = foo.create_dataset("bar/baz", data=np.arange(100), chunks=(50,)) - baz.attrs["units"] = "metres" - if request.param == "hdf5": - extra_kws = dict( - compression="gzip", - compression_opts=3, - fillvalue=84, - shuffle=True, - fletcher32=True, - ) - else: - extra_kws = dict(compressor=Zlib(3), order="F", fill_value=42, filters=[Adler32()]) - source.create_dataset( - "spam", - data=np.arange(100, 200).reshape(20, 5), - chunks=(10, 2), - dtype="i2", - **extra_kws, - ) - return source - - if request.param == "hdf5": - h5py = pytest.importorskip("h5py") - fn = tmpdir.join("source.h5") - with h5py.File(str(fn), mode="w") as h5f: - yield prep_source(h5f) - elif request.param == "zarr": - yield prep_source(group(path="group1", zarr_version=3)) - - # Test with various destination StoreV3 types as TestCopyV3 covers rmdir - destinations = ["hdf5", "zarr", "zarr_kvstore", "zarr_directorystore", "zarr_sqlitestore"] - if have_fsspec: - destinations += ["zarr_fsstore"] - - @pytest.fixture(params=destinations) - def dest(self, request, tmpdir): - if request.param == "hdf5": - h5py = pytest.importorskip("h5py") - fn = tmpdir.join("dest.h5") - with h5py.File(str(fn), mode="w") as h5f: - yield h5f - elif request.param == "zarr": - yield group(path="group2", zarr_version=3) - elif request.param == "zarr_kvstore": - store = KVStoreV3(dict()) - yield group(store, path="group2", zarr_version=3) - elif request.param == "zarr_fsstore": - fn = tmpdir.join("dest.zr3") - store = FSStoreV3(str(fn), auto_mkdir=True) - yield group(store, path="group2", zarr_version=3) - elif request.param == "zarr_directorystore": - fn = tmpdir.join("dest.zr3") - store = DirectoryStoreV3(str(fn)) - yield group(store, path="group2", zarr_version=3) - elif request.param == "zarr_sqlitestore": - fn = tmpdir.join("dest.db") - store = SQLiteStoreV3(str(fn)) - yield group(store, path="group2", zarr_version=3) - - def test_copy_array_create_options(self, source, dest): - dest_h5py = dest.__module__.startswith("h5py.") - - # copy array, provide creation options - compressor = Zlib(9) - create_kws = dict(chunks=(10,)) - if dest_h5py: - create_kws.update( - compression="gzip", compression_opts=9, shuffle=True, fletcher32=True, fillvalue=42 - ) - else: - # v3 case has no filters argument in zarr create_kws - create_kws.update(compressor=compressor, fill_value=42, order="F") - copy(source["foo/bar/baz"], dest, without_attrs=True, **create_kws) - check_copied_array( - source["foo/bar/baz"], dest["baz"], without_attrs=True, expect_props=create_kws - ) - - def test_copy_group_no_name(self, source, dest): - if source.__module__.startswith("h5py"): - with pytest.raises(TypeError): - copy(source, dest) - else: - # For v3, dest.name will be inferred from source.name - copy(source, dest) - check_copied_group(source, dest[source.name.lstrip("/")]) - - copy(source, dest, name="root") - check_copied_group(source, dest["root"]) diff --git a/tests/test_core.py b/tests/test_core.py index e8d527c4ef..6303371793 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -33,15 +33,9 @@ import zarr from zarr._storage.store import ( BaseStore, - v3_api_available, -) -from zarr._storage.v3_storage_transformers import ( - DummyStorageTransfomer, - ShardingStorageTransformer, - v3_sharding_available, ) + from zarr.core import Array -from zarr.errors import ArrayNotFoundError, ContainsGroupError from zarr.meta import json_loads from zarr.n5 import N5Store, N5FSStore, n5_keywords from 
zarr.storage import ( @@ -56,24 +50,10 @@ SQLiteStore, atexit_rmglob, atexit_rmtree, - data_root, init_array, init_group, - meta_root, normalize_store_arg, ) -from zarr._storage.v3 import ( - ABSStoreV3, - DBMStoreV3, - DirectoryStoreV3, - FSStoreV3, - KVStoreV3, - LMDBStoreV3, - LRUStoreCacheV3, - RmdirV3, - SQLiteStoreV3, - StoreV3, -) from zarr.util import buffer_size from .util import abs_container, skip_test_env_var, have_fsspec, mktemp @@ -82,7 +62,6 @@ class TestArray: - version = 2 root = "" path = "" compressor = Zlib(level=1) @@ -139,7 +118,7 @@ def test_array_init(self): # normal initialization store = self.create_store() init_array(store, shape=100, chunks=10, dtype=" 2: - # in v3, attributes are in a sub-dictionary of the metadata - attrs = attrs["attributes"] assert "foo" in attrs and attrs["foo"] == "bar" a.attrs["bar"] = "foo" assert a.attrs.key in a.store attrs = json_loads(a.store[a.attrs.key]) - if self.version > 2: - # in v3, attributes are in a sub-dictionary of the metadata - attrs = attrs["attributes"] assert "foo" in attrs and attrs["foo"] == "bar" assert "bar" in attrs and attrs["bar"] == "foo" a.store.close() @@ -2298,7 +2256,7 @@ def test_nbytes_stored(self): class TestArrayNoCache(TestArray): def test_cache_metadata(self): a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_metadata=False) - path = None if self.version == 2 else a1.path + path = None a2 = Array(a1.store, path=path, cache_metadata=True) assert a1.shape == a2.shape assert a1.size == a2.size @@ -2339,7 +2297,7 @@ def test_cache_metadata(self): def test_cache_attrs(self): a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_attrs=False) - path = None if self.version == 2 else "arr1" + path = None a2 = Array(a1.store, path=path, cache_attrs=True) assert a1.attrs.asdict() == a2.attrs.asdict() @@ -2460,7 +2418,7 @@ def test_read_nitems_less_than_blocksize_from_multiple_chunks(self): """ z = self.create_array(shape=1000000, chunks=100_000) z[40_000:80_000] = 1 - path = None if self.version == 2 else z.path + path = None b = Array(z.store, path=path, read_only=True, partial_decompress=True) assert (b[40_000:80_000] == 1).all() @@ -2470,7 +2428,7 @@ def test_read_from_all_blocks(self): """ z = self.create_array(shape=1000000, chunks=100_000) z[2:99_000] = 1 - path = None if self.version == 2 else z.path + path = None b = Array(z.store, path=path, read_only=True, partial_decompress=True) assert (b[2:99_000] == 1).all() @@ -2517,7 +2475,7 @@ def test_read_nitems_less_than_blocksize_from_multiple_chunks(self): """ z = self.create_array(shape=1000000, chunks=100_000) z[40_000:80_000] = 1 - path = None if self.version == 2 else z.path + path = None b = Array(z.store, path=path, read_only=True, partial_decompress=True) assert (b[40_000:80_000] == 1).all() @@ -2527,607 +2485,11 @@ def test_read_from_all_blocks(self): """ z = self.create_array(shape=1000000, chunks=100_000) z[2:99_000] = 1 - path = None if self.version == 2 else z.path + path = None b = Array(z.store, path=path, read_only=True, partial_decompress=True) assert (b[2:99_000] == 1).all() -#### -# StoreV3 test classes inheriting from the above below this point -#### - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayV3(TestArray): - version = 3 - root = meta_root - path = "arr1" - - def create_store(self): - return KVStoreV3(dict()) - - def expected(self): - # tests for array without path will not be run for v3 stores - assert self.version == 3 - return [ - 
"73ab8ace56719a5c9308c3754f5e2d57bc73dc20", - "5fb3d02b8f01244721582929b3cad578aec5cea5", - "26b098bedb640846e18dc2fbc1c27684bb02b532", - "799a458c287d431d747bec0728987ca4fe764549", - "c780221df84eb91cb62f633f12d3f1eaa9cee6bd", - ] - - # TODO: fix test_nbytes_stored - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithPathV3(TestArrayV3): - def test_array_init(self): - store = self.create_store() - # can initialize an array without a path - init_array(store, shape=100, chunks=10, dtype=" BaseStore: - path = mkdtemp() - atexit.register(shutil.rmtree, path) - return DirectoryStoreV3(path) - - def test_nbytes_stored(self): - # dict as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - - -@skip_test_env_var("ZARR_TEST_ABS") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithABSStoreV3(TestArrayV3): - def create_store(self) -> ABSStoreV3: - client = abs_container() - store = ABSStoreV3(client=client) - store.rmdir() - return store - - -# TODO: TestArrayWithN5StoreV3 -# class TestArrayWithN5StoreV3(TestArrayWithDirectoryStoreV3): - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithDBMStoreV3(TestArrayV3): - def create_store(self) -> DBMStoreV3: - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - store = DBMStoreV3(path, flag="n") - return store - - def test_nbytes_stored(self): - pass # not implemented - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithDBMStoreV3BerkeleyDB(TestArrayV3): - def create_store(self) -> DBMStoreV3: - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStoreV3(path, flag="n", open=bsddb3.btopen) - return store - - def test_nbytes_stored(self): - pass # not implemented - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithLMDBStoreV3(TestArrayV3): - lmdb_buffers = True - - def create_store(self) -> LMDBStoreV3: - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStoreV3(path, buffers=self.lmdb_buffers) - return store - - def test_store_has_bytes_values(self): - pass # returns values as memoryviews/buffers instead of bytes - - def test_nbytes_stored(self): - pass # not implemented - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithLMDBStoreV3NoBuffers(TestArrayWithLMDBStoreV3): - lmdb_buffers = False - - def test_nbytes_stored(self): - pass # not implemented - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithSQLiteStoreV3(TestArrayV3): - def create_store(self): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStoreV3(path) - return store - - def test_nbytes_stored(self): - pass # not implemented - - -# skipped adding V3 equivalents for compressors (no change in v3): -# TestArrayWithNoCompressor -# TestArrayWithBZ2Compressor -# TestArrayWithBloscCompressor -# TestArrayWithLZMACompressor - -# skipped test with filters (v3 protocol removed filters) -# TestArrayWithFilters - - -# 
custom store, does not support getsize() -# Note: this custom mapping doesn't actually have all methods in the -# v3 spec (e.g. erase), but they aren't needed here. - - -class CustomMappingV3(RmdirV3, StoreV3): - def __init__(self): - self.inner = KVStoreV3(dict()) - - def __iter__(self): - return iter(self.keys()) - - def __len__(self): - return len(self.inner) - - def keys(self): - return self.inner.keys() - - def values(self): - return self.inner.values() - - def get(self, item, default=None): - try: - return self.inner[item] - except KeyError: - return default - - def __getitem__(self, item): - return self.inner[item] - - def __setitem__(self, item, value): - self.inner[item] = ensure_bytes(value) - - def __delitem__(self, key): - del self.inner[key] - - def __contains__(self, item): - return item in self.inner - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithCustomMappingV3(TestArrayV3): - def create_store(self): - store = CustomMappingV3() - return store - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - - def test_len(self): - # dict as store - z = self.create_array(shape=1000, chunks=100) - assert len(z._store) == 2 - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayNoCacheV3(TestArrayWithPathV3): - def create_store(self): - store = KVStoreV3(dict()) - return store - - def test_object_arrays_danger(self): - # skip this one as it only works if metadata are cached - pass - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithStoreCacheV3(TestArrayV3): - def create_store(self): - store = LRUStoreCacheV3(dict(), max_size=None) - return store - - def test_store_has_bytes_values(self): - # skip as the cache has no control over how the store provides values - pass - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithFSStoreV3(TestArrayV3): - compressor = Blosc() - - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = self.dimension_separator - store = FSStoreV3( - path, - key_separator=key_separator, - auto_mkdir=True, - create=True, - check=True, - missing_exceptions=None, - ) - return store - - def expected(self): - return [ - "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", - "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", - "b663857bb89a8ab648390454954a9cdd453aa24b", - "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", - "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithFSStoreV3FromFilesystem(TestArrayWithFSStoreV3): - def create_store(self): - from fsspec.implementations.local import LocalFileSystem - - fs = LocalFileSystem(auto_mkdir=True) - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = self.dimension_separator - store = FSStoreV3( - path, - fs=fs, - key_separator=key_separator, - create=True, - check=True, - missing_exceptions=None, - ) - return store - - def expected(self): - return [ - 
"1509abec4285494b61cd3e8d21f44adc3cf8ddf6", - "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", - "b663857bb89a8ab648390454954a9cdd453aa24b", - "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", - "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithFSStoreV3PartialRead(TestArrayWithFSStoreV3): - partial_decompress = True - - def expected(self): - return [ - "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", - "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", - "b663857bb89a8ab648390454954a9cdd453aa24b", - "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", - "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -@pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") -class TestArrayWithFSStoreV3PartialReadUncompressedSharded(TestArrayWithFSStoreV3): - partial_decompress = True - compressor = None - - def create_storage_transformers(self, shape) -> Tuple[Any]: - num_dims = 1 if isinstance(shape, int) else len(shape) - sharding_transformer = ShardingStorageTransformer( - "indexed", chunks_per_shard=(2,) * num_dims - ) - return (sharding_transformer,) - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - - def test_supports_efficient_get_set_partial_values(self): - z = self.create_array(shape=100, chunks=10) - assert z.chunk_store.supports_efficient_get_partial_values - assert not z.chunk_store.supports_efficient_set_partial_values() - - def expected(self): - return [ - "90109fc2a4e17efbcb447003ea1c08828b91f71e", - "2b73519f7260dba3ddce0d2b70041888856fec6b", - "bca5798be2ed71d444f3045b05432d937682b7dd", - "9ff1084501e28520e577662a6e3073f1116c76a2", - "882a97cad42417f90f111d0cb916a21579650467", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithFSStoreV3Nested(TestArrayWithFSStoreV3): - dimension_separator = "/" - - def expected(self): - return [ - "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", - "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", - "b663857bb89a8ab648390454954a9cdd453aa24b", - "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", - "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithFSStoreV3NestedPartialRead(TestArrayWithFSStoreV3): - dimension_separator = "/" - - def expected(self): - return [ - "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", - "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", - "b663857bb89a8ab648390454954a9cdd453aa24b", - "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", - "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", - ] - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithStorageTransformersV3(TestArrayWithChunkStoreV3): - def create_storage_transformers(self, shape) -> Tuple[Any]: - return ( - DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT), - ) - - def 
expected(self): - return [ - "3fb9a4f8233b09ad02067b6b7fc9fd5caa405c7d", - "89c8eb364beb84919fc9153d2c1ed2696274ec18", - "73307055c3aec095dd1232c38d793ef82a06bd97", - "6152c09255a5efa43b1a115546e35affa00c138c", - "2f8802fc391f67f713302e84fad4fd8f1366d6c2", - ] - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -@pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") -class TestArrayWithShardingStorageTransformerV3(TestArrayV3): - compressor = None - - def create_storage_transformers(self, shape) -> Tuple[Any]: - num_dims = 1 if isinstance(shape, int) else len(shape) - return (ShardingStorageTransformer("indexed", chunks_per_shard=(2,) * num_dims),) - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") - assert expect_nbytes_stored == z.nbytes_stored - - # mess with store - z.store[data_root + z._key_prefix + "foo"] = list(range(10)) - assert -1 == z.nbytes_stored - - def test_keys_inner_store(self): - z = self.create_array(shape=1000, chunks=100) - assert z.chunk_store.keys() == z._store.keys() - meta_keys = set(z.store.keys()) - z[:] = 42 - assert len(z.chunk_store.keys() - meta_keys) == 10 - # inner store should have half the data keys, - # since chunks_per_shard is 2: - assert len(z._store.keys() - meta_keys) == 5 - - def test_supports_efficient_get_set_partial_values(self): - z = self.create_array(shape=100, chunks=10) - assert not z.chunk_store.supports_efficient_get_partial_values - assert not z.chunk_store.supports_efficient_set_partial_values() - - def expected(self): - return [ - "90109fc2a4e17efbcb447003ea1c08828b91f71e", - "2b73519f7260dba3ddce0d2b70041888856fec6b", - "bca5798be2ed71d444f3045b05432d937682b7dd", - "9ff1084501e28520e577662a6e3073f1116c76a2", - "882a97cad42417f90f111d0cb916a21579650467", - ] - - -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -def test_array_mismatched_store_versions(): - store_v3 = KVStoreV3(dict()) - store_v2 = KVStore(dict()) - - # separate chunk store - chunk_store_v2 = KVStore(dict()) - chunk_store_v3 = KVStoreV3(dict()) - - init_kwargs = dict(shape=100, chunks=10, dtype="""" diff --git a/tests/test_creation.py b/tests/test_creation.py index 27ce00bc8a..369d755700 100644 --- a/tests/test_creation.py +++ b/tests/test_creation.py @@ -7,8 +7,6 @@ import pytest from numpy.testing import assert_array_equal -from zarr._storage.store import DEFAULT_ZARR_VERSION -from zarr._storage.v3_storage_transformers import DummyStorageTransfomer from zarr.codecs import Zlib from zarr.core import Array from zarr.creation import ( @@ -28,14 +26,12 @@ from zarr.hierarchy import open_group from zarr.n5 import N5Store from zarr.storage import DirectoryStore, KVStore -from zarr._storage.store import v3_api_available -from zarr._storage.v3 import DirectoryStoreV3, KVStoreV3 from zarr.sync import ThreadSynchronizer from .util import mktemp, have_fsspec -_VERSIONS = (None, 2, 3) if v3_api_available else (None, 2) -_VERSIONS2 = (2, 3) if v3_api_available else (2,) +_VERSIONS = (None, 2) +_VERSIONS2 = (2,) # something bcolz-like @@ -64,25 +60,22 @@ def __getitem__(self, item): return self.data[item] -def _init_creation_kwargs(zarr_version, at_root=True): - kwargs = {"zarr_version": zarr_version} +def _init_creation_kwargs(at_root=True): + 
kwargs = {} if not at_root: kwargs["path"] = "array" return kwargs -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_array(zarr_version, at_root): - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version - kwargs = _init_creation_kwargs(zarr_version, at_root) +def test_array(at_root): + kwargs = _init_creation_kwargs(at_root) # with numpy array a = np.arange(100) z = array(a, chunks=10, **kwargs) assert a.shape == z.shape assert a.dtype == z.dtype - assert z._store._store_version == expected_zarr_version assert_array_equal(a, z[:]) # with array-like @@ -131,39 +124,35 @@ def test_array(zarr_version, at_root): assert np.dtype("i8") == z.dtype -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_empty(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) +def test_empty(at_root): + kwargs = _init_creation_kwargs(at_root) z = empty(100, chunks=10, **kwargs) assert (100,) == z.shape assert (10,) == z.chunks -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_zeros(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) +def test_zeros(at_root): + kwargs = _init_creation_kwargs(at_root) z = zeros(100, chunks=10, **kwargs) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.zeros(100), z[:]) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_ones(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) +def test_ones(at_root): + kwargs = _init_creation_kwargs(at_root) z = ones(100, chunks=10, **kwargs) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.ones(100), z[:]) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_full(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) +def test_full(at_root): + kwargs = _init_creation_kwargs(at_root) z = full(100, chunks=10, fill_value=42, dtype="i4", **kwargs) assert (100,) == z.shape assert (10,) == z.chunks @@ -174,10 +163,9 @@ def test_full(zarr_version, at_root): assert np.all(np.isnan(z[:])) -@pytest.mark.parametrize("zarr_version", [None, 2]) # TODO -def test_full_additional_dtypes(zarr_version): +def test_full_additional_dtypes(): """Test additional types that aren't part of the base v3 spec.""" - kwargs = _init_creation_kwargs(zarr_version) + kwargs = _init_creation_kwargs() # NaT z = full(100, chunks=10, fill_value="NaT", dtype="M8[s]", **kwargs) assert np.all(np.isnat(z[:])) @@ -209,11 +197,10 @@ def test_full_additional_dtypes(zarr_version): @pytest.mark.parametrize("dimension_separator", [".", "/", None]) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_open_array(zarr_version, at_root, dimension_separator): +def test_open_array(at_root, dimension_separator): store = "data/array.zarr" - kwargs = _init_creation_kwargs(zarr_version, at_root) + kwargs = _init_creation_kwargs(at_root) # mode == 'w' z = open_array( @@ -221,23 +208,19 @@ def test_open_array(zarr_version, at_root, dimension_separator): ) z[:] = 42 assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + + assert isinstance(z.store, 
DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) if dimension_separator is None: - assert z._dimension_separator == "/" if zarr_version == 3 else "." + assert z._dimension_separator == "." else: assert z._dimension_separator == dimension_separator # mode in 'r', 'r+' group_kwargs = kwargs.copy() - if zarr_version == 3: - group_kwargs["path"] = "group" open_group("data/group.zarr", mode="w", **group_kwargs) for mode in "r", "r+": with pytest.raises(ValueError): @@ -246,10 +229,7 @@ def test_open_array(zarr_version, at_root, dimension_separator): open_array("data/group.zarr", mode=mode) z = open_array(store, mode="r", **kwargs) assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + assert isinstance(z.store, DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) @@ -257,10 +237,7 @@ def test_open_array(zarr_version, at_root, dimension_separator): z[:] = 43 z = open_array(store, mode="r+", **kwargs) assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + assert isinstance(z.store, DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) @@ -272,18 +249,12 @@ def test_open_array(zarr_version, at_root, dimension_separator): z = open_array(store, mode="a", shape=100, chunks=10, **kwargs) z[:] = 42 assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + assert isinstance(z.store, DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) - expected_error = TypeError if zarr_version == 3 else ValueError - # v3 path does not conflict, but will raise TypeError without shape kwarg - with pytest.raises(expected_error): - # array would end up at data/group.zarr/meta/root/array.array.json + with pytest.raises(ValueError): open_array("data/group.zarr", mode="a", **kwargs) # mode in 'w-', 'x' @@ -292,18 +263,14 @@ def test_open_array(zarr_version, at_root, dimension_separator): z = open_array(store, mode=mode, shape=100, chunks=10, **kwargs) z[:] = 42 assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + assert isinstance(z.store, DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) with pytest.raises(ValueError): open_array(store, mode=mode, **kwargs) - expected_error = TypeError if zarr_version == 3 else ValueError - # v3 path does not conflict, but will raise TypeError without shape kwarg - with pytest.raises(expected_error): + + with pytest.raises(ValueError): open_array("data/group.zarr", mode=mode, **kwargs) # with synchronizer @@ -327,21 +294,15 @@ def test_open_array(zarr_version, at_root, dimension_separator): def test_open_array_none(): - # open with both store and zarr_version = None + # open with store = None z = open_array(mode="w", shape=100, chunks=10) assert isinstance(z, Array) - assert z._version == 2 @pytest.mark.parametrize("dimension_separator", [".", "/", None]) -@pytest.mark.parametrize("zarr_version", _VERSIONS2) -def 
test_open_array_infer_separator_from_store(zarr_version, dimension_separator): - if zarr_version == 3: - StoreClass = DirectoryStoreV3 - path = "data" - else: - StoreClass = DirectoryStore - path = None +def test_open_array_infer_separator_from_store(dimension_separator): + StoreClass = DirectoryStore + path = None store = StoreClass("data/array.zarr", dimension_separator=dimension_separator) # Note: no dimension_separator kwarg to open_array @@ -349,25 +310,20 @@ def test_open_array_infer_separator_from_store(zarr_version, dimension_separator z = open_array(store, path=path, mode="w", shape=100, chunks=10) z[:] = 42 assert isinstance(z, Array) - if z._store._store_version == 2: - assert isinstance(z.store, DirectoryStore) - else: - assert isinstance(z.store, DirectoryStoreV3) + assert isinstance(z.store, DirectoryStore) assert (100,) == z.shape assert (10,) == z.chunks assert_array_equal(np.full(100, fill_value=42), z[:]) if dimension_separator is None: - assert z._dimension_separator == "/" if zarr_version == 3 else "." + assert z._dimension_separator == "." else: assert z._dimension_separator == dimension_separator -# TODO: N5 support for v3 -@pytest.mark.parametrize("zarr_version", [None, 2]) -def test_open_array_n5(zarr_version): +def test_open_array_n5(): store = "data/array.zarr" - kwargs = _init_creation_kwargs(zarr_version) + kwargs = _init_creation_kwargs() # for N5 store store = "data/array.n5" @@ -381,8 +337,6 @@ def test_open_array_n5(zarr_version): store = "data/group.n5" group_kwargs = kwargs.copy() - # if zarr_version == 3: - # group_kwargs['path'] = 'group' z = open_group(store, mode="w", **group_kwargs) i = z.create_group("inner") a = i.zeros("array", shape=100, chunks=10) @@ -401,13 +355,12 @@ def test_open_array_n5(zarr_version): assert_array_equal(np.full(100, fill_value=42), a[:]) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_open_array_dict_store(zarr_version, at_root): +def test_open_array_dict_store(at_root): # dict will become a KVStore store = dict() - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_store_type = KVStoreV3 if zarr_version == 3 else KVStore + kwargs = _init_creation_kwargs(at_root) + expected_store_type = KVStore # mode == 'w' z = open_array(store, mode="w", shape=100, chunks=10, **kwargs) @@ -419,11 +372,10 @@ def test_open_array_dict_store(zarr_version, at_root): assert_array_equal(np.full(100, fill_value=42), z[:]) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_create_in_dict(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_store_type = KVStoreV3 if zarr_version == 3 else KVStore +def test_create_in_dict(at_root): + kwargs = _init_creation_kwargs(at_root) + expected_store_type = KVStore for func in [empty, zeros, ones]: a = func(100, store=dict(), **kwargs) @@ -434,27 +386,23 @@ def test_create_in_dict(zarr_version, at_root): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_create_writeable_mode(zarr_version, at_root, tmp_path): +def test_create_writeable_mode(at_root, tmp_path): # Regression test for https://github.com/zarr-developers/zarr-python/issues/1306 import fsspec - kwargs = _init_creation_kwargs(zarr_version, at_root) + kwargs = _init_creation_kwargs(at_root) store = fsspec.get_mapper(str(tmp_path)) z = create(100, 
store=store, **kwargs) assert z.store.map == store -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_empty_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version +def test_empty_like(at_root): + kwargs = _init_creation_kwargs(at_root) # zarr array z = empty(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) - # zarr_version will be inferred from z, but have to specify a path in v3 z2 = empty_like(z, path=kwargs.get("path")) assert z.shape == z2.shape assert z.chunks == z2.chunks @@ -462,7 +410,6 @@ def test_empty_like(zarr_version, at_root): assert z.compressor.get_config() == z2.compressor.get_config() assert z.fill_value == z2.fill_value assert z.order == z2.order - assert z._store._store_version == z2._store._store_version == expected_zarr_version # numpy array a = np.empty(100, dtype="f4") @@ -471,7 +418,6 @@ def test_empty_like(zarr_version, at_root): assert (100,) == z3.chunks assert a.dtype == z3.dtype assert z3.fill_value is None - assert z3._store._store_version == expected_zarr_version # something slightly silly a = [0] * 100 @@ -494,11 +440,9 @@ def test_empty_like(zarr_version, at_root): assert isinstance(z.chunks, tuple) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_zeros_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version +def test_zeros_like(at_root): + kwargs = _init_creation_kwargs(at_root) # zarr array z = zeros(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) @@ -509,7 +453,7 @@ def test_zeros_like(zarr_version, at_root): assert z.compressor.get_config() == z2.compressor.get_config() assert z.fill_value == z2.fill_value assert z.order == z2.order - assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array a = np.empty(100, dtype="f4") z3 = zeros_like(a, chunks=10, **kwargs) @@ -519,11 +463,9 @@ def test_zeros_like(zarr_version, at_root): assert 0 == z3.fill_value -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_ones_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version +def test_ones_like(at_root): + kwargs = _init_creation_kwargs(at_root) # zarr array z = ones(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) @@ -534,7 +476,7 @@ def test_ones_like(zarr_version, at_root): assert z.compressor.get_config() == z2.compressor.get_config() assert z.fill_value == z2.fill_value assert z.order == z2.order - assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array a = np.empty(100, dtype="f4") z3 = ones_like(a, chunks=10, **kwargs) @@ -542,14 +484,11 @@ def test_ones_like(zarr_version, at_root): assert (10,) == z3.chunks assert a.dtype == z3.dtype assert 1 == z3.fill_value - assert z3._store._store_version == expected_zarr_version -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_full_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version 
is None else zarr_version +def test_full_like(at_root): + kwargs = _init_creation_kwargs(at_root) z = full(100, chunks=10, dtype="f4", compressor=Zlib(5), fill_value=42, order="F", **kwargs) z2 = full_like(z, path=kwargs.get("path")) @@ -559,7 +498,7 @@ def test_full_like(zarr_version, at_root): assert z.compressor.get_config() == z2.compressor.get_config() assert z.fill_value == z2.fill_value assert z.order == z2.order - assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array a = np.empty(100, dtype="f4") z3 = full_like(a, chunks=10, fill_value=42, **kwargs) @@ -567,17 +506,15 @@ def test_full_like(zarr_version, at_root): assert (10,) == z3.chunks assert a.dtype == z3.dtype assert 42 == z3.fill_value - assert z3._store._store_version == expected_zarr_version + with pytest.raises(TypeError): # fill_value missing full_like(a, chunks=10, **kwargs) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_open_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version +def test_open_like(at_root): + kwargs = _init_creation_kwargs(at_root) # zarr array path = mktemp() @@ -590,24 +527,21 @@ def test_open_like(zarr_version, at_root): assert z.compressor.get_config() == z2.compressor.get_config() assert z.fill_value == z2.fill_value assert z.order == z2.order - assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array path = mktemp() atexit.register(shutil.rmtree, path) a = np.empty(100, dtype="f4") - z3 = open_like(a, path, chunks=10, zarr_version=zarr_version) + z3 = open_like(a, path, chunks=10) assert a.shape == z3.shape assert (10,) == z3.chunks assert a.dtype == z3.dtype assert 0 == z3.fill_value - assert z3._store._store_version == expected_zarr_version -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_create(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version +def test_create(at_root): + kwargs = _init_creation_kwargs(at_root) # defaults z = create(100, **kwargs) @@ -617,7 +551,6 @@ def test_create(zarr_version, at_root): assert np.dtype(None) == z.dtype assert "blosc" == z.compressor.codec_id assert 0 == z.fill_value - assert z._store._store_version == expected_zarr_version # all specified z = create(100, chunks=10, dtype="i4", compressor=Zlib(1), fill_value=42, order="F", **kwargs) @@ -629,7 +562,6 @@ def test_create(zarr_version, at_root): assert 1 == z.compressor.level assert 42 == z.fill_value assert "F" == z.order - assert z._store._store_version == expected_zarr_version # with synchronizer synchronizer = ThreadSynchronizer() @@ -638,7 +570,6 @@ def test_create(zarr_version, at_root): assert (100,) == z.shape assert (10,) == z.chunks assert synchronizer is z.synchronizer - assert z._store._store_version == expected_zarr_version # don't allow string as compressor arg with pytest.raises(ValueError): @@ -671,9 +602,8 @@ def test_create(zarr_version, at_root): assert z.chunks == z.shape -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_compression_args(zarr_version): - kwargs = _init_creation_kwargs(zarr_version) +def test_compression_args(): + kwargs = _init_creation_kwargs() with warnings.catch_warnings(): warnings.simplefilter("default") @@ 
-704,12 +634,11 @@ def test_compression_args(zarr_version): create(100, compressor=Zlib(9), compression_opts=1, **kwargs) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_create_read_only(zarr_version, at_root): +def test_create_read_only(at_root): # https://github.com/alimanfoo/zarr/issues/151 - kwargs = _init_creation_kwargs(zarr_version, at_root) + kwargs = _init_creation_kwargs(at_root) # create an array initially read-only, then enable writing z = create(100, read_only=True, **kwargs) @@ -738,18 +667,6 @@ def test_json_dumps_chunks_numpy_dtype(): assert np.all(z[...] == 0) -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -@pytest.mark.parametrize("at_root", [False, True]) -def test_create_with_storage_transformers(at_root): - kwargs = _init_creation_kwargs(zarr_version=3, at_root=at_root) - transformer = DummyStorageTransfomer( - "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT - ) - z = create(1000000000, chunks=True, storage_transformers=[transformer], **kwargs) - assert isinstance(z.chunk_store, DummyStorageTransfomer) - assert z.chunk_store.test_value == DummyStorageTransfomer.TEST_CONSTANT - - @pytest.mark.parametrize( ("init_shape", "init_chunks", "shape", "chunks"), ( diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 6d4b1ff54c..8cd51cc940 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -18,7 +18,6 @@ from numcodecs import Zlib from numpy.testing import assert_array_equal -from zarr._storage.store import _get_metadata_suffix, v3_api_available from zarr.attrs import Attributes from zarr.core import Array from zarr.creation import open_array @@ -38,29 +37,13 @@ array_meta_key, atexit_rmglob, atexit_rmtree, - data_root, group_meta_key, init_array, init_group, - meta_root, ) -from zarr._storage.v3 import ( - ABSStoreV3, - KVStoreV3, - DirectoryStoreV3, - MemoryStoreV3, - FSStoreV3, - ZipStoreV3, - DBMStoreV3, - LMDBStoreV3, - SQLiteStoreV3, - LRUStoreCacheV3, -) -from zarr.util import InfoReporter, buffer_size -from .util import skip_test_env_var, have_fsspec, abs_container, mktemp - -_VERSIONS = (2, 3) if v3_api_available else (2,) +from zarr.util import InfoReporter +from .util import skip_test_env_var, have_fsspec, abs_container, mktemp # noinspection PyStatementEffect @@ -148,10 +131,7 @@ def _subgroup_path(self, group, path): def test_create_group(self): g1 = self.create_group() - if g1._version == 2: - path, name = "", "/" - else: - path, name = "group", "/group" + path, name = "", "/" # check root group assert path == g1.path assert name == g1.name @@ -205,12 +185,8 @@ def __str__(self): # test bad keys with pytest.raises(ValueError): g1.create_group("foo") # already exists - if g1._version == 2: - with pytest.raises(ValueError): - g1.create_group("a/b/c") # already exists - elif g1._version == 3: - # for v3 'group/a/b/c' does not already exist - g1.create_group("a/b/c") + with pytest.raises(ValueError): + g1.create_group("a/b/c") # already exists with pytest.raises(ValueError): g4.create_group("/a/b/c") # already exists with pytest.raises(ValueError): @@ -260,16 +236,7 @@ def test_require_group(self): assert g5.store is g5a.store # test path normalization - if g1._version == 2: - assert g1.require_group("quux") == g1.require_group("/quux/") - elif g1._version: - # These are not equal in v3! 
- # 'quux' will be within the group: - # meta/root/group/quux.group.json - # '/quux/' will be outside of the group at: - # meta/root/quux.group.json - assert g1.require_group("quux") != g1.require_group("/quux/") - + assert g1.require_group("quux") == g1.require_group("/quux/") # multi g6, g7 = g1.require_groups("y", "z") assert isinstance(g6, Group) @@ -289,24 +256,9 @@ def test_rmdir_group_and_array_metadata_files(self): g1.create_dataset("arr1", shape=(100,), chunks=(10,), dtype=np.uint8) # create level 1 child group - g2 = g1.create_group("foo") + _ = g1.create_group("foo") g1.create_dataset("arr2", shape=(100,), chunks=(10,), dtype=np.uint8) - if g1._version > 2 and g1.store.is_erasable(): - arr_path = g1.path + "/arr1" - sfx = _get_metadata_suffix(g1.store) - array_meta_file = meta_root + arr_path + ".array" + sfx - assert array_meta_file in g1.store - group_meta_file = meta_root + g2.path + ".group" + sfx - assert group_meta_file in g1.store - - # rmdir on the array path should also remove the metadata file - g1.store.rmdir(arr_path) - assert array_meta_file not in g1.store - # rmdir on the group path should also remove its metadata file - g1.store.rmdir(g2.path) - assert group_meta_file not in g1.store - def _dataset_path(self, group, path): path = path.rstrip("/") absolute = path.startswith("/") @@ -541,12 +493,9 @@ def test_getitem_contains_iterators(self): # setup g1 = self.create_group() g2 = g1.create_group("foo/bar") - if g1._version == 2: - d1 = g2.create_dataset("/a/b/c", shape=1000, chunks=100) - else: - # v3: cannot create a dataset at the root by starting with / - # instead, need to create the dataset on g1 directly - d1 = g1.create_dataset("a/b/c", shape=1000, chunks=100) + + d1 = g2.create_dataset("/a/b/c", shape=1000, chunks=100) + d1[:] = np.arange(1000) d2 = g1.create_dataset("foo/baz", shape=3000, chunks=300) d2[:] = np.arange(3000) @@ -555,13 +504,7 @@ def test_getitem_contains_iterators(self): assert isinstance(g1["foo"], Group) assert isinstance(g1["foo"]["bar"], Group) assert isinstance(g1["foo/bar"], Group) - if g1._version == 2: - assert isinstance(g1["/foo/bar/"], Group) - else: - # start or end with / raises KeyError - # TODO: should we allow stripping of these on v3? 
- with pytest.raises(KeyError): - assert isinstance(g1["/foo/bar/"], Group) + assert isinstance(g1["/foo/bar/"], Group) assert isinstance(g1["foo/baz"], Array) assert g2 == g1["foo/bar"] assert g1["foo"]["bar"] == g1["foo/bar"] @@ -604,18 +547,12 @@ def test_getitem_contains_iterators(self): # test __iter__, keys() - if g1._version == 2: - # currently assumes sorted by key - assert ["a", "foo"] == list(g1) - assert ["a", "foo"] == list(g1.keys()) - assert ["bar", "baz"] == list(g1["foo"]) - assert ["bar", "baz"] == list(g1["foo"].keys()) - else: - # v3 is not necessarily sorted by key - assert ["a", "foo"] == sorted(list(g1)) - assert ["a", "foo"] == sorted(list(g1.keys())) - assert ["bar", "baz"] == sorted(list(g1["foo"])) - assert ["bar", "baz"] == sorted(list(g1["foo"].keys())) + # currently assumes sorted by key + assert ["a", "foo"] == list(g1) + assert ["a", "foo"] == list(g1.keys()) + assert ["bar", "baz"] == list(g1["foo"]) + assert ["bar", "baz"] == list(g1["foo"].keys()) + assert [] == sorted(g1["foo/bar"]) assert [] == sorted(g1["foo/bar"].keys()) @@ -624,9 +561,6 @@ def test_getitem_contains_iterators(self): items = list(g1.items()) values = list(g1.values()) - if g1._version == 3: - # v3 are not automatically sorted by key - items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert "a" == items[0][0] assert g1["a"] == items[0][1] assert g1["a"] == values[0] @@ -636,9 +570,6 @@ def test_getitem_contains_iterators(self): items = list(g1["foo"].items()) values = list(g1["foo"].values()) - if g1._version == 3: - # v3 are not automatically sorted by key - items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert "bar" == items[0][0] assert g1["foo"]["bar"] == items[0][1] assert g1["foo"]["bar"] == values[0] @@ -650,13 +581,8 @@ def test_getitem_contains_iterators(self): groups = list(g1.groups()) arrays = list(g1.arrays()) - if g1._version == 2: - # currently assumes sorted by key - assert ["a", "foo"] == list(g1.group_keys()) - else: - assert ["a", "foo"] == sorted(list(g1.group_keys())) - groups = sorted(groups) - arrays = sorted(arrays) + # currently assumes sorted by key + assert ["a", "foo"] == list(g1.group_keys()) assert "a" == groups[0][0] assert g1["a"] == groups[0][1] assert "foo" == groups[1][0] @@ -668,9 +594,6 @@ def test_getitem_contains_iterators(self): assert ["baz"] == list(g1["foo"].array_keys()) groups = list(g1["foo"].groups()) arrays = list(g1["foo"].arrays()) - if g1._version == 3: - groups = sorted(groups) - arrays = sorted(arrays) assert "bar" == groups[0][0] assert g1["foo"]["bar"] == groups[0][1] assert "baz" == arrays[0][0] @@ -699,8 +622,6 @@ def visitor4(name, obj): "foo/bar", "foo/baz", ] - if g1._version == 3: - expected_items = [g1.path + "/" + i for i in expected_items] assert expected_items == items del items[:] @@ -709,8 +630,6 @@ def visitor4(name, obj): "foo/bar", "foo/baz", ] - if g1._version == 3: - expected_items = [g1.path + "/" + i for i in expected_items] assert expected_items == items del items[:] @@ -937,28 +856,10 @@ def test_move(self): g2.move("bar", "/bar") assert "foo2" in g assert "foo2/bar" not in g - if g2._version == 2: - assert "bar" in g - else: - # The `g2.move` call above moved bar to meta/root/bar and - # meta/data/bar. This is outside the `g` group located at - # /meta/root/group, so bar is no longer within `g`. 
- assert "bar" not in g - assert "meta/root/bar.array.json" in g._store - if g._chunk_store: - assert "data/root/bar/c0" in g._chunk_store - else: - assert "data/root/bar/c0" in g._store + assert "bar" in g assert isinstance(g["foo2"], Group) - if g2._version == 2: - assert_array_equal(data, g["bar"]) - else: - # TODO: How to access element created outside of group.path in v3? - # One option is to make a Hierarchy class representing the - # root. Currently Group requires specification of `path`, - # but the path of the root would be just '' which is not - # currently allowed. - pass + + assert_array_equal(data, g["bar"]) with pytest.raises(ValueError): g2.move("bar", "bar2") @@ -1035,39 +936,19 @@ def test_paths(self): g1 = self.create_group() g2 = g1.create_group("foo/bar") - if g1._version == 2: - assert g1 == g1["/"] - assert g1 == g1["//"] - assert g1 == g1["///"] - assert g1 == g2["/"] - assert g1 == g2["//"] - assert g1 == g2["///"] - assert g2 == g1["foo/bar"] - assert g2 == g1["/foo/bar"] - assert g2 == g1["foo/bar/"] - assert g2 == g1["//foo/bar"] - assert g2 == g1["//foo//bar//"] - assert g2 == g1["///foo///bar///"] - assert g2 == g2["/foo/bar"] - else: - # the expected key format gives a match - assert g2 == g1["foo/bar"] - - # TODO: Should presence of a trailing slash raise KeyError? - # The spec says "the final character is not a / character" - # but we currently strip trailing '/' as done for v2. - assert g2 == g1["foo/bar/"] - - # double slash also currently works (spec doesn't mention this - # case, but have kept it for v2 behavior compatibility) - assert g2 == g1["foo//bar"] - - # TODO, root: fix these cases - # v3: leading / implies we are at the root, not within a group, - # so these all raise KeyError - for path in ["/foo/bar", "//foo/bar", "//foo//bar//", "///fooo///bar///"]: - with pytest.raises(KeyError): - g1[path] + assert g1 == g1["/"] + assert g1 == g1["//"] + assert g1 == g1["///"] + assert g1 == g2["/"] + assert g1 == g2["//"] + assert g1 == g2["///"] + assert g2 == g1["foo/bar"] + assert g2 == g1["/foo/bar"] + assert g2 == g1["foo/bar/"] + assert g2 == g1["//foo/bar"] + assert g2 == g1["//foo//bar//"] + assert g2 == g1["///foo///bar///"] + assert g2 == g2["/foo/bar"] with pytest.raises(ValueError): g1["."] @@ -1133,77 +1014,12 @@ def test_group_init_from_dict(chunk_dict): assert chunk_store is not g.chunk_store -# noinspection PyStatementEffect -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3(TestGroup, unittest.TestCase): - @staticmethod - def create_store(): - # can be overridden in sub-classes - return KVStoreV3(dict()), None - - def create_group( - self, store=None, path="group", read_only=False, chunk_store=None, synchronizer=None - ): - # can be overridden in sub-classes - if store is None: - store, chunk_store = self.create_store() - init_group(store, path=path, chunk_store=chunk_store) - g = Group( - store, - path=path, - read_only=read_only, - chunk_store=chunk_store, - synchronizer=synchronizer, - ) - return g - - def test_group_init_1(self): - store, chunk_store = self.create_store() - g = self.create_group(store, chunk_store=chunk_store) - assert store is g.store - if chunk_store is None: - assert store is g.chunk_store - else: - assert chunk_store is g.chunk_store - assert not g.read_only - # different path/name in v3 case - assert "group" == g.path - assert "/group" == g.name - assert "group" == g.basename - - assert isinstance(g.attrs, Attributes) - g.attrs["foo"] = "bar" - assert g.attrs["foo"] == "bar" - - 
assert isinstance(g.info, InfoReporter) - assert isinstance(repr(g.info), str) - assert isinstance(g.info._repr_html_(), str) - store.close() - - def test_group_init_errors_2(self): - store, chunk_store = self.create_store() - path = "tmp" - init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) - # array blocks group - with pytest.raises(ValueError): - Group(store, path=path, chunk_store=chunk_store) - store.close() - - class TestGroupWithMemoryStore(TestGroup): @staticmethod def create_store(): return MemoryStore(), None -# noinspection PyStatementEffect -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithMemoryStore(TestGroupWithMemoryStore, TestGroupV3): - @staticmethod - def create_store(): - return MemoryStoreV3(), None - - class TestGroupWithDirectoryStore(TestGroup): @staticmethod def create_store(): @@ -1213,16 +1029,6 @@ def create_store(): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithDirectoryStore(TestGroupWithDirectoryStore, TestGroupV3): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = DirectoryStoreV3(path) - return store, None - - @skip_test_env_var("ZARR_TEST_ABS") class TestGroupWithABSStore(TestGroup): @staticmethod @@ -1238,22 +1044,6 @@ def test_pickle(self): super().test_pickle() -@skip_test_env_var("ZARR_TEST_ABS") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithABSStore(TestGroupV3): - @staticmethod - def create_store(): - container_client = abs_container() - store = ABSStoreV3(client=container_client) - store.rmdir() - return store, None - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - class TestGroupWithNestedDirectoryStore(TestGroup): @staticmethod def create_store(): @@ -1284,39 +1074,6 @@ def test_round_trip_nd(self): np.testing.assert_array_equal(h[name][:], data) -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithFSStore(TestGroupWithFSStore, TestGroupV3): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = FSStoreV3(path) - return store, None - - def test_round_trip_nd(self): - data = np.arange(1000).reshape(10, 10, 10) - name = "raw" - - store, _ = self.create_store() - f = open_group(store, path="group", mode="w") - f.create_dataset(name, data=data, chunks=(5, 5, 5), compressor=None) - h = open_group(store, path="group", mode="r") - np.testing.assert_array_equal(h[name][:], data) - - f = open_group(store, path="group2", mode="w") - - data_size = data.nbytes - group_meta_size = buffer_size(store[meta_root + "group.group.json"]) - group2_meta_size = buffer_size(store[meta_root + "group2.group.json"]) - array_meta_size = buffer_size(store[meta_root + "group/raw.array.json"]) - assert store.getsize() == data_size + group_meta_size + group2_meta_size + array_meta_size - # added case with path to complete coverage - assert store.getsize("group") == data_size + group_meta_size + array_meta_size - assert store.getsize("group2") == group2_meta_size - assert store.getsize("group/raw") == data_size + array_meta_size - - @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class 
TestGroupWithNestedFSStore(TestGroupWithFSStore): @staticmethod @@ -1340,30 +1097,6 @@ def test_inconsistent_dimension_separator(self): ) -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithNestedFSStore(TestGroupV3WithFSStore): - @staticmethod - def create_store(): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) - store = FSStoreV3(path, key_separator="/", auto_mkdir=True) - return store, None - - def test_inconsistent_dimension_separator(self): - data = np.arange(1000).reshape(10, 10, 10) - name = "raw" - - store, _ = self.create_store() - f = open_group(store, path="group", mode="w") - - # cannot specify dimension_separator that conflicts with the store - with pytest.raises(ValueError): - f.create_dataset( - name, data=data, chunks=(5, 5, 5), compressor=None, dimension_separator="." - ) - - class TestGroupWithZipStore(TestGroup): @staticmethod def create_store(): @@ -1389,16 +1122,6 @@ def test_move(self): pass -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithZipStore(TestGroupWithZipStore, TestGroupV3): - @staticmethod - def create_store(): - path = mktemp(suffix=".zip") - atexit.register(os.remove, path) - store = ZipStoreV3(path) - return store, None - - class TestGroupWithDBMStore(TestGroup): @staticmethod def create_store(): @@ -1408,16 +1131,6 @@ def create_store(): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithDBMStore(TestGroupWithDBMStore, TestGroupV3): - @staticmethod - def create_store(): - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - store = DBMStoreV3(path, flag="n") - return store, None - - class TestGroupWithDBMStoreBerkeleyDB(TestGroup): @staticmethod def create_store(): @@ -1428,17 +1141,6 @@ def create_store(): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithDBMStoreBerkeleyDB(TestGroupWithDBMStoreBerkeleyDB, TestGroupV3): - @staticmethod - def create_store(): - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStoreV3(path, flag="n", open=bsddb3.btopen) - return store, None - - class TestGroupWithLMDBStore(TestGroup): @staticmethod def create_store(): @@ -1449,17 +1151,6 @@ def create_store(): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithLMDBStore(TestGroupWithLMDBStore, TestGroupV3): - @staticmethod - def create_store(): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStoreV3(path) - return store, None - - class TestGroupWithSQLiteStore(TestGroup): def create_store(self): pytest.importorskip("sqlite3") @@ -1469,16 +1160,6 @@ def create_store(self): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithSQLiteStore(TestGroupWithSQLiteStore, TestGroupV3): - def create_store(self): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStoreV3(path) - return store, None - - class TestGroupWithChunkStore(TestGroup): @staticmethod def create_store(): @@ -1509,41 +1190,6 @@ def test_chunk_store(self): assert expect == actual -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class 
TestGroupV3WithChunkStore(TestGroupWithChunkStore, TestGroupV3): - @staticmethod - def create_store(): - return KVStoreV3(dict()), KVStoreV3(dict()) - - def test_chunk_store(self): - # setup - store, chunk_store = self.create_store() - path = "group1" - g = self.create_group(store, path=path, chunk_store=chunk_store) - - # check attributes - assert store is g.store - assert chunk_store is g.chunk_store - - # create array - a = g.zeros("foo", shape=100, chunks=10) - assert store is a.store - assert chunk_store is a.chunk_store - a[:] = np.arange(100) - assert_array_equal(np.arange(100), a[:]) - - # check store keys - group_key = meta_root + path + ".group.json" - array_key = meta_root + path + "/foo" + ".array.json" - expect = sorted([group_key, array_key, "zarr.json"]) - actual = sorted(store.keys()) - assert expect == actual - expect = [data_root + path + "/foo/c" + str(i) for i in range(10)] - expect += ["zarr.json"] - actual = sorted(chunk_store.keys()) - assert expect == actual - - class TestGroupWithStoreCache(TestGroup): @staticmethod def create_store(): @@ -1551,58 +1197,8 @@ def create_store(): return store, None -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestGroupV3WithStoreCache(TestGroupWithStoreCache, TestGroupV3): - @staticmethod - def create_store(): - store = LRUStoreCacheV3(dict(), max_size=None) - return store, None - - -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_group(zarr_version): - # test the group() convenience function - - # basic usage - if zarr_version == 2: - g = group() - assert "" == g.path - assert "/" == g.name - else: - g = group(path="group1", zarr_version=zarr_version) - assert "group1" == g.path - assert "/group1" == g.name - assert isinstance(g, Group) - - # usage with custom store - if zarr_version == 2: - store = KVStore(dict()) - path = None - else: - store = KVStoreV3(dict()) - path = "foo" - g = group(store=store, path=path) - assert isinstance(g, Group) - assert store is g.store - - # overwrite behaviour - if zarr_version == 2: - store = KVStore(dict()) - path = None - else: - store = KVStoreV3(dict()) - path = "foo" - init_array(store, path=path, shape=100, chunks=10) - with pytest.raises(ValueError): - group(store, path=path) - g = group(store, path=path, overwrite=True) - assert isinstance(g, Group) - assert store is g.store - - @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_group_writeable_mode(zarr_version, tmp_path): +def test_group_writeable_mode(tmp_path): # Regression test for https://github.com/zarr-developers/zarr-python/issues/1353 import fsspec @@ -1611,17 +1207,16 @@ def test_group_writeable_mode(zarr_version, tmp_path): assert zg.store.map == store -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_open_group(zarr_version): +def test_open_group(): # test the open_group() convenience function store = "data/group.zarr" - expected_store_type = DirectoryStore if zarr_version == 2 else DirectoryStoreV3 + expected_store_type = DirectoryStore # mode == 'w' - path = None if zarr_version == 2 else "group1" - g = open_group(store, path=path, mode="w", zarr_version=zarr_version) + path = None + g = open_group(store, path=path, mode="w") assert isinstance(g, Group) assert isinstance(g.store, expected_store_type) assert 0 == len(g) @@ -1648,44 +1243,39 @@ def test_open_group(zarr_version): # mode == 'a' shutil.rmtree(store) - g = open_group(store, path=path, mode="a", zarr_version=zarr_version) + g = 
open_group(store, path=path, mode="a") assert isinstance(g, Group) assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups("foo", "bar") assert 2 == len(g) - if zarr_version == 2: - with pytest.raises(ValueError): - open_group("data/array.zarr", mode="a", zarr_version=zarr_version) - else: - # TODO, root: should this raise an error? - open_group("data/array.zarr", mode="a", zarr_version=zarr_version) + + with pytest.raises(ValueError): + open_group("data/array.zarr", mode="a") # mode in 'w-', 'x' for mode in "w-", "x": shutil.rmtree(store) - g = open_group(store, path=path, mode=mode, zarr_version=zarr_version) + g = open_group(store, path=path, mode=mode) assert isinstance(g, Group) assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups("foo", "bar") assert 2 == len(g) with pytest.raises(ValueError): - open_group(store, path=path, mode=mode, zarr_version=zarr_version) - if zarr_version == 2: - with pytest.raises(ValueError): - open_group("data/array.zarr", mode=mode) + open_group(store, path=path, mode=mode) + with pytest.raises(ValueError): + open_group("data/array.zarr", mode=mode) # open with path - g = open_group(store, path="foo/bar", zarr_version=zarr_version) + g = open_group(store, path="foo/bar") assert isinstance(g, Group) assert "foo/bar" == g.path -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_group_completions(zarr_version): - path = None if zarr_version == 2 else "group1" - g = group(path=path, zarr_version=zarr_version) +def test_group_completions(): + path = None + g = group(path=path) d = dir(g) assert "foo" not in d assert "bar" not in d @@ -1713,10 +1303,9 @@ def test_group_completions(zarr_version): assert "456" not in d # not valid identifier -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_group_key_completions(zarr_version): - path = None if zarr_version == 2 else "group1" - g = group(path=path, zarr_version=zarr_version) +def test_group_key_completions(): + path = None + g = group(path=path) d = dir(g) # noinspection PyProtectedMember k = g._ipython_key_completions_() @@ -1750,12 +1339,7 @@ def test_group_key_completions(zarr_version): g.zeros("yyy", shape=100) g.zeros("zzz", shape=100) g.zeros("456", shape=100) - if zarr_version == 2: - g.zeros("asdf;", shape=100) - else: - # cannot have ; in key name for v3 - with pytest.raises(ValueError): - g.zeros("asdf;", shape=100) + g.zeros("asdf;", shape=100) d = dir(g) # noinspection PyProtectedMember @@ -1770,8 +1354,7 @@ def test_group_key_completions(zarr_version): assert "zzz" in d assert "123" not in d # not valid identifier assert "456" not in d # not valid identifier - if zarr_version == 2: - assert "asdf;" not in d # not valid identifier + assert "asdf;" not in d # not valid identifier assert "foo" in k assert "bar" in k @@ -1782,8 +1365,7 @@ def test_group_key_completions(zarr_version): assert "zzz" in k assert "123" in k assert "456" in k - if zarr_version == 2: - assert "asdf;" in k + assert "asdf;" in k def _check_tree(g, expect_bytes, expect_text): @@ -1797,12 +1379,11 @@ def _check_tree(g, expect_bytes, expect_text): isinstance(widget, ipytree.Tree) -@pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) -def test_tree(zarr_version, at_root): +def test_tree(at_root): # setup path = None if at_root else "group1" - g1 = group(path=path, zarr_version=zarr_version) + g1 = group(path=path) g2 = g1.create_group("foo") g3 = g1.create_group("bar") g3.create_group("baz") @@ -1811,46 
+1392,25 @@ def test_tree(zarr_version, at_root): tree_path = "/" if at_root else path # test root group - if zarr_version == 2: - expect_bytes = textwrap.dedent( - f"""\ - {tree_path} - +-- bar - | +-- baz - | +-- quux - | +-- baz (100,) float64 - +-- foo""" - ).encode() - expect_text = textwrap.dedent( - f"""\ - {tree_path} - ├── bar - │ ├── baz - │ └── quux - │ └── baz (100,) float64 - └── foo""" - ) - else: - # Almost the same as for v2, but has a path name and the - # subgroups are not necessarily sorted alphabetically. - expect_bytes = textwrap.dedent( - f"""\ - {tree_path} - +-- foo - +-- bar - +-- baz - +-- quux - +-- baz (100,) float64""" - ).encode() - expect_text = textwrap.dedent( - f"""\ - {tree_path} - ├── foo - └── bar - ├── baz - └── quux - └── baz (100,) float64""" - ) + + expect_bytes = textwrap.dedent( + f"""\ + {tree_path} + +-- bar + | +-- baz + | +-- quux + | +-- baz (100,) float64 + +-- foo""" + ).encode() + expect_text = textwrap.dedent( + f"""\ + {tree_path} + ├── bar + │ ├── baz + │ └── quux + │ └── baz (100,) float64 + └── foo""" + ) _check_tree(g1, expect_bytes, expect_text) # test different group @@ -1882,47 +1442,11 @@ def test_tree(zarr_version, at_root): _check_tree(g3, expect_bytes, expect_text) -@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -def test_group_mismatched_store_versions(): - store_v3 = KVStoreV3(dict()) - store_v2 = KVStore(dict()) - - # separate chunk store - chunk_store_v2 = KVStore(dict()) - chunk_store_v3 = KVStoreV3(dict()) - - init_group(store_v2, path="group1", chunk_store=chunk_store_v2) - init_group(store_v3, path="group1", chunk_store=chunk_store_v3) - - g1_v3 = Group(store_v3, path="group1", read_only=True, chunk_store=chunk_store_v3) - assert isinstance(g1_v3._store, KVStoreV3) - g1_v2 = Group(store_v2, path="group1", read_only=True, chunk_store=chunk_store_v2) - assert isinstance(g1_v2._store, KVStore) - - # store and chunk_store must have the same zarr protocol version - with pytest.raises(ValueError): - Group(store_v3, path="group1", read_only=False, chunk_store=chunk_store_v2) - with pytest.raises(ValueError): - Group(store_v2, path="group1", read_only=False, chunk_store=chunk_store_v3) - with pytest.raises(ValueError): - open_group(store_v2, path="group1", chunk_store=chunk_store_v3) - with pytest.raises(ValueError): - open_group(store_v3, path="group1", chunk_store=chunk_store_v2) - - # raises Value if read_only and path is not a pre-existing group - with pytest.raises(ValueError): - Group(store_v3, path="group2", read_only=True, chunk_store=chunk_store_v3) - with pytest.raises(ValueError): - Group(store_v3, path="group2", read_only=True, chunk_store=chunk_store_v3) - - -@pytest.mark.parametrize("zarr_version", _VERSIONS) -def test_open_group_from_paths(zarr_version): +def test_open_group_from_paths(): """Verify zarr_version is applied to both the store and chunk_store.""" store = tempfile.mkdtemp() chunk_store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) atexit.register(atexit_rmtree, chunk_store) path = "g1" - g = open_group(store, path=path, chunk_store=chunk_store, zarr_version=zarr_version) - assert g._store._store_version == g._chunk_store._store_version == zarr_version + _ = open_group(store, path=path, chunk_store=chunk_store) diff --git a/tests/test_meta.py b/tests/test_meta.py index 50f51929ef..089afec781 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -1,5 +1,4 @@ import base64 -import copy import json import numpy as np @@ -16,11 +15,6 @@ encode_dtype, 
encode_fill_value, decode_fill_value, - get_extended_dtype_info, - _v3_complex_types, - _v3_datetime_types, - _default_entry_point_metadata_v3, - Metadata3, ) from zarr.util import normalize_dtype, normalize_fill_value @@ -285,77 +279,6 @@ def test_encode_decode_array_dtype_shape(): assert meta_dec["filters"] is None -def test_encode_decode_array_dtype_shape_v3(): - meta = dict( - shape=(100,), - chunk_grid=dict(type="regular", chunk_shape=(10,), separator=("/")), - data_type=np.dtype("(10, 10)U4", " Date: Wed, 10 Apr 2024 19:08:08 +0200 Subject: [PATCH 07/22] chore: remove more version-conditional logic --- src/zarr/_storage/store.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/zarr/_storage/store.py b/src/zarr/_storage/store.py index f10d65be57..911af20fda 100644 --- a/src/zarr/_storage/store.py +++ b/src/zarr/_storage/store.py @@ -212,24 +212,15 @@ def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: - if getattr(store, "_store_version", 2) == 3: - raise NotImplementedError("This function only supports Zarr version 2.") - else: - key = prefix + array_meta_key + key = prefix + array_meta_key return key def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: - if getattr(store, "_store_version", 2) == 3: - raise NotImplementedError("This function only supports Zarr version 2.") - else: - key = prefix + group_meta_key + key = prefix + group_meta_key return key def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: - if getattr(store, "_store_version", 2) == 3: - raise NotImplementedError("This function only supports Zarr version 2.") - else: - key = prefix + attrs_key + key = prefix + attrs_key return key From dcddb63fd8c245cb5adffa07704ccaabee90bc57 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 11 Apr 2024 22:31:57 +0200 Subject: [PATCH 08/22] chore: prune out n5, abs, sqlite, zip, redis, mongodb, dbm, lmdb stores --- src/zarr/__init__.py | 8 - src/zarr/_storage/absstore.py | 224 -------- src/zarr/n5.py | 897 ----------------------------- src/zarr/storage.py | 1011 +-------------------------------- tests/test_core.py | 389 +------------ tests/test_creation.py | 35 -- tests/test_hierarchy.py | 88 +-- tests/test_meta_array.py | 6 +- tests/test_n5.py | 53 -- tests/test_storage.py | 492 ---------------- 10 files changed, 8 insertions(+), 3195 deletions(-) delete mode 100644 src/zarr/_storage/absstore.py delete mode 100644 src/zarr/n5.py delete mode 100644 tests/test_n5.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 601b1295ab..725ad0a783 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -30,22 +30,14 @@ ) from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group -from zarr.n5 import N5Store, N5FSStore from zarr.storage import ( - ABSStore, - DBMStore, DictStore, DirectoryStore, KVStore, - LMDBStore, LRUStoreCache, MemoryStore, - MongoDBStore, NestedDirectoryStore, - RedisStore, - SQLiteStore, TempStore, - ZipStore, ) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer from zarr._version import version as __version__ diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py deleted file mode 100644 index d8e292535c..0000000000 --- a/src/zarr/_storage/absstore.py +++ /dev/null @@ -1,224 +0,0 @@ -"""This module contains storage classes related to Azure Blob Storage (ABS)""" - -import warnings -from numcodecs.compat import ensure_bytes 
-from zarr.util import normalize_storage_path -from zarr._storage.store import Store - -__doctest_requires__ = { - ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], -} - - -class ABSStore(Store): - """Storage class using Azure Blob Storage (ABS). - - Parameters - ---------- - container : string - The name of the ABS container to use. - - .. deprecated:: - Use ``client`` instead. - - prefix : string - Location of the "directory" to use as the root of the storage hierarchy - within the container. - - account_name : string - The Azure blob storage account name. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - account_key : string - The Azure blob storage account access key. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - blob_service_kwargs : dictionary - Extra arguments to be passed into the azure blob client, for e.g. when - using the emulator, pass in blob_service_kwargs={'is_emulated': True}. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - client : azure.storage.blob.ContainerClient, optional - And ``azure.storage.blob.ContainerClient`` to connect with. See - `here `_ # noqa - for more. - - .. versionadded:: 2.8.3 - - Notes - ----- - In order to use this store, you must install the Microsoft Azure Storage SDK for Python, - ``azure-storage-blob>=12.5.0``. - """ # noqa: E501 - - def __init__( - self, - container=None, - prefix="", - account_name=None, - account_key=None, - blob_service_kwargs=None, - dimension_separator=None, - client=None, - ): - self._dimension_separator = dimension_separator - self.prefix = normalize_storage_path(prefix) - if client is None: - # deprecated option, try to construct the client for them - msg = ( - "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs'" - "is deprecated. Provide and instance of 'azure.storage.blob.ContainerClient' " - "'client' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - from azure.storage.blob import ContainerClient - - blob_service_kwargs = blob_service_kwargs or {} - client = ContainerClient( - "https://{}.blob.core.windows.net/".format(account_name), - container, - credential=account_key, - **blob_service_kwargs, - ) - - self.client = client - self._container = container - self._account_name = account_name - self._account_key = account_key - - @staticmethod - def _warn_deprecated(property_): - msg = ( - "The {} property is deprecated and will be removed in a future " - "version. Get the property from 'ABSStore.client' instead." 
- ) - warnings.warn(msg.format(property_), FutureWarning, stacklevel=3) - - @property - def container(self): - self._warn_deprecated("container") - return self._container - - @property - def account_name(self): - self._warn_deprecated("account_name") - return self._account_name - - @property - def account_key(self): - self._warn_deprecated("account_key") - return self._account_key - - def _append_path_to_prefix(self, path): - if self.prefix == "": - return normalize_storage_path(path) - else: - return "/".join([self.prefix, normalize_storage_path(path)]) - - @staticmethod - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - if prefix: - return path_norm[(len(prefix_norm) + 1) :] - else: - return path_norm - - def __getitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - blob_name = self._append_path_to_prefix(key) - try: - return self.client.download_blob(blob_name).readall() - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % blob_name) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - blob_name = self._append_path_to_prefix(key) - self.client.upload_blob(blob_name, value, overwrite=True) - - def __delitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - try: - self.client.delete_blob(self._append_path_to_prefix(key)) - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % key) - - def __eq__(self, other): - return ( - isinstance(other, ABSStore) - and self.client == other.client - and self.prefix == other.prefix - ) - - def keys(self): - return list(self.__iter__()) - - def __iter__(self): - if self.prefix: - list_blobs_prefix = self.prefix + "/" - else: - list_blobs_prefix = None - for blob in self.client.list_blobs(list_blobs_prefix): - yield self._strip_prefix_from_path(blob.name, self.prefix) - - def __len__(self): - return len(self.keys()) - - def __contains__(self, key): - blob_name = self._append_path_to_prefix(key) - return self.client.get_blob_client(blob_name).exists() - - def listdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - items = [ - self._strip_prefix_from_path(blob.name, dir_path) - for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") - ] - return items - - def rmdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - for blob in self.client.list_blobs(name_starts_with=dir_path): - self.client.delete_blob(blob) - - def getsize(self, path=None): - store_path = normalize_storage_path(path) - fs_path = self._append_path_to_prefix(store_path) - if fs_path: - blob_client = self.client.get_blob_client(fs_path) - else: - blob_client = None - - if blob_client and blob_client.exists(): - return blob_client.get_blob_properties().size - else: - size = 0 - if fs_path == "": - fs_path = None - elif not fs_path.endswith("/"): - fs_path += "/" - for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): - blob_client = self.client.get_blob_client(blob) - if blob_client.exists(): - size += blob_client.get_blob_properties().size - return size - - def clear(self): - self.rmdir() diff --git a/src/zarr/n5.py b/src/zarr/n5.py deleted file mode 100644 index 44b44e69e2..0000000000 --- a/src/zarr/n5.py +++ /dev/null @@ -1,897 +0,0 @@ -"""This 
module contains a storage class and codec to support the N5 format. -""" -import os -import struct -import sys -from typing import Any, Dict, Optional, cast -import warnings - -import numpy as np -from numcodecs.abc import Codec -from numcodecs.compat import ndarray_copy -from numcodecs.registry import get_codec, register_codec - -from .meta import ZARR_FORMAT, json_dumps, json_loads -from .storage import FSStore -from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path -from .storage import array_meta_key as zarr_array_meta_key -from .storage import attrs_key as zarr_attrs_key -from .storage import group_meta_key as zarr_group_meta_key - -N5_FORMAT = "2.0.0" - -zarr_to_n5_keys = [ - ("chunks", "blockSize"), - ("dtype", "dataType"), - ("compressor", "compression"), - ("shape", "dimensions"), -] -n5_attrs_key = "attributes.json" -n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] - - -class N5Store(NestedDirectoryStore): - """Storage class using directories and files on a standard file system, - following the N5 format (https://github.com/saalfeldlab/n5). - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5Store('data/array.n5') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5Store('data/group.n5') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - - This is an experimental feature. - - Safe to write in multiple threads or processes. 
- - """ - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs: - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # remove previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - super().__delitem__(key_new) - - def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - return super().__contains__(key_new) - - def __eq__(self, other): - return isinstance(other, N5Store) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - path = cast(str, path) - # We can't use 
NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. - children = super().listdir(path=path) - - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and os.path.isdir(entry_path): - for dir_path, _, file_names in os.walk(entry_path): - for file_name in file_names: - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path + os.path.sep)[1] - new_child = rel_path.replace(os.path.sep, ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - return sorted(children) - - else: - return children - - def _load_n5_attrs(self, path: str) -> Dict[str, Any]: - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - if not path.endswith(n5_attrs_key): - attrs_key = os.path.join(path, n5_attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -class N5FSStore(FSStore): - """Implementation of the N5 format (https://github.com/saalfeldlab/n5) - using `fsspec`, which allows storage on a variety of filesystems. Based - on `zarr.N5Store`. - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - This is an experimental feature. - Safe to write in multiple threads or processes. 
- - Be advised that the `_dimension_separator` property of this store - (and arrays it creates) is ".", but chunks saved by this store will - in fact be "/" separated, as proscribed by the N5 format. - - This is counter-intuitive (to say the least), but not arbitrary. - Chunks in N5 format are stored with reversed dimension order - relative to Zarr chunks: a chunk of a 3D Zarr array would be stored - on a file system as `/0/1/2`, but in N5 the same chunk would be - stored as `/2/1/0`. Therefore, stores targeting N5 must intercept - chunk keys and flip the order of the dimensions before writing to - storage, and this procedure requires chunk keys with "." separated - dimensions, hence the Zarr arrays targeting N5 have the deceptive - "." dimension separator. - """ - - _array_meta_key = "attributes.json" - _group_meta_key = "attributes.json" - _attrs_key = "attributes.json" - - def __init__(self, *args, **kwargs): - if "dimension_separator" in kwargs: - kwargs.pop("dimension_separator") - warnings.warn("Keyword argument `dimension_separator` will be ignored") - dimension_separator = "." - super().__init__(*args, dimension_separator=dimension_separator, **kwargs) - - @staticmethod - def _swap_separator(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - def _normalize_key(self, key: str): - if is_chunk_key(key): - key = invert_chunk_coords(key) - - key = normalize_storage_path(key).lstrip("/") - if key: - *bits, end = key.split("/") - - if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): - end = end.replace(".", "/") - key = "/".join(bits + [end]) - return key.lower() if self.normalize_keys else key - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k 
in n5_keywords: - if k in zarr_attrs.keys(): - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # replace previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - elif is_chunk_key(key): - key_new = self._swap_separator(key) - else: - key_new = key - super().__delitem__(key_new) - - def __contains__(self, key: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - return super().__contains__(key_new) - - def __eq__(self, other: Any): - return isinstance(other, N5FSStore) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. 
- children = super().listdir(path=path) - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._array_meta_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and self.fs.isdir(entry_path): - for file_name in self.fs.find(entry_path): - file_path = os.path.join(root_path, file_name) - rel_path = file_path.split(root_path)[1] - new_child = rel_path.lstrip("/").replace("/", ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._group_meta_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - return sorted(children) - else: - return children - - def _load_n5_attrs(self, path: str): - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - if not path.endswith(self._attrs_key): - attrs_key = os.path.join(path, self._attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -def is_chunk_key(key: str): - rv = False - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - rv = bool(_prog_ckey.match(last_segment)) - return rv - - -def invert_chunk_coords(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - -def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from zarr to N5 format.""" - del group_metadata["zarr_format"] - # TODO: This should only exist at the top-level - group_metadata["n5"] = N5_FORMAT - return group_metadata - - -def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from N5 to zarr format.""" - # This only exists at the top level - group_metadata.pop("n5", None) - group_metadata["zarr_format"] = ZARR_FORMAT - return group_metadata - - -def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: - """Convert array metadata from zarr to N5 format. 
If the `top_level` keyword argument is True, - then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" - - for f, t in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - del array_metadata["zarr_format"] - if top_level: - array_metadata["n5"] = N5_FORMAT - try: - dtype = np.dtype(array_metadata["dataType"]) - except TypeError: - raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") - - array_metadata["dataType"] = dtype.name - array_metadata["dimensions"] = array_metadata["dimensions"][::-1] - array_metadata["blockSize"] = array_metadata["blockSize"][::-1] - - if "fill_value" in array_metadata: - if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: - raise ValueError( - f"""Received fill_value = {array_metadata['fill_value']}, - but N5 only supports fill_value = 0""" - ) - del array_metadata["fill_value"] - - if "order" in array_metadata: - if array_metadata["order"] != "C": - raise ValueError( - f"Received order = {array_metadata['order']}, but N5 only supports order = C" - ) - del array_metadata["order"] - - if "filters" in array_metadata: - if array_metadata["filters"] != [] and array_metadata["filters"] is not None: - raise ValueError("Received filters, but N5 storage does not support zarr filters") - del array_metadata["filters"] - - assert "compression" in array_metadata - compressor_config = array_metadata["compression"] - compressor_config = compressor_config_to_n5(compressor_config) - array_metadata["compression"] = compressor_config - - if "dimension_separator" in array_metadata: - del array_metadata["dimension_separator"] - - return array_metadata - - -def array_metadata_to_zarr( - array_metadata: Dict[str, Any], top_level: bool = False -) -> Dict[str, Any]: - """Convert array metadata from N5 to zarr format. - If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" - for t, f in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - if top_level: - array_metadata.pop("n5") - array_metadata["zarr_format"] = ZARR_FORMAT - - array_metadata["shape"] = array_metadata["shape"][::-1] - array_metadata["chunks"] = array_metadata["chunks"][::-1] - array_metadata["fill_value"] = 0 # also if None was requested - array_metadata["order"] = "C" - array_metadata["filters"] = [] - array_metadata["dimension_separator"] = "." 
- array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str - - compressor_config = array_metadata["compressor"] - compressor_config = compressor_config_to_zarr(compressor_config) - array_metadata["compressor"] = { - "id": N5ChunkWrapper.codec_id, - "compressor_config": compressor_config, - "dtype": array_metadata["dtype"], - "chunk_shape": array_metadata["chunks"], - } - - return array_metadata - - -def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: - """Get all zarr attributes from an N5 attributes dictionary (i.e., - all non-keyword attributes).""" - - # remove all N5 keywords - for n5_key in n5_keywords: - if n5_key in attrs: - del attrs[n5_key] - - return attrs - - -def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: - return {"type": "raw"} - else: - _compressor_config = compressor_config - - # peel wrapper, if present - if _compressor_config["id"] == N5ChunkWrapper.codec_id: - _compressor_config = _compressor_config["compressor_config"] - - codec_id = _compressor_config["id"] - n5_config = {"type": codec_id} - - if codec_id == "bz2": - n5_config["type"] = "bzip2" - n5_config["blockSize"] = _compressor_config["level"] - - elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] - n5_config["clevel"] = _compressor_config["clevel"] - n5_config["shuffle"] = _compressor_config["shuffle"] - n5_config["blocksize"] = _compressor_config["blocksize"] - - elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. - # Note: 4 is the default, which is lzma.CHECK_CRC64. - if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: - n5_config["type"] = "xz" - else: - warnings.warn( - "Not all N5 implementations support lzma compression (yet). You " - "might not be able to open the dataset with another N5 library.", - RuntimeWarning, - ) - n5_config["format"] = _compressor_config["format"] - n5_config["check"] = _compressor_config["check"] - n5_config["filters"] = _compressor_config["filters"] - - # The default is lzma.PRESET_DEFAULT, which is 6. 
- if _compressor_config["preset"]: - n5_config["preset"] = _compressor_config["preset"] - else: - n5_config["preset"] = 6 - - elif codec_id == "zlib": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = True - - elif codec_id == "gzip": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = False - - else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) - - return n5_config - - -def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] - zarr_config = {"id": codec_id} - - if codec_id == "bzip2": - zarr_config["id"] = "bz2" - zarr_config["level"] = compressor_config["blockSize"] - - elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] - zarr_config["clevel"] = compressor_config["clevel"] - zarr_config["shuffle"] = compressor_config["shuffle"] - zarr_config["blocksize"] = compressor_config["blocksize"] - - elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] - zarr_config["check"] = compressor_config["check"] - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = compressor_config["filters"] - - elif codec_id == "xz": - zarr_config["id"] = "lzma" - zarr_config["format"] = 1 # lzma.FORMAT_XZ - zarr_config["check"] = -1 - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = None - - elif codec_id == "gzip": - if "useZlib" in compressor_config and compressor_config["useZlib"]: - zarr_config["id"] = "zlib" - zarr_config["level"] = compressor_config["level"] - else: - zarr_config["id"] = "gzip" - zarr_config["level"] = compressor_config["level"] - - elif codec_id == "raw": - return None - - else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) - - return zarr_config - - -class N5ChunkWrapper(Codec): - codec_id = "n5_wrapper" - - def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) - self.chunk_shape = tuple(chunk_shape) - # is the dtype a little endian format? 
- self._little_endian = self.dtype.byteorder == "<" or ( - self.dtype.byteorder == "=" and sys.byteorder == "little" - ) - - if compressor: - if compressor_config is not None: - raise ValueError("Only one of compressor_config or compressor should be given.") - compressor_config = compressor.get_config() - - if compressor_config is None and compressor is None or compressor_config["id"] == "raw": - self.compressor_config = None - self._compressor = None - else: - self._compressor = get_codec(compressor_config) - self.compressor_config = self._compressor.get_config() - - def get_config(self): - config = {"id": self.codec_id, "compressor_config": self.compressor_config} - return config - - def encode(self, chunk): - assert chunk.flags.c_contiguous - - header = self._create_header(chunk) - chunk = self._to_big_endian(chunk) - - if self._compressor: - return header + self._compressor.encode(chunk) - else: - return header + chunk.tobytes(order="A") - - def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) - chunk = chunk[len_header:] - - if out is not None: - # out should only be used if we read a complete chunk - assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( - self.chunk_shape, chunk_shape - ) - - if self._compressor: - self._compressor.decode(chunk, out) - else: - ndarray_copy(chunk, out) - - # we can byteswap in-place - if self._little_endian: - out.byteswap(True) - - return out - - else: - if self._compressor: - chunk = self._compressor.decode(chunk) - - # more expensive byteswap - chunk = self._from_big_endian(chunk) - - # read partial chunk - if chunk_shape != self.chunk_shape: - chunk = np.frombuffer(chunk, dtype=self.dtype) - chunk = chunk.reshape(chunk_shape) - complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) - target_slices = tuple(slice(0, s) for s in chunk_shape) - complete_chunk[target_slices] = chunk - chunk = complete_chunk - - return chunk - - @staticmethod - def _create_header(chunk): - mode = struct.pack(">H", 0) - num_dims = struct.pack(">H", len(chunk.shape)) - shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) - - return mode + num_dims + shape - - @staticmethod - def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] - shape = tuple( - struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) - )[::-1] - - len_header = 4 + num_dims * 4 - - return len_header, shape - - def _to_big_endian(self, data): - # assumes data is ndarray - - if self._little_endian: - return data.byteswap() - return data - - def _from_big_endian(self, data): - # assumes data is byte array in big endian - - if not self._little_endian: - return data - - a = np.frombuffer(data, self.dtype.newbyteorder(">")) - return a.astype(self.dtype) - - -register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 9e779133c9..7d4ae3a56c 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -17,25 +17,19 @@ import atexit import errno import glob -import multiprocessing -import operator import os import re import shutil -import sys import tempfile import warnings -import zipfile from collections import OrderedDict from collections.abc import MutableMapping from os import scandir -from pickle import PicklingError -from threading import Lock, RLock +from threading import Lock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any import uuid -import time -from numcodecs.compat import 
ensure_bytes, ensure_text, ensure_contiguous_ndarray_like +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context @@ -51,7 +45,6 @@ from zarr.util import ( buffer_size, json_loads, - nolock, normalize_chunks, normalize_dimension_separator, normalize_dtype, @@ -63,7 +56,6 @@ ensure_contiguous_ndarray_or_bytes, ) -from zarr._storage.absstore import ABSStore # noqa: F401 from zarr._storage.store import ( # noqa: F401 _listdir_from_keys, _rename_from_keys, @@ -79,13 +71,6 @@ Store, ) -__doctest_requires__ = { - ("RedisStore", "RedisStore.*"): ["redis"], - ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], - ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], -} - - try: # noinspection PyUnresolvedReferences from zarr.codecs import Blosc @@ -142,12 +127,6 @@ def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore return FSStore(store, mode=mode, **(storage_options or {})) elif storage_options: raise ValueError("storage_options passed with non-fsspec path") - if store.endswith(".zip"): - return ZipStore(store, mode=mode) - elif store.endswith(".n5"): - from zarr.n5 import N5Store - - return N5Store(store) else: return DirectoryStore(store) else: @@ -576,7 +555,7 @@ def _init_group_metadata( # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = {} + meta: dict[str, Any] = {} key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, "_metadata_class"): store[key] = store._metadata_class.encode_group_metadata(meta) @@ -1508,258 +1487,6 @@ def __eq__(self, other): return isinstance(other, NestedDirectoryStore) and self.path == other.path -# noinspection PyPep8Naming -class ZipStore(Store): - """Storage class using a Zip file. - - Parameters - ---------- - path : string - Location of file. - compression : integer, optional - Compression method to use when writing to the archive. - allowZip64 : bool, optional - If True (the default) will create ZIP files that use the ZIP64 - extensions when the zipfile is larger than 2 GiB. If False - will raise an exception when the ZIP file would require ZIP64 - extensions. - mode : string, optional - One of 'r' to read an existing file, 'w' to truncate and write a new - file, 'a' to append to an existing file, or 'x' to exclusively create - and write a new file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.ZipStore('data/array.zip', mode='w') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.ZipStore('data/group.zip', mode='w') - >>> root = zarr.group(store=store) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a ZipStore, the ``close()`` method must be called, otherwise - essential data will not be written to the underlying Zip file. The ZipStore - class also supports the context manager protocol, which ensures the ``close()`` - method is called on leaving the context, e.g.:: - - >>> with zarr.ZipStore('data/array.zip', mode='w') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store) - ... z[...] = 42 - ... 
# no need to call store.close() - - Notes - ----- - Each chunk of an array is stored as a separate entry in the Zip file. Note - that Zip files do not provide any way to remove or replace existing entries. - If an attempt is made to replace an entry, then a warning is generated by - the Python standard library about a duplicate Zip file entry. This can be - triggered if you attempt to write data to a Zarr array more than once, - e.g.:: - - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> z = zarr.zeros(100, chunks=10, store=store) - >>> # first write OK - ... z[...] = 42 - >>> # second write generates warnings - ... z[...] = 42 # doctest: +SKIP - >>> store.close() - - This can also happen in a more subtle situation, where data are written only - once to a Zarr array, but the write operations are not aligned with chunk - boundaries, e.g.:: - - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> z = zarr.zeros(100, chunks=10, store=store) - >>> z[5:15] = 42 - >>> # write overlaps chunk previously written, generates warnings - ... z[15:25] = 42 # doctest: +SKIP - - To avoid creating duplicate entries, only write data once, and align writes - with chunk boundaries. This alignment is done automatically if you call - ``z[...] = ...`` or create an array from existing data via :func:`zarr.array`. - - Alternatively, use a :class:`DirectoryStore` when writing the data, then - manually Zip the directory and use the Zip file for subsequent reads. - Take note that the files in the Zip file must be relative to the root of the - Zarr archive. You may find it easier to create such a Zip file with ``7z``, e.g.:: - - 7z a -tzip archive.zarr.zip archive.zarr/. - - Safe to write in multiple threads but not in multiple processes. - - """ - - _erasable = False - - def __init__( - self, - path, - compression=zipfile.ZIP_STORED, - allowZip64=True, - mode="a", - dimension_separator=None, - ): - # store properties - path = os.path.abspath(path) - self.path = path - self.compression = compression - self.allowZip64 = allowZip64 - self.mode = mode - self._dimension_separator = dimension_separator - - # Current understanding is that zipfile module in stdlib is not thread-safe, - # and so locking is required for both read and write. However, this has not - # been investigated in detail, perhaps no lock is needed if mode='r'. 
- self.mutex = RLock() - - # open zip file - self.zf = zipfile.ZipFile(path, mode=mode, compression=compression, allowZip64=allowZip64) - - def __getstate__(self): - self.flush() - return self.path, self.compression, self.allowZip64, self.mode - - def __setstate__(self, state): - path, compression, allowZip64, mode = state - # if initially opened with mode 'w' or 'x', re-open in mode 'a' so file doesn't - # get clobbered - if mode in "wx": - mode = "a" - self.__init__(path=path, compression=compression, allowZip64=allowZip64, mode=mode) - - def close(self): - """Closes the underlying zip file, ensuring all records are written.""" - with self.mutex: - self.zf.close() - - def flush(self): - """Closes the underlying zip file, ensuring all records are written, - then re-opens the file for further modifications.""" - if self.mode != "r": - with self.mutex: - self.zf.close() - # N.B., re-open with mode 'a' regardless of initial mode so we don't wipe - # what's been written - self.zf = zipfile.ZipFile( - self.path, mode="a", compression=self.compression, allowZip64=self.allowZip64 - ) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - with self.mutex: - with self.zf.open(key) as f: # will raise KeyError - return f.read() - - def __setitem__(self, key, value): - if self.mode == "r": - raise ReadOnlyError() - value = ensure_contiguous_ndarray_like(value).view("u1") - with self.mutex: - # writestr(key, value) writes with default permissions from - # zipfile (600) that are too restrictive, build ZipInfo for - # the key to work around limitation - keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) - keyinfo.compress_type = self.compression - if keyinfo.filename[-1] == os.sep: - keyinfo.external_attr = 0o40775 << 16 # drwxrwxr-x - keyinfo.external_attr |= 0x10 # MS-DOS directory flag - else: - keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- - - self.zf.writestr(keyinfo, value) - - def __delitem__(self, key): - raise NotImplementedError - - def __eq__(self, other): - return ( - isinstance(other, ZipStore) - and self.path == other.path - and self.compression == other.compression - and self.allowZip64 == other.allowZip64 - ) - - def keylist(self): - with self.mutex: - return sorted(self.zf.namelist()) - - def keys(self): - yield from self.keylist() - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - try: - with self.mutex: - self.zf.getinfo(key) - except KeyError: - return False - else: - return True - - def listdir(self, path=None): - path = normalize_storage_path(path) - return _listdir_from_keys(self, path) - - def getsize(self, path=None): - path = normalize_storage_path(path) - with self.mutex: - children = self.listdir(path) - if children: - size = 0 - for child in children: - if path: - name = path + "/" + child - else: - name = child - try: - info = self.zf.getinfo(name) - except KeyError: - pass - else: - size += info.compress_size - return size - elif path: - try: - info = self.zf.getinfo(path) - return info.compress_size - except KeyError: - return 0 - else: - return 0 - - def clear(self): - if self.mode == "r": - raise ReadOnlyError() - with self.mutex: - self.close() - os.remove(self.path) - self.zf = zipfile.ZipFile( - self.path, mode=self.mode, compression=self.compression, allowZip64=self.allowZip64 - ) - - def migrate_1to2(store): """Migrate array metadata in `store` from Zarr format version 1 to 
version 2. @@ -1813,386 +1540,6 @@ def migrate_1to2(store): del store["attrs"] -# noinspection PyShadowingBuiltins -class DBMStore(Store): - """Storage class using a DBM-style database. - - Parameters - ---------- - path : string - Location of database file. - flag : string, optional - Flags for opening the database file. - mode : int - File mode used if a new file is created. - open : function, optional - Function to open the database file. If not provided, :func:`dbm.open` will be - used on Python 3, and :func:`anydbm.open` will be used on Python 2. - write_lock: bool, optional - Use a lock to prevent concurrent writes from multiple threads (True by default). - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk.e - **open_kwargs - Keyword arguments to pass the `open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.DBMStore('data/array.db') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.DBMStore('data/group.db') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.DBMStore('data/array.db') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - A different database library can be used by passing a different function to - the `open` parameter. For example, if the `bsddb3 - `_ package is installed, a - Berkeley DB database can be used:: - - >>> import bsddb3 - >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() - - Notes - ----- - Please note that, by default, this class will use the Python standard - library `dbm.open` function to open the database file (or `anydbm.open` on - Python 2). There are up to three different implementations of DBM-style - databases available in any Python installation, and which one is used may - vary from one system to another. Database file formats are not compatible - between these different implementations. Also, some implementations are - more efficient than others. In particular, the "dumb" implementation will be - the fall-back on many systems, and has very poor performance for some usage - scenarios. If you want to ensure a specific implementation is used, pass the - corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM - library. - - Safe to write in multiple threads. May be safe to write in multiple processes, - depending on which DBM implementation is being used, although this has not been - tested. 
- - """ - - def __init__( - self, - path, - flag="c", - mode=0o666, - open=None, - write_lock=True, - dimension_separator=None, - **open_kwargs, - ): - if open is None: - import dbm - - open = dbm.open - path = os.path.abspath(path) - # noinspection PyArgumentList - self.db = open(path, flag, mode, **open_kwargs) - self.path = path - self.flag = flag - self.mode = mode - self.open = open - self.write_lock = write_lock - if write_lock: - # This may not be required as some dbm implementations manage their own - # locks, but err on the side of caution. - self.write_mutex = Lock() - else: - self.write_mutex = nolock - self.open_kwargs = open_kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # needed for ndbm - except Exception: - # flush may fail if db has already been closed - pass - return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) - - def __setstate__(self, state): - path, flag, mode, open, write_lock, open_kws = state - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) - - def close(self): - """Closes the underlying database file.""" - if hasattr(self.db, "close"): - with self.write_mutex: - self.db.close() - - def flush(self): - """Synchronizes data to the underlying database file.""" - if self.flag[0] != "r": - with self.write_mutex: - if hasattr(self.db, "sync"): - self.db.sync() - else: # pragma: no cover - # we don't cover this branch anymore as ndbm (oracle) is not packaged - # by conda-forge on non-mac OS: - # https://github.com/conda-forge/staged-recipes/issues/4476 - # fall-back, close and re-open, needed for ndbm - flag = self.flag - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.db.close() - # noinspection PyArgumentList - self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return self.db[key] - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - value = ensure_bytes(value) - with self.write_mutex: - self.db[key] = value - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.write_mutex: - del self.db[key] - - def __eq__(self, other): - return ( - isinstance(other, DBMStore) - and self.path == other.path - and - # allow flag and mode to differ - self.open == other.open - and self.open_kwargs == other.open_kwargs - ) - - def keys(self): - return (ensure_text(k, "ascii") for k in iter(self.db.keys())) - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return key in self.db - - def rmdir(self, path: str = "") -> None: - path = normalize_storage_path(path) - _rmdir_from_keys(self, path) - - -class LMDBStore(Store): - """Storage class using LMDB. Requires the `lmdb `_ - package to be installed. - - - Parameters - ---------- - path : string - Location of database file. - buffers : bool, optional - If True (default) use support for buffers, which should increase performance by - reducing memory copies. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. 
- **kwargs - Keyword arguments passed through to the `lmdb.open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.LMDBStore('data/array.mdb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.LMDBStore('data/group.mdb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.LMDBStore('data/array.mdb') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - Notes - ----- - By default writes are not immediately flushed to disk to increase performance. You - can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods. - - Should be safe to write in multiple threads or processes due to the synchronization - support within LMDB, although writing from multiple processes has not been tested. - - """ - - def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): - import lmdb - - # set default memory map size to something larger than the lmdb default, which is - # very likely to be too small for any moderate array (logic copied from zict) - map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 - kwargs.setdefault("map_size", map_size) - - # don't initialize buffers to zero by default, shouldn't be necessary - kwargs.setdefault("meminit", False) - - # decide whether to use the writemap option based on the operating system's - # support for sparse files - writemap requires sparse file support otherwise - # the whole# `map_size` may be reserved up front on disk (logic copied from zict) - writemap = sys.platform.startswith("linux") - kwargs.setdefault("writemap", writemap) - - # decide options for when data are flushed to disk - choose to delay syncing - # data to filesystem, otherwise pay a large performance penalty (zict also does - # this) - kwargs.setdefault("metasync", False) - kwargs.setdefault("sync", False) - kwargs.setdefault("map_async", False) - - # set default option for number of cached transactions - max_spare_txns = multiprocessing.cpu_count() - kwargs.setdefault("max_spare_txns", max_spare_txns) - - # normalize path - path = os.path.abspath(path) - - # open database - self.db = lmdb.open(path, **kwargs) - - # store properties - self.buffers = buffers - self.path = path - self.kwargs = kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # just in case - except Exception: - # flush may fail if db has already been closed - pass - return self.path, self.buffers, self.kwargs - - def __setstate__(self, state): - path, buffers, kwargs = state - self.__init__(path=path, buffers=buffers, **kwargs) - - def close(self): - """Closes the underlying database.""" - self.db.close() - - def flush(self): - """Synchronizes data to the file system.""" - self.db.sync() - - def __enter__(self): - return self - - def __exit__(self, *args): - 
self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - # use the buffers option, should avoid a memory copy - with self.db.begin(buffers=self.buffers) as txn: - value = txn.get(key) - if value is None: - raise KeyError(key) - return value - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True, buffers=self.buffers) as txn: - txn.put(key, value) - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True) as txn: - if not txn.delete(key): - raise KeyError(key) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - return cursor.set_key(key) - - def items(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k, v in cursor.iternext(keys=True, values=True): - yield ensure_text(k, "ascii"), v - - def keys(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k in cursor.iternext(keys=True, values=False): - yield ensure_text(k, "ascii") - - def values(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - yield from cursor.iternext(keys=False, values=True) - - def __iter__(self): - return self.keys() - - def __len__(self): - return self.db.stat()["entries"] - - class LRUStoreCache(Store): """Storage class that implements a least-recently-used (LRU) cache layer over some other store. Intended primarily for use with stores that can be slow to @@ -2392,358 +1739,6 @@ def __delitem__(self, key): self._invalidate_value(key) -class SQLiteStore(Store): - """Storage class using SQLite. - - Parameters - ---------- - path : string - Location of database file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `sqlite3.connect` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.SQLiteStore('data/array.sqldb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.SQLiteStore('data/group.sqldb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] 
= 42 - >>> store.close() # don't forget to call this when you're done - """ - - def __init__(self, path, dimension_separator=None, **kwargs): - import sqlite3 - - self._dimension_separator = dimension_separator - - # normalize path - if path != ":memory:": - path = os.path.abspath(path) - - # store properties - self.path = path - self.kwargs = kwargs - - # allow threading if SQLite connections are thread-safe - # - # ref: https://www.sqlite.org/releaselog/3_3_1.html - # ref: https://github.com/python/cpython/issues/71377 - check_same_thread = True - if sqlite3.sqlite_version_info >= (3, 3, 1): - check_same_thread = False - - # keep a lock for serializing mutable operations - self.lock = Lock() - - # open database - self.db = sqlite3.connect( - self.path, - detect_types=0, - isolation_level=None, - check_same_thread=check_same_thread, - **self.kwargs, - ) - - # handle keys as `str`s - self.db.text_factory = str - - # get a cursor to read/write to the database - self.cursor = self.db.cursor() - - # initialize database with our table if missing - with self.lock: - self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") - - def __getstate__(self): - if self.path == ":memory:": - raise PicklingError("Cannot pickle in-memory SQLite databases") - return self.path, self.kwargs - - def __setstate__(self, state): - path, kwargs = state - self.__init__(path=path, **kwargs) - - def close(self): - """Closes the underlying database.""" - - # close cursor and db objects - self.cursor.close() - self.db.close() - - def __getitem__(self, key): - value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) - for (v,) in value: - return v - raise KeyError(key) - - def __setitem__(self, key, value): - self.update({key: value}) - - def __delitem__(self, key): - with self.lock: - self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) - if self.cursor.rowcount < 1: - raise KeyError(key) - - def __contains__(self, key): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) - for (has,) in cs: - has = bool(has) - return has - - def items(self): - kvs = self.cursor.execute("SELECT k, v FROM zarr") - yield from kvs - - def keys(self): - ks = self.cursor.execute("SELECT k FROM zarr") - for (k,) in ks: - yield k - - def values(self): - vs = self.cursor.execute("SELECT v FROM zarr") - for (v,) in vs: - yield v - - def __iter__(self): - return self.keys() - - def __len__(self): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") - for (c,) in cs: - return c - - def update(self, *args, **kwargs): - args += (kwargs,) - - kv_list = [] - for dct in args: - for k, v in dct.items(): - v = ensure_contiguous_ndarray_like(v) - - # Accumulate key-value pairs for storage - kv_list.append((k, v)) - - with self.lock: - self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) - - def listdir(self, path=None): - path = normalize_storage_path(path) - sep = "_" if path == "" else "/" - keys = self.cursor.execute( - """ - SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( - SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m - FROM zarr WHERE k LIKE (? || "{sep}%") - ) ORDER BY l ASC - """.format(sep=sep), - (path, path), - ) - keys = list(map(operator.itemgetter(0), keys)) - return keys - - def getsize(self, path=None): - path = normalize_storage_path(path) - size = self.cursor.execute( - """ - SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) 
+ 1), "/"), "/") - """, - (path, path), - ) - for (s,) in size: - return s - - def rmdir(self, path=None): - path = normalize_storage_path(path) - if path: - with self.lock: - self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (path,)) - else: - self.clear() - - def clear(self): - with self.lock: - self.cursor.executescript( - """ - BEGIN TRANSACTION; - DROP TABLE zarr; - CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); - COMMIT TRANSACTION; - """ - ) - - -class MongoDBStore(Store): - """Storage class using MongoDB. - - .. note:: This is an experimental feature. - - Requires the `pymongo `_ - package to be installed. - - Parameters - ---------- - database : string - Name of database - collection : string - Name of collection - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `pymongo.MongoClient` function. - - Notes - ----- - The maximum chunksize in MongoDB documents is 16 MB. - - """ - - _key = "key" - _value = "value" - - def __init__( - self, - database="mongodb_zarr", - collection="zarr_collection", - dimension_separator=None, - **kwargs, - ): - import pymongo - - self._database = database - self._collection = collection - self._dimension_separator = dimension_separator - self._kwargs = kwargs - - self.client = pymongo.MongoClient(**self._kwargs) - self.db = self.client.get_database(self._database) - self.collection = self.db.get_collection(self._collection) - - def __getitem__(self, key): - doc = self.collection.find_one({self._key: key}) - - if doc is None: - raise KeyError(key) - else: - return doc[self._value] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.collection.replace_one( - {self._key: key}, {self._key: key, self._value: value}, upsert=True - ) - - def __delitem__(self, key): - result = self.collection.delete_many({self._key: key}) - if not result.deleted_count == 1: - raise KeyError(key) - - def __iter__(self): - for f in self.collection.find({}): - yield f[self._key] - - def __len__(self): - return self.collection.count_documents({}) - - def __getstate__(self): - return self._database, self._collection, self._kwargs - - def __setstate__(self, state): - database, collection, kwargs = state - self.__init__(database=database, collection=collection, **kwargs) - - def close(self): - """Cleanup client resources and disconnect from MongoDB.""" - self.client.close() - - def clear(self): - """Remove all items from store.""" - self.collection.delete_many({}) - - -class RedisStore(Store): - """Storage class using Redis. - - .. note:: This is an experimental feature. - - Requires the `redis `_ - package to be installed. - - Parameters - ---------- - prefix : string - Name of prefix for Redis keys - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `redis.Redis` function. 
- - """ - - def __init__(self, prefix="zarr", dimension_separator=None, **kwargs): - import redis - - self._prefix = prefix - self._kwargs = kwargs - self._dimension_separator = dimension_separator - - self.client = redis.Redis(**kwargs) - - def _key(self, key): - return "{prefix}:{key}".format(prefix=self._prefix, key=key) - - def __getitem__(self, key): - return self.client[self._key(key)] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.client[self._key(key)] = value - - def __delitem__(self, key): - count = self.client.delete(self._key(key)) - if not count: - raise KeyError(key) - - def keylist(self): - offset = len(self._key("")) # length of prefix - return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] - - def keys(self): - yield from self.keylist() - - def __iter__(self): - yield from self.keys() - - def __len__(self): - return len(self.keylist()) - - def __getstate__(self): - return self._prefix, self._kwargs - - def __setstate__(self, state): - prefix, kwargs = state - self.__init__(prefix=prefix, **kwargs) - - def clear(self): - for key in self.keys(): - del self[key] - - class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_core.py b/tests/test_core.py index 6303371793..d996af5563 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,4 @@ import atexit -import os -import sys import pickle import shutil from typing import Any, Literal, Optional, Tuple, Union @@ -37,26 +35,19 @@ from zarr.core import Array from zarr.meta import json_loads -from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( - ABSStore, - DBMStore, DirectoryStore, FSStore, KVStore, - LMDBStore, LRUStoreCache, NestedDirectoryStore, - SQLiteStore, - atexit_rmglob, - atexit_rmtree, init_array, init_group, normalize_store_arg, ) from zarr.util import buffer_size -from .util import abs_container, skip_test_env_var, have_fsspec, mktemp +from .util import have_fsspec # noinspection PyMethodMayBeStatic @@ -1655,24 +1646,6 @@ def test_array_init_from_dict(): assert isinstance(a.store, KVStore) -@skip_test_env_var("ZARR_TEST_ABS") -class TestArrayWithABSStore(TestArray): - def create_store(self): - client = abs_container() - store = ABSStore(client=client) - store.rmdir() - return store - - @pytest.mark.xfail - def test_nbytes_stored(self): - return super().test_nbytes_stored() - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): def create_store(self): path = mkdtemp() @@ -1690,366 +1663,6 @@ def expected(self): ] -class TestArrayWithN5Store(TestArrayWithDirectoryStore): - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = N5Store(path) - return store - - def test_array_0d(self): - # test behaviour for array with 0 dimensions - - # setup - a = np.zeros(()) - z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) - - # check properties - assert a.ndim == z.ndim - assert a.shape == z.shape - assert a.size == z.size - assert a.dtype == z.dtype - assert a.nbytes == z.nbytes - with pytest.raises(TypeError): - len(z) - assert () == z.chunks - assert 1 == z.nchunks - assert (1,) == z.cdata_shape - # compressor always None - no point in compressing a 
single value - assert z.compressor.compressor_config is None - - # check __getitem__ - b = z[...] - assert isinstance(b, np.ndarray) - assert a.shape == b.shape - assert a.dtype == b.dtype - assert_array_equal(a, np.array(z)) - assert_array_equal(a, z[...]) - assert a[()] == z[()] - with pytest.raises(IndexError): - z[0] - with pytest.raises(IndexError): - z[:] - - # check __setitem__ - z[...] = 42 - assert 42 == z[()] - z[()] = 43 - assert 43 == z[()] - with pytest.raises(IndexError): - z[0] = 42 - with pytest.raises(IndexError): - z[:] = 42 - with pytest.raises(ValueError): - z[...] = np.array([1, 2, 3]) - - def test_array_1d_fill_value(self): - nvalues = 1050 - dtype = np.int32 - for fill_value in 0, None: - a = np.arange(nvalues, dtype=dtype) - f = np.empty_like(a) - f.fill(fill_value or 0) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, fill_value=fill_value) - z[190:310] = a[190:310] - - assert_array_equal(f[:190], z[:190]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(f[310:], z[310:]) - - with pytest.raises(ValueError): - z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) - - def test_nchunks_initialized(self): - fill_value = 0 - dtype = "int" - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=True - ) - - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store["foo"] = b"bar" - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - - # second round of similar tests with write_empty_chunks set to - # False - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=False - ) - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - z[:] = z.fill_value - assert 0 == z.nchunks_initialized - - def test_array_order(self): - # N5 only supports 'C' at the moment - with pytest.raises(ValueError): - self.create_array(shape=(10, 11), chunks=(10, 11), dtype="i8", order="F") - - # 1D - a = np.arange(1050) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, order="C") - assert z.order == "C" - assert z[:].flags.c_contiguous - z[:] = a - assert_array_equal(a, z[:]) - - # 2D - a = np.arange(10000).reshape((100, 100)) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype, order="C") - - assert z.order == "C" - assert z[:].flags.c_contiguous - z[:] = a - actual = z[:] - assert_array_equal(a, actual) - - def test_structured_array(self): - d = np.array( - [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], - dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], - ) - fill_values = None, b"", (b"zzz", 42, 16.8) - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_subshapes(self): - d = np.array( - [ - (0, ((0, 1, 2), (1, 2, 3)), b"aaa"), - (1, ((1, 2, 3), (2, 3, 4)), b"bbb"), - (2, ((2, 3, 4), (3, 4, 5)), b"ccc"), - ], - dtype=[("foo", "i8"), ("bar", "(2, 3)f4"), ("baz", "S3")], - ) - fill_values = None, b"", (0, ((0, 0, 0), (1, 1, 1)), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_nested(self): - d = np.array( - [ - (0, (0, ((0, 1), (1, 2), (2, 3)), 0), b"aaa"), - (1, (1, ((1, 
2), (2, 3), (3, 4)), 1), b"bbb"), - (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b"ccc"), - ], - dtype=[ - ("foo", "i8"), - ("bar", [("foo", "i4"), ("bar", "(3, 2)f4"), ("baz", "u1")]), - ("baz", "S3"), - ], - ) - fill_values = None, b"", (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_dtypes(self): - # integers - for dtype in "u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.arange(z.shape[0], dtype=dtype) - z[:] = a - assert_array_equal(a, z[:]) - - # floats - for dtype in "f2", "f4", "f8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.linspace(0, 1, z.shape[0], dtype=dtype) - z[:] = a - assert_array_almost_equal(a, z[:]) - - # check that datetime generic units are not allowed - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="M8") - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="m8") - - def test_object_arrays(self): - # an object_codec is required for object arrays - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object) - - # an object_codec is required for object arrays, but allow to be provided via - # filters to maintain API backwards compatibility - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - self.create_array(shape=10, chunks=3, dtype=object, filters=[MsgPack()]) - - # create an object array using an object codec - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack()) - - def test_object_arrays_vlen_text(self): - data = np.array(greetings * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=str) - - def test_object_arrays_vlen_bytes(self): - greetings_bytes = [g.encode("utf8") for g in greetings] - data = np.array(greetings_bytes * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenBytes()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=bytes) - - def test_object_arrays_vlen_array(self): - data = np.array( - [np.array([1, 3, 7]), np.array([5]), np.array([2, 8, 12])] * 1000, dtype=object - ) - - codecs = VLenArray(int), VLenArray("> 16) -# assert perm == "0o644" -# info = z.getinfo(baz_key) -# perm = oct(info.external_attr >> 16) -# # only for posix platforms -# if os.name == "posix": -# if self.version == 2: -# assert perm == "0o40775" -# else: -# # baz/ on v2, but baz on v3, so not a directory -# assert perm == "0o644" -# z.close() - -# def test_store_and_retrieve_ndarray(self): -# store = ZipStore("data/store.zip") -# x = np.array([[1, 2], [3, 4]]) -# store["foo"] = x -# y = np.frombuffer(store["foo"], dtype=x.dtype).reshape(x.shape) -# assert np.array_equiv(y, x) - - -# class TestDBMStore(StoreTests): -# def create_store(self, dimension_separator=None): -# path = mktemp(suffix=".anydbm") -# atexit.register(atexit_rmglob, path + "*") -# # create store using default dbm implementation -# store = DBMStore(path, flag="n", dimension_separator=dimension_separator) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" 
-# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - -# class TestDBMStoreDumb(TestDBMStore): -# def create_store(self, **kwargs): -# path = mktemp(suffix=".dumbdbm") -# atexit.register(atexit_rmglob, path + "*") - -# import dbm.dumb as dumbdbm - -# store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) -# return store - - -# class TestDBMStoreGnu(TestDBMStore): -# def create_store(self, **kwargs): -# gdbm = pytest.importorskip("dbm.gnu") -# path = mktemp(suffix=".gdbm") # pragma: no cover -# atexit.register(os.remove, path) # pragma: no cover -# store = DBMStore( -# path, flag="n", open=gdbm.open, write_lock=False, **kwargs -# ) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreNDBM(TestDBMStore): -# def create_store(self, **kwargs): -# ndbm = pytest.importorskip("dbm.ndbm") -# path = mktemp(suffix=".ndbm") # pragma: no cover -# atexit.register(atexit_rmglob, path + "*") # pragma: no cover -# store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreBerkeleyDB(TestDBMStore): -# def create_store(self, **kwargs): -# bsddb3 = pytest.importorskip("bsddb3") -# path = mktemp(suffix=".dbm") -# atexit.register(os.remove, path) -# store = DBMStore(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) -# return store - - -# class TestLMDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("lmdb") -# path = mktemp(suffix=".lmdb") -# atexit.register(atexit_rmtree, path) -# buffers = True -# store = LMDBStore(path, buffers=buffers, **kwargs) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - -# class TestSQLiteStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path, **kwargs) -# return store - -# def test_underscore_in_name(self): -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path) -# store["a"] = b"aaa" -# store["a_b"] = b"aa_bb" -# store.rmdir("a") -# assert "a_b" in store - - -# class TestSQLiteStoreInMemory(TestSQLiteStore): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# store = SQLiteStore(":memory:", **kwargs) -# return store - -# def test_pickle(self): - -# # setup store -# store = self.create_store() -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"quux" - -# # round-trip through pickle -# with pytest.raises(PicklingError): -# pickle.dumps(store) - - -# @skip_test_env_var("ZARR_TEST_MONGO") -# class TestMongoDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("pymongo") -# store = MongoDBStore( -# host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs -# ) -# # start with an empty store -# store.clear() -# return store - - -# @skip_test_env_var("ZARR_TEST_REDIS") -# class TestRedisStore(StoreTests): -# def create_store(self, **kwargs): -# # TODO: this is the default host for Redis on Travis, -# # we probably want to generalize this though -# pytest.importorskip("redis") -# store = RedisStore(host="localhost", port=6379, **kwargs) -# # start with an empty store -# store.clear() -# return store - - # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict From dd7d3a8fbca10ad6d7d662137318f53716e1088d Mon 
Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 11 Apr 2024 22:33:01 +0200 Subject: [PATCH 09/22] chore: remove v3_storage_transformers.py again --- src/zarr/_storage/v3_storage_transformers.py | 382 ------------------- 1 file changed, 382 deletions(-) delete mode 100644 src/zarr/_storage/v3_storage_transformers.py diff --git a/src/zarr/_storage/v3_storage_transformers.py b/src/zarr/_storage/v3_storage_transformers.py deleted file mode 100644 index cb11cea52e..0000000000 --- a/src/zarr/_storage/v3_storage_transformers.py +++ /dev/null @@ -1,382 +0,0 @@ -import functools -import itertools -import os -from typing import NamedTuple, Tuple, Optional, Union, Iterator - -from numcodecs.compat import ensure_bytes -import numpy as np - -from zarr._storage.store import StorageTransformer, StoreV3, _rmdir_from_keys_v3 -from zarr.util import normalize_storage_path - - -MAX_UINT_64 = 2**64 - 1 - - -v3_sharding_available = os.environ.get("ZARR_V3_SHARDING", "0").lower() not in ["0", "false"] - - -def assert_zarr_v3_sharding_available(): - if not v3_sharding_available: - raise NotImplementedError( - "Using V3 sharding is experimental and not yet finalized! To enable support, set:\n" - "ZARR_V3_SHARDING=1" - ) # pragma: no cover - - -class _ShardIndex(NamedTuple): - store: "ShardingStorageTransformer" - # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) - offsets_and_lengths: np.ndarray - - def __localize_chunk__(self, chunk: Tuple[int, ...]) -> Tuple[int, ...]: - return tuple( - chunk_i % shard_i for chunk_i, shard_i in zip(chunk, self.store.chunks_per_shard) - ) - - def is_all_empty(self) -> bool: - return np.array_equiv(self.offsets_and_lengths, MAX_UINT_64) - - def get_chunk_slice(self, chunk: Tuple[int, ...]) -> Optional[slice]: - localized_chunk = self.__localize_chunk__(chunk) - chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk] - if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): - return None - else: - return slice(int(chunk_start), int(chunk_start + chunk_len)) - - def set_chunk_slice(self, chunk: Tuple[int, ...], chunk_slice: Optional[slice]) -> None: - localized_chunk = self.__localize_chunk__(chunk) - if chunk_slice is None: - self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64) - else: - self.offsets_and_lengths[localized_chunk] = ( - chunk_slice.start, - chunk_slice.stop - chunk_slice.start, - ) - - def to_bytes(self) -> bytes: - return self.offsets_and_lengths.tobytes(order="C") - - @classmethod - def from_bytes( - cls, buffer: Union[bytes, bytearray], store: "ShardingStorageTransformer" - ) -> "_ShardIndex": - try: - return cls( - store=store, - offsets_and_lengths=np.frombuffer(bytearray(buffer), dtype=" None: - super().__init__(_type) - assert test_value == self.TEST_CONSTANT - self.test_value = test_value - - -class ShardingStorageTransformer(StorageTransformer): # lgtm[py/missing-equals] - """Implements sharding as a storage transformer, as described in the spec: - https://zarr-specs.readthedocs.io/en/latest/extensions/storage-transformers/sharding/v1.0.html - https://purl.org/zarr/spec/storage_transformers/sharding/1.0 - """ - - extension_uri = "https://purl.org/zarr/spec/storage_transformers/sharding/1.0" - valid_types = ["indexed"] - - def __init__(self, _type, chunks_per_shard) -> None: - assert_zarr_v3_sharding_available() - super().__init__(_type) - if isinstance(chunks_per_shard, int): - chunks_per_shard = (chunks_per_shard,) - else: - chunks_per_shard = tuple(int(i) for i in chunks_per_shard) - if 
chunks_per_shard == (): - chunks_per_shard = (1,) - self.chunks_per_shard = chunks_per_shard - self._num_chunks_per_shard = functools.reduce(lambda x, y: x * y, chunks_per_shard, 1) - self._dimension_separator = None - self._data_key_prefix = None - - def _copy_for_array(self, array, inner_store): - transformer_copy = super()._copy_for_array(array, inner_store) - transformer_copy._dimension_separator = array._dimension_separator - transformer_copy._data_key_prefix = array._data_key_prefix - if len(array._shape) > len(self.chunks_per_shard): - # The array shape might be longer when initialized with subdtypes. - # subdtypes dimensions come last, therefore padding chunks_per_shard - # with ones, effectively disabling sharding on the unlisted dimensions. - transformer_copy.chunks_per_shard += (1,) * ( - len(array._shape) - len(self.chunks_per_shard) - ) - return transformer_copy - - @property - def dimension_separator(self) -> str: - assert ( - self._dimension_separator is not None - ), "dimension_separator is not initialized, first get a copy via _copy_for_array." - return self._dimension_separator - - def _is_data_key(self, key: str) -> bool: - assert ( - self._data_key_prefix is not None - ), "data_key_prefix is not initialized, first get a copy via _copy_for_array." - return key.startswith(self._data_key_prefix) - - def _key_to_shard(self, chunk_key: str) -> Tuple[str, Tuple[int, ...]]: - prefix, _, chunk_string = chunk_key.rpartition("c") - chunk_subkeys = ( - tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) - ) - shard_key_tuple = ( - subkey // shard_i for subkey, shard_i in zip(chunk_subkeys, self.chunks_per_shard) - ) - shard_key = prefix + "c" + self.dimension_separator.join(map(str, shard_key_tuple)) - return shard_key, chunk_subkeys - - def _get_index_from_store(self, shard_key: str) -> _ShardIndex: - # At the end of each shard 2*64bit per chunk for offset and length define the index: - index_bytes = self.inner_store.get_partial_values( - [(shard_key, (-16 * self._num_chunks_per_shard, None))] - )[0] - if index_bytes is None: - raise KeyError(shard_key) - return _ShardIndex.from_bytes( - index_bytes, - self, - ) - - def _get_index_from_buffer(self, buffer: Union[bytes, bytearray]) -> _ShardIndex: - # At the end of each shard 2*64bit per chunk for offset and length define the index: - return _ShardIndex.from_bytes(buffer[-16 * self._num_chunks_per_shard :], self) - - def _get_chunks_in_shard(self, shard_key: str) -> Iterator[Tuple[int, ...]]: - _, _, chunk_string = shard_key.rpartition("c") - shard_key_tuple = ( - tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) - ) - for chunk_offset in itertools.product(*(range(i) for i in self.chunks_per_shard)): - yield tuple( - shard_key_i * shards_i + offset_i - for shard_key_i, offset_i, shards_i in zip( - shard_key_tuple, chunk_offset, self.chunks_per_shard - ) - ) - - def __getitem__(self, key): - if self._is_data_key(key): - if self.supports_efficient_get_partial_values: - # Use the partial implementation, which fetches the index separately - value = self.get_partial_values([(key, (0, None))])[0] - if value is None: - raise KeyError(key) - else: - return value - shard_key, chunk_subkey = self._key_to_shard(key) - try: - full_shard_value = self.inner_store[shard_key] - except KeyError: - raise KeyError(key) - index = self._get_index_from_buffer(full_shard_value) - chunk_slice = index.get_chunk_slice(chunk_subkey) - if chunk_slice is not None: - return 
full_shard_value[chunk_slice] - else: - raise KeyError(key) - else: - return self.inner_store.__getitem__(key) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - if self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - chunks_to_read = set(self._get_chunks_in_shard(shard_key)) - chunks_to_read.remove(chunk_subkey) - new_content = {chunk_subkey: value} - try: - if self.supports_efficient_get_partial_values: - index = self._get_index_from_store(shard_key) - full_shard_value = None - else: - full_shard_value = self.inner_store[shard_key] - index = self._get_index_from_buffer(full_shard_value) - except KeyError: - index = _ShardIndex.create_empty(self) - else: - chunk_slices = [ - (chunk_to_read, index.get_chunk_slice(chunk_to_read)) - for chunk_to_read in chunks_to_read - ] - valid_chunk_slices = [ - (chunk_to_read, chunk_slice) - for chunk_to_read, chunk_slice in chunk_slices - if chunk_slice is not None - ] - # use get_partial_values if less than half of the available chunks must be read: - # (This can be changed when set_partial_values can be used efficiently.) - use_partial_get = ( - self.supports_efficient_get_partial_values - and len(valid_chunk_slices) < len(chunk_slices) / 2 - ) - - if use_partial_get: - chunk_values = self.inner_store.get_partial_values( - [ - ( - shard_key, - ( - chunk_slice.start, - chunk_slice.stop - chunk_slice.start, - ), - ) - for _, chunk_slice in valid_chunk_slices - ] - ) - for chunk_value, (chunk_to_read, _) in zip(chunk_values, valid_chunk_slices): - new_content[chunk_to_read] = chunk_value - else: - if full_shard_value is None: - full_shard_value = self.inner_store[shard_key] - for chunk_to_read, chunk_slice in valid_chunk_slices: - if chunk_slice is not None: - new_content[chunk_to_read] = full_shard_value[chunk_slice] - - shard_content = b"" - for chunk_subkey, chunk_content in new_content.items(): - chunk_slice = slice(len(shard_content), len(shard_content) + len(chunk_content)) - index.set_chunk_slice(chunk_subkey, chunk_slice) - shard_content += chunk_content - # Appending the index at the end of the shard: - shard_content += index.to_bytes() - self.inner_store[shard_key] = shard_content - else: # pragma: no cover - self.inner_store[key] = value - - def __delitem__(self, key): - if self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - try: - index = self._get_index_from_store(shard_key) - except KeyError: - raise KeyError(key) - - index.set_chunk_slice(chunk_subkey, None) - - if index.is_all_empty(): - del self.inner_store[shard_key] - else: - index_bytes = index.to_bytes() - self.inner_store.set_partial_values([(shard_key, -len(index_bytes), index_bytes)]) - else: # pragma: no cover - del self.inner_store[key] - - def _shard_key_to_original_keys(self, key: str) -> Iterator[str]: - if self._is_data_key(key): - index = self._get_index_from_store(key) - prefix, _, _ = key.rpartition("c") - for chunk_tuple in self._get_chunks_in_shard(key): - if index.get_chunk_slice(chunk_tuple) is not None: - yield prefix + "c" + self.dimension_separator.join(map(str, chunk_tuple)) - else: - yield key - - def __iter__(self) -> Iterator[str]: - for key in self.inner_store: - yield from self._shard_key_to_original_keys(key) - - def __len__(self): - return sum(1 for _ in self.keys()) - - def get_partial_values(self, key_ranges): - if self.supports_efficient_get_partial_values: - transformed_key_ranges = [] - cached_indices = {} - none_indices = [] - for i, (key, range_) in enumerate(key_ranges): - if 
self._is_data_key(key): - shard_key, chunk_subkey = self._key_to_shard(key) - try: - index = cached_indices[shard_key] - except KeyError: - try: - index = self._get_index_from_store(shard_key) - except KeyError: - none_indices.append(i) - continue - cached_indices[shard_key] = index - chunk_slice = index.get_chunk_slice(chunk_subkey) - if chunk_slice is None: - none_indices.append(i) - continue - range_start, range_length = range_ - if range_length is None: - range_length = chunk_slice.stop - chunk_slice.start - transformed_key_ranges.append( - (shard_key, (range_start + chunk_slice.start, range_length)) - ) - else: # pragma: no cover - transformed_key_ranges.append((key, range_)) - values = self.inner_store.get_partial_values(transformed_key_ranges) - for i in none_indices: - values.insert(i, None) - return values - else: - return StoreV3.get_partial_values(self, key_ranges) - - def supports_efficient_set_partial_values(self): - return False - - def set_partial_values(self, key_start_values): - # This does not yet implement efficient set_partial_values - StoreV3.set_partial_values(self, key_start_values) - - def rename(self, src_path: str, dst_path: str) -> None: - StoreV3.rename(self, src_path, dst_path) # type: ignore[arg-type] - - def list_prefix(self, prefix): - return StoreV3.list_prefix(self, prefix) - - def erase_prefix(self, prefix): - if self._is_data_key(prefix): - StoreV3.erase_prefix(self, prefix) - else: - self.inner_store.erase_prefix(prefix) - - def rmdir(self, path=None): - path = normalize_storage_path(path) - _rmdir_from_keys_v3(self, path) - - def __contains__(self, key): - if self._is_data_key(key): - shard_key, chunk_subkeys = self._key_to_shard(key) - try: - index = self._get_index_from_store(shard_key) - except KeyError: - return False - chunk_slice = index.get_chunk_slice(chunk_subkeys) - return chunk_slice is not None - else: - return self._inner_store.__contains__(key)
From f3902c49b190f911838340aaaff6dc63db2875be Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 11 Apr 2024 22:31:57 +0200 Subject: [PATCH 11/22] chore: prune out n5, abs, sqlite, zip, redis, mongodb, dbm, lmdb stores --- src/zarr/__init__.py | 8 - src/zarr/_storage/absstore.py | 224 -------- src/zarr/n5.py | 896 ----------------------------- src/zarr/storage.py | 1009 +-------------------------------- tests/test_core.py | 389 +------------ tests/test_creation.py | 35 -- tests/test_hierarchy.py | 88 +-- tests/test_meta_array.py | 6 +- tests/test_n5.py | 53 -- tests/test_storage.py | 492 ---------------- 10 files changed, 7 insertions(+), 3193 deletions(-) delete mode 100644 src/zarr/_storage/absstore.py delete mode 100644 src/zarr/n5.py delete mode 100644 tests/test_n5.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 601b1295ab..725ad0a783 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -30,22 +30,14 @@ ) from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group -from zarr.n5 import N5Store, N5FSStore from zarr.storage import ( - ABSStore, - DBMStore, DictStore, DirectoryStore, KVStore, - LMDBStore, LRUStoreCache, MemoryStore, - MongoDBStore, NestedDirectoryStore, - RedisStore, - SQLiteStore, TempStore, - ZipStore, ) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer from zarr._version import version as __version__ diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py deleted file mode 100644 index d8e292535c..0000000000 --- a/src/zarr/_storage/absstore.py +++ /dev/null @@ -1,224 +0,0 @@ -"""This module contains storage classes related to Azure Blob Storage (ABS)""" - -import warnings -from numcodecs.compat import ensure_bytes -from zarr.util import normalize_storage_path -from zarr._storage.store import Store - -__doctest_requires__ = { - ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], -} - - -class ABSStore(Store): - """Storage class using Azure Blob Storage (ABS). - - Parameters - ---------- - container : string - The name of the ABS container to use. - - .. deprecated:: - Use ``client`` instead. - - prefix : string - Location of the "directory" to use as the root of the storage hierarchy - within the container. - - account_name : string - The Azure blob storage account name. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - account_key : string - The Azure blob storage account access key. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - blob_service_kwargs : dictionary - Extra arguments to be passed into the azure blob client, for e.g. when - using the emulator, pass in blob_service_kwargs={'is_emulated': True}.
- - .. deprecated:: 2.8.3 - Use ``client`` instead. - - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - client : azure.storage.blob.ContainerClient, optional - And ``azure.storage.blob.ContainerClient`` to connect with. See - `here `_ # noqa - for more. - - .. versionadded:: 2.8.3 - - Notes - ----- - In order to use this store, you must install the Microsoft Azure Storage SDK for Python, - ``azure-storage-blob>=12.5.0``. - """ # noqa: E501 - - def __init__( - self, - container=None, - prefix="", - account_name=None, - account_key=None, - blob_service_kwargs=None, - dimension_separator=None, - client=None, - ): - self._dimension_separator = dimension_separator - self.prefix = normalize_storage_path(prefix) - if client is None: - # deprecated option, try to construct the client for them - msg = ( - "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs'" - "is deprecated. Provide and instance of 'azure.storage.blob.ContainerClient' " - "'client' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - from azure.storage.blob import ContainerClient - - blob_service_kwargs = blob_service_kwargs or {} - client = ContainerClient( - "https://{}.blob.core.windows.net/".format(account_name), - container, - credential=account_key, - **blob_service_kwargs, - ) - - self.client = client - self._container = container - self._account_name = account_name - self._account_key = account_key - - @staticmethod - def _warn_deprecated(property_): - msg = ( - "The {} property is deprecated and will be removed in a future " - "version. Get the property from 'ABSStore.client' instead." - ) - warnings.warn(msg.format(property_), FutureWarning, stacklevel=3) - - @property - def container(self): - self._warn_deprecated("container") - return self._container - - @property - def account_name(self): - self._warn_deprecated("account_name") - return self._account_name - - @property - def account_key(self): - self._warn_deprecated("account_key") - return self._account_key - - def _append_path_to_prefix(self, path): - if self.prefix == "": - return normalize_storage_path(path) - else: - return "/".join([self.prefix, normalize_storage_path(path)]) - - @staticmethod - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - if prefix: - return path_norm[(len(prefix_norm) + 1) :] - else: - return path_norm - - def __getitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - blob_name = self._append_path_to_prefix(key) - try: - return self.client.download_blob(blob_name).readall() - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % blob_name) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - blob_name = self._append_path_to_prefix(key) - self.client.upload_blob(blob_name, value, overwrite=True) - - def __delitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - try: - self.client.delete_blob(self._append_path_to_prefix(key)) - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % key) - - def __eq__(self, other): - return ( - isinstance(other, ABSStore) - and self.client == other.client - and self.prefix == other.prefix - ) - - def keys(self): - return list(self.__iter__()) - - def __iter__(self): - if self.prefix: - list_blobs_prefix = self.prefix + "/" - else: - list_blobs_prefix = None - for 
blob in self.client.list_blobs(list_blobs_prefix): - yield self._strip_prefix_from_path(blob.name, self.prefix) - - def __len__(self): - return len(self.keys()) - - def __contains__(self, key): - blob_name = self._append_path_to_prefix(key) - return self.client.get_blob_client(blob_name).exists() - - def listdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - items = [ - self._strip_prefix_from_path(blob.name, dir_path) - for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") - ] - return items - - def rmdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - for blob in self.client.list_blobs(name_starts_with=dir_path): - self.client.delete_blob(blob) - - def getsize(self, path=None): - store_path = normalize_storage_path(path) - fs_path = self._append_path_to_prefix(store_path) - if fs_path: - blob_client = self.client.get_blob_client(fs_path) - else: - blob_client = None - - if blob_client and blob_client.exists(): - return blob_client.get_blob_properties().size - else: - size = 0 - if fs_path == "": - fs_path = None - elif not fs_path.endswith("/"): - fs_path += "/" - for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): - blob_client = self.client.get_blob_client(blob) - if blob_client.exists(): - size += blob_client.get_blob_properties().size - return size - - def clear(self): - self.rmdir() diff --git a/src/zarr/n5.py b/src/zarr/n5.py deleted file mode 100644 index 79bab20576..0000000000 --- a/src/zarr/n5.py +++ /dev/null @@ -1,896 +0,0 @@ -"""This module contains a storage class and codec to support the N5 format. -""" -import os -import struct -import sys -from typing import Any, Dict, Optional, cast -import warnings - -import numpy as np -from numcodecs.abc import Codec -from numcodecs.compat import ndarray_copy -from numcodecs.registry import get_codec, register_codec - -from .meta import ZARR_FORMAT, json_dumps, json_loads -from .storage import FSStore -from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path -from .storage import array_meta_key as zarr_array_meta_key -from .storage import attrs_key as zarr_attrs_key -from .storage import group_meta_key as zarr_group_meta_key - -N5_FORMAT = "2.0.0" - -zarr_to_n5_keys = [ - ("chunks", "blockSize"), - ("dtype", "dataType"), - ("compressor", "compression"), - ("shape", "dimensions"), -] -n5_attrs_key = "attributes.json" -n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] - - -class N5Store(NestedDirectoryStore): - """Storage class using directories and files on a standard file system, - following the N5 format (https://github.com/saalfeldlab/n5). - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5Store('data/array.n5') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] 
= 42 - - Store a group:: - - >>> store = zarr.N5Store('data/group.n5') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - - This is an experimental feature. - - Safe to write in multiple threads or processes. - - """ - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs: - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # remove previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - super().__delitem__(key_new) - - def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = 
invert_chunk_coords(key) - else: - key_new = key - - return super().__contains__(key_new) - - def __eq__(self, other): - return isinstance(other, N5Store) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - path = cast(str, path) - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. - children = super().listdir(path=path) - - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and os.path.isdir(entry_path): - for dir_path, _, file_names in os.walk(entry_path): - for file_name in file_names: - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path + os.path.sep)[1] - new_child = rel_path.replace(os.path.sep, ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - return sorted(children) - - else: - return children - - def _load_n5_attrs(self, path: str) -> Dict[str, Any]: - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - if not path.endswith(n5_attrs_key): - attrs_key = os.path.join(path, n5_attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -class N5FSStore(FSStore): - """Implementation of the N5 format (https://github.com/saalfeldlab/n5) - using `fsspec`, which allows storage on a variety of filesystems. Based - on `zarr.N5Store`. - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] 
= 42 - - Store a group:: - - >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - This is an experimental feature. - Safe to write in multiple threads or processes. - - Be advised that the `_dimension_separator` property of this store - (and arrays it creates) is ".", but chunks saved by this store will - in fact be "/" separated, as proscribed by the N5 format. - - This is counter-intuitive (to say the least), but not arbitrary. - Chunks in N5 format are stored with reversed dimension order - relative to Zarr chunks: a chunk of a 3D Zarr array would be stored - on a file system as `/0/1/2`, but in N5 the same chunk would be - stored as `/2/1/0`. Therefore, stores targeting N5 must intercept - chunk keys and flip the order of the dimensions before writing to - storage, and this procedure requires chunk keys with "." separated - dimensions, hence the Zarr arrays targeting N5 have the deceptive - "." dimension separator. - """ - - _array_meta_key = "attributes.json" - _group_meta_key = "attributes.json" - _attrs_key = "attributes.json" - - def __init__(self, *args, **kwargs): - if "dimension_separator" in kwargs: - warnings.warn("Keyword argument `dimension_separator` will be ignored") - kwargs["dimension_separator"] = "." - super().__init__(*args, **kwargs) - - @staticmethod - def _swap_separator(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - def _normalize_key(self, key: str): - if is_chunk_key(key): - key = invert_chunk_coords(key) - - key = normalize_storage_path(key).lstrip("/") - if key: - *bits, end = key.split("/") - - if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): - end = end.replace(".", "/") - key = "/".join(bits + [end]) - return key.lower() if self.normalize_keys else key - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - 
n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs.keys(): - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # replace previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - elif is_chunk_key(key): - key_new = self._swap_separator(key) - else: - key_new = key - super().__delitem__(key_new) - - def __contains__(self, key: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - if key_new not in self: - return False - # group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - return super().__contains__(key_new) - - def __eq__(self, other: Any): - return isinstance(other, N5FSStore) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. 
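# A minimal, self-contained sketch of the chunk-key translation performed by
# the N5 stores in this hunk (see the N5FSStore notes above): Zarr's
# "."-separated chunk coordinates are reversed and re-joined with "/" before
# they reach storage. The helper name and the regex here are illustrative
# assumptions, not the store's actual internals.
import re

_chunk_key_sketch = re.compile(r"^\d+(\.\d+)*$")


def to_n5_chunk_key(key: str) -> str:
    """E.g. 'foo/bar/0.1.2' (Zarr order) becomes 'foo/bar/2/1/0' (N5 order)."""
    prefix, _, last = key.rpartition("/")
    if _chunk_key_sketch.match(last):
        last = "/".join(reversed(last.split(".")))
    return f"{prefix}/{last}" if prefix else last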
- children = super().listdir(path=path) - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._array_meta_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and self.fs.isdir(entry_path): - for file_name in self.fs.find(entry_path): - file_path = os.path.join(root_path, file_name) - rel_path = file_path.split(root_path)[1] - new_child = rel_path.lstrip("/").replace("/", ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._group_meta_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - return sorted(children) - else: - return children - - def _load_n5_attrs(self, path: str): - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - if not path.endswith(self._attrs_key): - attrs_key = os.path.join(path, self._attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -def is_chunk_key(key: str): - rv = False - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - rv = bool(_prog_ckey.match(last_segment)) - return rv - - -def invert_chunk_coords(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - -def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from zarr to N5 format.""" - del group_metadata["zarr_format"] - # TODO: This should only exist at the top-level - group_metadata["n5"] = N5_FORMAT - return group_metadata - - -def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from N5 to zarr format.""" - # This only exists at the top level - group_metadata.pop("n5", None) - group_metadata["zarr_format"] = ZARR_FORMAT - return group_metadata - - -def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: - """Convert array metadata from zarr to N5 format. 
If the `top_level` keyword argument is True, - then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" - - for f, t in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - del array_metadata["zarr_format"] - if top_level: - array_metadata["n5"] = N5_FORMAT - try: - dtype = np.dtype(array_metadata["dataType"]) - except TypeError: - raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") - - array_metadata["dataType"] = dtype.name - array_metadata["dimensions"] = array_metadata["dimensions"][::-1] - array_metadata["blockSize"] = array_metadata["blockSize"][::-1] - - if "fill_value" in array_metadata: - if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: - raise ValueError( - f"""Received fill_value = {array_metadata['fill_value']}, - but N5 only supports fill_value = 0""" - ) - del array_metadata["fill_value"] - - if "order" in array_metadata: - if array_metadata["order"] != "C": - raise ValueError( - f"Received order = {array_metadata['order']}, but N5 only supports order = C" - ) - del array_metadata["order"] - - if "filters" in array_metadata: - if array_metadata["filters"] != [] and array_metadata["filters"] is not None: - raise ValueError("Received filters, but N5 storage does not support zarr filters") - del array_metadata["filters"] - - assert "compression" in array_metadata - compressor_config = array_metadata["compression"] - compressor_config = compressor_config_to_n5(compressor_config) - array_metadata["compression"] = compressor_config - - if "dimension_separator" in array_metadata: - del array_metadata["dimension_separator"] - - return array_metadata - - -def array_metadata_to_zarr( - array_metadata: Dict[str, Any], top_level: bool = False -) -> Dict[str, Any]: - """Convert array metadata from N5 to zarr format. - If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" - for t, f in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - if top_level: - array_metadata.pop("n5") - array_metadata["zarr_format"] = ZARR_FORMAT - - array_metadata["shape"] = array_metadata["shape"][::-1] - array_metadata["chunks"] = array_metadata["chunks"][::-1] - array_metadata["fill_value"] = 0 # also if None was requested - array_metadata["order"] = "C" - array_metadata["filters"] = [] - array_metadata["dimension_separator"] = "." 
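# A tiny worked example of the axis-order flip applied just above (the
# concrete values are illustrative only): a Zarr array with shape
# (100, 200, 300) and chunks (10, 20, 30) corresponds to N5 metadata with
# "dimensions": [300, 200, 100] and "blockSize": [30, 20, 10], and back.
zarr_meta_sketch = {"shape": (100, 200, 300), "chunks": (10, 20, 30)}
n5_meta_sketch = {
    "dimensions": list(zarr_meta_sketch["shape"])[::-1],  # [300, 200, 100]
    "blockSize": list(zarr_meta_sketch["chunks"])[::-1],  # [30, 20, 10]
}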
- array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str - - compressor_config = array_metadata["compressor"] - compressor_config = compressor_config_to_zarr(compressor_config) - array_metadata["compressor"] = { - "id": N5ChunkWrapper.codec_id, - "compressor_config": compressor_config, - "dtype": array_metadata["dtype"], - "chunk_shape": array_metadata["chunks"], - } - - return array_metadata - - -def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: - """Get all zarr attributes from an N5 attributes dictionary (i.e., - all non-keyword attributes).""" - - # remove all N5 keywords - for n5_key in n5_keywords: - if n5_key in attrs: - del attrs[n5_key] - - return attrs - - -def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: - return {"type": "raw"} - else: - _compressor_config = compressor_config - - # peel wrapper, if present - if _compressor_config["id"] == N5ChunkWrapper.codec_id: - _compressor_config = _compressor_config["compressor_config"] - - codec_id = _compressor_config["id"] - n5_config = {"type": codec_id} - - if codec_id == "bz2": - n5_config["type"] = "bzip2" - n5_config["blockSize"] = _compressor_config["level"] - - elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] - n5_config["clevel"] = _compressor_config["clevel"] - n5_config["shuffle"] = _compressor_config["shuffle"] - n5_config["blocksize"] = _compressor_config["blocksize"] - - elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. - # Note: 4 is the default, which is lzma.CHECK_CRC64. - if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: - n5_config["type"] = "xz" - else: - warnings.warn( - "Not all N5 implementations support lzma compression (yet). You " - "might not be able to open the dataset with another N5 library.", - RuntimeWarning, - ) - n5_config["format"] = _compressor_config["format"] - n5_config["check"] = _compressor_config["check"] - n5_config["filters"] = _compressor_config["filters"] - - # The default is lzma.PRESET_DEFAULT, which is 6. 
- if _compressor_config["preset"]: - n5_config["preset"] = _compressor_config["preset"] - else: - n5_config["preset"] = 6 - - elif codec_id == "zlib": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = True - - elif codec_id == "gzip": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = False - - else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) - - return n5_config - - -def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] - zarr_config = {"id": codec_id} - - if codec_id == "bzip2": - zarr_config["id"] = "bz2" - zarr_config["level"] = compressor_config["blockSize"] - - elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] - zarr_config["clevel"] = compressor_config["clevel"] - zarr_config["shuffle"] = compressor_config["shuffle"] - zarr_config["blocksize"] = compressor_config["blocksize"] - - elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] - zarr_config["check"] = compressor_config["check"] - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = compressor_config["filters"] - - elif codec_id == "xz": - zarr_config["id"] = "lzma" - zarr_config["format"] = 1 # lzma.FORMAT_XZ - zarr_config["check"] = -1 - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = None - - elif codec_id == "gzip": - if "useZlib" in compressor_config and compressor_config["useZlib"]: - zarr_config["id"] = "zlib" - zarr_config["level"] = compressor_config["level"] - else: - zarr_config["id"] = "gzip" - zarr_config["level"] = compressor_config["level"] - - elif codec_id == "raw": - return None - - else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) - - return zarr_config - - -class N5ChunkWrapper(Codec): - codec_id = "n5_wrapper" - - def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) - self.chunk_shape = tuple(chunk_shape) - # is the dtype a little endian format? 
- self._little_endian = self.dtype.byteorder == "<" or ( - self.dtype.byteorder == "=" and sys.byteorder == "little" - ) - - if compressor: - if compressor_config is not None: - raise ValueError("Only one of compressor_config or compressor should be given.") - compressor_config = compressor.get_config() - - if compressor_config is None and compressor is None or compressor_config["id"] == "raw": - self.compressor_config = None - self._compressor = None - else: - self._compressor = get_codec(compressor_config) - self.compressor_config = self._compressor.get_config() - - def get_config(self): - config = {"id": self.codec_id, "compressor_config": self.compressor_config} - return config - - def encode(self, chunk): - assert chunk.flags.c_contiguous - - header = self._create_header(chunk) - chunk = self._to_big_endian(chunk) - - if self._compressor: - return header + self._compressor.encode(chunk) - else: - return header + chunk.tobytes(order="A") - - def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) - chunk = chunk[len_header:] - - if out is not None: - # out should only be used if we read a complete chunk - assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( - self.chunk_shape, chunk_shape - ) - - if self._compressor: - self._compressor.decode(chunk, out) - else: - ndarray_copy(chunk, out) - - # we can byteswap in-place - if self._little_endian: - out.byteswap(True) - - return out - - else: - if self._compressor: - chunk = self._compressor.decode(chunk) - - # more expensive byteswap - chunk = self._from_big_endian(chunk) - - # read partial chunk - if chunk_shape != self.chunk_shape: - chunk = np.frombuffer(chunk, dtype=self.dtype) - chunk = chunk.reshape(chunk_shape) - complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) - target_slices = tuple(slice(0, s) for s in chunk_shape) - complete_chunk[target_slices] = chunk - chunk = complete_chunk - - return chunk - - @staticmethod - def _create_header(chunk): - mode = struct.pack(">H", 0) - num_dims = struct.pack(">H", len(chunk.shape)) - shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) - - return mode + num_dims + shape - - @staticmethod - def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] - shape = tuple( - struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) - )[::-1] - - len_header = 4 + num_dims * 4 - - return len_header, shape - - def _to_big_endian(self, data): - # assumes data is ndarray - - if self._little_endian: - return data.byteswap() - return data - - def _from_big_endian(self, data): - # assumes data is byte array in big endian - - if not self._little_endian: - return data - - a = np.frombuffer(data, self.dtype.newbyteorder(">")) - return a.astype(self.dtype) - - -register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index a7bd22a6b9..7d4ae3a56c 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -17,25 +17,19 @@ import atexit import errno import glob -import multiprocessing -import operator import os import re import shutil -import sys import tempfile import warnings -import zipfile from collections import OrderedDict from collections.abc import MutableMapping from os import scandir -from pickle import PicklingError -from threading import Lock, RLock +from threading import Lock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any import uuid -import time -from numcodecs.compat import 
ensure_bytes, ensure_text, ensure_contiguous_ndarray_like +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context @@ -51,7 +45,6 @@ from zarr.util import ( buffer_size, json_loads, - nolock, normalize_chunks, normalize_dimension_separator, normalize_dtype, @@ -63,7 +56,6 @@ ensure_contiguous_ndarray_or_bytes, ) -from zarr._storage.absstore import ABSStore # noqa: F401 from zarr._storage.store import ( # noqa: F401 _listdir_from_keys, _rename_from_keys, @@ -79,13 +71,6 @@ Store, ) -__doctest_requires__ = { - ("RedisStore", "RedisStore.*"): ["redis"], - ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], - ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], -} - - try: # noinspection PyUnresolvedReferences from zarr.codecs import Blosc @@ -142,12 +127,6 @@ def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore return FSStore(store, mode=mode, **(storage_options or {})) elif storage_options: raise ValueError("storage_options passed with non-fsspec path") - if store.endswith(".zip"): - return ZipStore(store, mode=mode) - elif store.endswith(".n5"): - from zarr.n5 import N5Store - - return N5Store(store) else: return DirectoryStore(store) else: @@ -1508,258 +1487,6 @@ def __eq__(self, other): return isinstance(other, NestedDirectoryStore) and self.path == other.path -# noinspection PyPep8Naming -class ZipStore(Store): - """Storage class using a Zip file. - - Parameters - ---------- - path : string - Location of file. - compression : integer, optional - Compression method to use when writing to the archive. - allowZip64 : bool, optional - If True (the default) will create ZIP files that use the ZIP64 - extensions when the zipfile is larger than 2 GiB. If False - will raise an exception when the ZIP file would require ZIP64 - extensions. - mode : string, optional - One of 'r' to read an existing file, 'w' to truncate and write a new - file, 'a' to append to an existing file, or 'x' to exclusively create - and write a new file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.ZipStore('data/array.zip', mode='w') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.ZipStore('data/group.zip', mode='w') - >>> root = zarr.group(store=store) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a ZipStore, the ``close()`` method must be called, otherwise - essential data will not be written to the underlying Zip file. The ZipStore - class also supports the context manager protocol, which ensures the ``close()`` - method is called on leaving the context, e.g.:: - - >>> with zarr.ZipStore('data/array.zip', mode='w') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store) - ... z[...] = 42 - ... # no need to call store.close() - - Notes - ----- - Each chunk of an array is stored as a separate entry in the Zip file. Note - that Zip files do not provide any way to remove or replace existing entries. - If an attempt is made to replace an entry, then a warning is generated by - the Python standard library about a duplicate Zip file entry. 
This can be - triggered if you attempt to write data to a Zarr array more than once, - e.g.:: - - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> z = zarr.zeros(100, chunks=10, store=store) - >>> # first write OK - ... z[...] = 42 - >>> # second write generates warnings - ... z[...] = 42 # doctest: +SKIP - >>> store.close() - - This can also happen in a more subtle situation, where data are written only - once to a Zarr array, but the write operations are not aligned with chunk - boundaries, e.g.:: - - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> z = zarr.zeros(100, chunks=10, store=store) - >>> z[5:15] = 42 - >>> # write overlaps chunk previously written, generates warnings - ... z[15:25] = 42 # doctest: +SKIP - - To avoid creating duplicate entries, only write data once, and align writes - with chunk boundaries. This alignment is done automatically if you call - ``z[...] = ...`` or create an array from existing data via :func:`zarr.array`. - - Alternatively, use a :class:`DirectoryStore` when writing the data, then - manually Zip the directory and use the Zip file for subsequent reads. - Take note that the files in the Zip file must be relative to the root of the - Zarr archive. You may find it easier to create such a Zip file with ``7z``, e.g.:: - - 7z a -tzip archive.zarr.zip archive.zarr/. - - Safe to write in multiple threads but not in multiple processes. - - """ - - _erasable = False - - def __init__( - self, - path, - compression=zipfile.ZIP_STORED, - allowZip64=True, - mode="a", - dimension_separator=None, - ): - # store properties - path = os.path.abspath(path) - self.path = path - self.compression = compression - self.allowZip64 = allowZip64 - self.mode = mode - self._dimension_separator = dimension_separator - - # Current understanding is that zipfile module in stdlib is not thread-safe, - # and so locking is required for both read and write. However, this has not - # been investigated in detail, perhaps no lock is needed if mode='r'. 
- self.mutex = RLock() - - # open zip file - self.zf = zipfile.ZipFile(path, mode=mode, compression=compression, allowZip64=allowZip64) - - def __getstate__(self): - self.flush() - return self.path, self.compression, self.allowZip64, self.mode - - def __setstate__(self, state): - path, compression, allowZip64, mode = state - # if initially opened with mode 'w' or 'x', re-open in mode 'a' so file doesn't - # get clobbered - if mode in "wx": - mode = "a" - self.__init__(path=path, compression=compression, allowZip64=allowZip64, mode=mode) - - def close(self): - """Closes the underlying zip file, ensuring all records are written.""" - with self.mutex: - self.zf.close() - - def flush(self): - """Closes the underlying zip file, ensuring all records are written, - then re-opens the file for further modifications.""" - if self.mode != "r": - with self.mutex: - self.zf.close() - # N.B., re-open with mode 'a' regardless of initial mode so we don't wipe - # what's been written - self.zf = zipfile.ZipFile( - self.path, mode="a", compression=self.compression, allowZip64=self.allowZip64 - ) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - with self.mutex: - with self.zf.open(key) as f: # will raise KeyError - return f.read() - - def __setitem__(self, key, value): - if self.mode == "r": - raise ReadOnlyError() - value = ensure_contiguous_ndarray_like(value).view("u1") - with self.mutex: - # writestr(key, value) writes with default permissions from - # zipfile (600) that are too restrictive, build ZipInfo for - # the key to work around limitation - keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) - keyinfo.compress_type = self.compression - if keyinfo.filename[-1] == os.sep: - keyinfo.external_attr = 0o40775 << 16 # drwxrwxr-x - keyinfo.external_attr |= 0x10 # MS-DOS directory flag - else: - keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- - - self.zf.writestr(keyinfo, value) - - def __delitem__(self, key): - raise NotImplementedError - - def __eq__(self, other): - return ( - isinstance(other, ZipStore) - and self.path == other.path - and self.compression == other.compression - and self.allowZip64 == other.allowZip64 - ) - - def keylist(self): - with self.mutex: - return sorted(self.zf.namelist()) - - def keys(self): - yield from self.keylist() - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - try: - with self.mutex: - self.zf.getinfo(key) - except KeyError: - return False - else: - return True - - def listdir(self, path=None): - path = normalize_storage_path(path) - return _listdir_from_keys(self, path) - - def getsize(self, path=None): - path = normalize_storage_path(path) - with self.mutex: - children = self.listdir(path) - if children: - size = 0 - for child in children: - if path: - name = path + "/" + child - else: - name = child - try: - info = self.zf.getinfo(name) - except KeyError: - pass - else: - size += info.compress_size - return size - elif path: - try: - info = self.zf.getinfo(path) - return info.compress_size - except KeyError: - return 0 - else: - return 0 - - def clear(self): - if self.mode == "r": - raise ReadOnlyError() - with self.mutex: - self.close() - os.remove(self.path) - self.zf = zipfile.ZipFile( - self.path, mode=self.mode, compression=self.compression, allowZip64=self.allowZip64 - ) - - def migrate_1to2(store): """Migrate array metadata in `store` from Zarr format version 1 to 
version 2. @@ -1813,386 +1540,6 @@ def migrate_1to2(store): del store["attrs"] -# noinspection PyShadowingBuiltins -class DBMStore(Store): - """Storage class using a DBM-style database. - - Parameters - ---------- - path : string - Location of database file. - flag : string, optional - Flags for opening the database file. - mode : int - File mode used if a new file is created. - open : function, optional - Function to open the database file. If not provided, :func:`dbm.open` will be - used on Python 3, and :func:`anydbm.open` will be used on Python 2. - write_lock: bool, optional - Use a lock to prevent concurrent writes from multiple threads (True by default). - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk.e - **open_kwargs - Keyword arguments to pass the `open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.DBMStore('data/array.db') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.DBMStore('data/group.db') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.DBMStore('data/array.db') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - A different database library can be used by passing a different function to - the `open` parameter. For example, if the `bsddb3 - `_ package is installed, a - Berkeley DB database can be used:: - - >>> import bsddb3 - >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() - - Notes - ----- - Please note that, by default, this class will use the Python standard - library `dbm.open` function to open the database file (or `anydbm.open` on - Python 2). There are up to three different implementations of DBM-style - databases available in any Python installation, and which one is used may - vary from one system to another. Database file formats are not compatible - between these different implementations. Also, some implementations are - more efficient than others. In particular, the "dumb" implementation will be - the fall-back on many systems, and has very poor performance for some usage - scenarios. If you want to ensure a specific implementation is used, pass the - corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM - library. - - Safe to write in multiple threads. May be safe to write in multiple processes, - depending on which DBM implementation is being used, although this has not been - tested. 
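    As the notes above suggest, a specific DBM implementation can be pinned by
    passing its ``open`` function explicitly, for example the standard
    library's portable (but slow) "dumb" backend; the path below is only a
    placeholder::

        >>> import zarr
        >>> import dbm.dumb
        >>> store = zarr.DBMStore('data/array.dumbdbm', open=dbm.dumb.open)
        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
        >>> z[...] = 42
        >>> store.close()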
- - """ - - def __init__( - self, - path, - flag="c", - mode=0o666, - open=None, - write_lock=True, - dimension_separator=None, - **open_kwargs, - ): - if open is None: - import dbm - - open = dbm.open - path = os.path.abspath(path) - # noinspection PyArgumentList - self.db = open(path, flag, mode, **open_kwargs) - self.path = path - self.flag = flag - self.mode = mode - self.open = open - self.write_lock = write_lock - if write_lock: - # This may not be required as some dbm implementations manage their own - # locks, but err on the side of caution. - self.write_mutex = Lock() - else: - self.write_mutex = nolock - self.open_kwargs = open_kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # needed for ndbm - except Exception: - # flush may fail if db has already been closed - pass - return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) - - def __setstate__(self, state): - path, flag, mode, open, write_lock, open_kws = state - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) - - def close(self): - """Closes the underlying database file.""" - if hasattr(self.db, "close"): - with self.write_mutex: - self.db.close() - - def flush(self): - """Synchronizes data to the underlying database file.""" - if self.flag[0] != "r": - with self.write_mutex: - if hasattr(self.db, "sync"): - self.db.sync() - else: # pragma: no cover - # we don't cover this branch anymore as ndbm (oracle) is not packaged - # by conda-forge on non-mac OS: - # https://github.com/conda-forge/staged-recipes/issues/4476 - # fall-back, close and re-open, needed for ndbm - flag = self.flag - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.db.close() - # noinspection PyArgumentList - self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return self.db[key] - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - value = ensure_bytes(value) - with self.write_mutex: - self.db[key] = value - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.write_mutex: - del self.db[key] - - def __eq__(self, other): - return ( - isinstance(other, DBMStore) - and self.path == other.path - and - # allow flag and mode to differ - self.open == other.open - and self.open_kwargs == other.open_kwargs - ) - - def keys(self): - return (ensure_text(k, "ascii") for k in iter(self.db.keys())) - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return key in self.db - - def rmdir(self, path: str = "") -> None: - path = normalize_storage_path(path) - _rmdir_from_keys(self, path) - - -class LMDBStore(Store): - """Storage class using LMDB. Requires the `lmdb `_ - package to be installed. - - - Parameters - ---------- - path : string - Location of database file. - buffers : bool, optional - If True (default) use support for buffers, which should increase performance by - reducing memory copies. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. 
- **kwargs - Keyword arguments passed through to the `lmdb.open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.LMDBStore('data/array.mdb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.LMDBStore('data/group.mdb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.LMDBStore('data/array.mdb') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - Notes - ----- - By default writes are not immediately flushed to disk to increase performance. You - can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods. - - Should be safe to write in multiple threads or processes due to the synchronization - support within LMDB, although writing from multiple processes has not been tested. - - """ - - def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): - import lmdb - - # set default memory map size to something larger than the lmdb default, which is - # very likely to be too small for any moderate array (logic copied from zict) - map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 - kwargs.setdefault("map_size", map_size) - - # don't initialize buffers to zero by default, shouldn't be necessary - kwargs.setdefault("meminit", False) - - # decide whether to use the writemap option based on the operating system's - # support for sparse files - writemap requires sparse file support otherwise - # the whole# `map_size` may be reserved up front on disk (logic copied from zict) - writemap = sys.platform.startswith("linux") - kwargs.setdefault("writemap", writemap) - - # decide options for when data are flushed to disk - choose to delay syncing - # data to filesystem, otherwise pay a large performance penalty (zict also does - # this) - kwargs.setdefault("metasync", False) - kwargs.setdefault("sync", False) - kwargs.setdefault("map_async", False) - - # set default option for number of cached transactions - max_spare_txns = multiprocessing.cpu_count() - kwargs.setdefault("max_spare_txns", max_spare_txns) - - # normalize path - path = os.path.abspath(path) - - # open database - self.db = lmdb.open(path, **kwargs) - - # store properties - self.buffers = buffers - self.path = path - self.kwargs = kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # just in case - except Exception: - # flush may fail if db has already been closed - pass - return self.path, self.buffers, self.kwargs - - def __setstate__(self, state): - path, buffers, kwargs = state - self.__init__(path=path, buffers=buffers, **kwargs) - - def close(self): - """Closes the underlying database.""" - self.db.close() - - def flush(self): - """Synchronizes data to the file system.""" - self.db.sync() - - def __enter__(self): - return self - - def __exit__(self, *args): - 
self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - # use the buffers option, should avoid a memory copy - with self.db.begin(buffers=self.buffers) as txn: - value = txn.get(key) - if value is None: - raise KeyError(key) - return value - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True, buffers=self.buffers) as txn: - txn.put(key, value) - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True) as txn: - if not txn.delete(key): - raise KeyError(key) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - return cursor.set_key(key) - - def items(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k, v in cursor.iternext(keys=True, values=True): - yield ensure_text(k, "ascii"), v - - def keys(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k in cursor.iternext(keys=True, values=False): - yield ensure_text(k, "ascii") - - def values(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - yield from cursor.iternext(keys=False, values=True) - - def __iter__(self): - return self.keys() - - def __len__(self): - return self.db.stat()["entries"] - - class LRUStoreCache(Store): """Storage class that implements a least-recently-used (LRU) cache layer over some other store. Intended primarily for use with stores that can be slow to @@ -2392,358 +1739,6 @@ def __delitem__(self, key): self._invalidate_value(key) -class SQLiteStore(Store): - """Storage class using SQLite. - - Parameters - ---------- - path : string - Location of database file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `sqlite3.connect` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.SQLiteStore('data/array.sqldb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.SQLiteStore('data/group.sqldb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] 
= 42 - >>> store.close() # don't forget to call this when you're done - """ - - def __init__(self, path, dimension_separator=None, **kwargs): - import sqlite3 - - self._dimension_separator = dimension_separator - - # normalize path - if path != ":memory:": - path = os.path.abspath(path) - - # store properties - self.path = path - self.kwargs = kwargs - - # allow threading if SQLite connections are thread-safe - # - # ref: https://www.sqlite.org/releaselog/3_3_1.html - # ref: https://github.com/python/cpython/issues/71377 - check_same_thread = True - if sqlite3.sqlite_version_info >= (3, 3, 1): - check_same_thread = False - - # keep a lock for serializing mutable operations - self.lock = Lock() - - # open database - self.db = sqlite3.connect( - self.path, - detect_types=0, - isolation_level=None, - check_same_thread=check_same_thread, - **self.kwargs, - ) - - # handle keys as `str`s - self.db.text_factory = str - - # get a cursor to read/write to the database - self.cursor = self.db.cursor() - - # initialize database with our table if missing - with self.lock: - self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") - - def __getstate__(self): - if self.path == ":memory:": - raise PicklingError("Cannot pickle in-memory SQLite databases") - return self.path, self.kwargs - - def __setstate__(self, state): - path, kwargs = state - self.__init__(path=path, **kwargs) - - def close(self): - """Closes the underlying database.""" - - # close cursor and db objects - self.cursor.close() - self.db.close() - - def __getitem__(self, key): - value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) - for (v,) in value: - return v - raise KeyError(key) - - def __setitem__(self, key, value): - self.update({key: value}) - - def __delitem__(self, key): - with self.lock: - self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) - if self.cursor.rowcount < 1: - raise KeyError(key) - - def __contains__(self, key): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) - for (has,) in cs: - has = bool(has) - return has - - def items(self): - kvs = self.cursor.execute("SELECT k, v FROM zarr") - yield from kvs - - def keys(self): - ks = self.cursor.execute("SELECT k FROM zarr") - for (k,) in ks: - yield k - - def values(self): - vs = self.cursor.execute("SELECT v FROM zarr") - for (v,) in vs: - yield v - - def __iter__(self): - return self.keys() - - def __len__(self): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") - for (c,) in cs: - return c - - def update(self, *args, **kwargs): - args += (kwargs,) - - kv_list = [] - for dct in args: - for k, v in dct.items(): - v = ensure_contiguous_ndarray_like(v) - - # Accumulate key-value pairs for storage - kv_list.append((k, v)) - - with self.lock: - self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) - - def listdir(self, path=None): - path = normalize_storage_path(path) - sep = "_" if path == "" else "/" - keys = self.cursor.execute( - """ - SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( - SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m - FROM zarr WHERE k LIKE (? || "{sep}%") - ) ORDER BY l ASC - """.format(sep=sep), - (path, path), - ) - keys = list(map(operator.itemgetter(0), keys)) - return keys - - def getsize(self, path=None): - path = normalize_storage_path(path) - size = self.cursor.execute( - """ - SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) 
+ 1), "/"), "/") - """, - (path, path), - ) - for (s,) in size: - return s - - def rmdir(self, path=None): - path = normalize_storage_path(path) - if path: - with self.lock: - self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (path,)) - else: - self.clear() - - def clear(self): - with self.lock: - self.cursor.executescript( - """ - BEGIN TRANSACTION; - DROP TABLE zarr; - CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); - COMMIT TRANSACTION; - """ - ) - - -class MongoDBStore(Store): - """Storage class using MongoDB. - - .. note:: This is an experimental feature. - - Requires the `pymongo `_ - package to be installed. - - Parameters - ---------- - database : string - Name of database - collection : string - Name of collection - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `pymongo.MongoClient` function. - - Notes - ----- - The maximum chunksize in MongoDB documents is 16 MB. - - """ - - _key = "key" - _value = "value" - - def __init__( - self, - database="mongodb_zarr", - collection="zarr_collection", - dimension_separator=None, - **kwargs, - ): - import pymongo - - self._database = database - self._collection = collection - self._dimension_separator = dimension_separator - self._kwargs = kwargs - - self.client = pymongo.MongoClient(**self._kwargs) - self.db = self.client.get_database(self._database) - self.collection = self.db.get_collection(self._collection) - - def __getitem__(self, key): - doc = self.collection.find_one({self._key: key}) - - if doc is None: - raise KeyError(key) - else: - return doc[self._value] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.collection.replace_one( - {self._key: key}, {self._key: key, self._value: value}, upsert=True - ) - - def __delitem__(self, key): - result = self.collection.delete_many({self._key: key}) - if not result.deleted_count == 1: - raise KeyError(key) - - def __iter__(self): - for f in self.collection.find({}): - yield f[self._key] - - def __len__(self): - return self.collection.count_documents({}) - - def __getstate__(self): - return self._database, self._collection, self._kwargs - - def __setstate__(self, state): - database, collection, kwargs = state - self.__init__(database=database, collection=collection, **kwargs) - - def close(self): - """Cleanup client resources and disconnect from MongoDB.""" - self.client.close() - - def clear(self): - """Remove all items from store.""" - self.collection.delete_many({}) - - -class RedisStore(Store): - """Storage class using Redis. - - .. note:: This is an experimental feature. - - Requires the `redis `_ - package to be installed. - - Parameters - ---------- - prefix : string - Name of prefix for Redis keys - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `redis.Redis` function. 
- - """ - - def __init__(self, prefix="zarr", dimension_separator=None, **kwargs): - import redis - - self._prefix = prefix - self._kwargs = kwargs - self._dimension_separator = dimension_separator - - self.client = redis.Redis(**kwargs) - - def _key(self, key): - return "{prefix}:{key}".format(prefix=self._prefix, key=key) - - def __getitem__(self, key): - return self.client[self._key(key)] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.client[self._key(key)] = value - - def __delitem__(self, key): - count = self.client.delete(self._key(key)) - if not count: - raise KeyError(key) - - def keylist(self): - offset = len(self._key("")) # length of prefix - return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] - - def keys(self): - yield from self.keylist() - - def __iter__(self): - yield from self.keys() - - def __len__(self): - return len(self.keylist()) - - def __getstate__(self): - return self._prefix, self._kwargs - - def __setstate__(self, state): - prefix, kwargs = state - self.__init__(prefix=prefix, **kwargs) - - def clear(self): - for key in self.keys(): - del self[key] - - class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_core.py b/tests/test_core.py index 6303371793..d996af5563 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,4 @@ import atexit -import os -import sys import pickle import shutil from typing import Any, Literal, Optional, Tuple, Union @@ -37,26 +35,19 @@ from zarr.core import Array from zarr.meta import json_loads -from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( - ABSStore, - DBMStore, DirectoryStore, FSStore, KVStore, - LMDBStore, LRUStoreCache, NestedDirectoryStore, - SQLiteStore, - atexit_rmglob, - atexit_rmtree, init_array, init_group, normalize_store_arg, ) from zarr.util import buffer_size -from .util import abs_container, skip_test_env_var, have_fsspec, mktemp +from .util import have_fsspec # noinspection PyMethodMayBeStatic @@ -1655,24 +1646,6 @@ def test_array_init_from_dict(): assert isinstance(a.store, KVStore) -@skip_test_env_var("ZARR_TEST_ABS") -class TestArrayWithABSStore(TestArray): - def create_store(self): - client = abs_container() - store = ABSStore(client=client) - store.rmdir() - return store - - @pytest.mark.xfail - def test_nbytes_stored(self): - return super().test_nbytes_stored() - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): def create_store(self): path = mkdtemp() @@ -1690,366 +1663,6 @@ def expected(self): ] -class TestArrayWithN5Store(TestArrayWithDirectoryStore): - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = N5Store(path) - return store - - def test_array_0d(self): - # test behaviour for array with 0 dimensions - - # setup - a = np.zeros(()) - z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) - - # check properties - assert a.ndim == z.ndim - assert a.shape == z.shape - assert a.size == z.size - assert a.dtype == z.dtype - assert a.nbytes == z.nbytes - with pytest.raises(TypeError): - len(z) - assert () == z.chunks - assert 1 == z.nchunks - assert (1,) == z.cdata_shape - # compressor always None - no point in compressing a 
single value - assert z.compressor.compressor_config is None - - # check __getitem__ - b = z[...] - assert isinstance(b, np.ndarray) - assert a.shape == b.shape - assert a.dtype == b.dtype - assert_array_equal(a, np.array(z)) - assert_array_equal(a, z[...]) - assert a[()] == z[()] - with pytest.raises(IndexError): - z[0] - with pytest.raises(IndexError): - z[:] - - # check __setitem__ - z[...] = 42 - assert 42 == z[()] - z[()] = 43 - assert 43 == z[()] - with pytest.raises(IndexError): - z[0] = 42 - with pytest.raises(IndexError): - z[:] = 42 - with pytest.raises(ValueError): - z[...] = np.array([1, 2, 3]) - - def test_array_1d_fill_value(self): - nvalues = 1050 - dtype = np.int32 - for fill_value in 0, None: - a = np.arange(nvalues, dtype=dtype) - f = np.empty_like(a) - f.fill(fill_value or 0) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, fill_value=fill_value) - z[190:310] = a[190:310] - - assert_array_equal(f[:190], z[:190]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(f[310:], z[310:]) - - with pytest.raises(ValueError): - z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) - - def test_nchunks_initialized(self): - fill_value = 0 - dtype = "int" - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=True - ) - - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store["foo"] = b"bar" - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - - # second round of similar tests with write_empty_chunks set to - # False - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=False - ) - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - z[:] = z.fill_value - assert 0 == z.nchunks_initialized - - def test_array_order(self): - # N5 only supports 'C' at the moment - with pytest.raises(ValueError): - self.create_array(shape=(10, 11), chunks=(10, 11), dtype="i8", order="F") - - # 1D - a = np.arange(1050) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, order="C") - assert z.order == "C" - assert z[:].flags.c_contiguous - z[:] = a - assert_array_equal(a, z[:]) - - # 2D - a = np.arange(10000).reshape((100, 100)) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype, order="C") - - assert z.order == "C" - assert z[:].flags.c_contiguous - z[:] = a - actual = z[:] - assert_array_equal(a, actual) - - def test_structured_array(self): - d = np.array( - [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], - dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], - ) - fill_values = None, b"", (b"zzz", 42, 16.8) - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_subshapes(self): - d = np.array( - [ - (0, ((0, 1, 2), (1, 2, 3)), b"aaa"), - (1, ((1, 2, 3), (2, 3, 4)), b"bbb"), - (2, ((2, 3, 4), (3, 4, 5)), b"ccc"), - ], - dtype=[("foo", "i8"), ("bar", "(2, 3)f4"), ("baz", "S3")], - ) - fill_values = None, b"", (0, ((0, 0, 0), (1, 1, 1)), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_nested(self): - d = np.array( - [ - (0, (0, ((0, 1), (1, 2), (2, 3)), 0), b"aaa"), - (1, (1, ((1, 
2), (2, 3), (3, 4)), 1), b"bbb"), - (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b"ccc"), - ], - dtype=[ - ("foo", "i8"), - ("bar", [("foo", "i4"), ("bar", "(3, 2)f4"), ("baz", "u1")]), - ("baz", "S3"), - ], - ) - fill_values = None, b"", (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_dtypes(self): - # integers - for dtype in "u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.arange(z.shape[0], dtype=dtype) - z[:] = a - assert_array_equal(a, z[:]) - - # floats - for dtype in "f2", "f4", "f8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.linspace(0, 1, z.shape[0], dtype=dtype) - z[:] = a - assert_array_almost_equal(a, z[:]) - - # check that datetime generic units are not allowed - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="M8") - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="m8") - - def test_object_arrays(self): - # an object_codec is required for object arrays - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object) - - # an object_codec is required for object arrays, but allow to be provided via - # filters to maintain API backwards compatibility - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - self.create_array(shape=10, chunks=3, dtype=object, filters=[MsgPack()]) - - # create an object array using an object codec - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack()) - - def test_object_arrays_vlen_text(self): - data = np.array(greetings * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=str) - - def test_object_arrays_vlen_bytes(self): - greetings_bytes = [g.encode("utf8") for g in greetings] - data = np.array(greetings_bytes * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenBytes()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=bytes) - - def test_object_arrays_vlen_array(self): - data = np.array( - [np.array([1, 3, 7]), np.array([5]), np.array([2, 8, 12])] * 1000, dtype=object - ) - - codecs = VLenArray(int), VLenArray("> 16) -# assert perm == "0o644" -# info = z.getinfo(baz_key) -# perm = oct(info.external_attr >> 16) -# # only for posix platforms -# if os.name == "posix": -# if self.version == 2: -# assert perm == "0o40775" -# else: -# # baz/ on v2, but baz on v3, so not a directory -# assert perm == "0o644" -# z.close() - -# def test_store_and_retrieve_ndarray(self): -# store = ZipStore("data/store.zip") -# x = np.array([[1, 2], [3, 4]]) -# store["foo"] = x -# y = np.frombuffer(store["foo"], dtype=x.dtype).reshape(x.shape) -# assert np.array_equiv(y, x) - - -# class TestDBMStore(StoreTests): -# def create_store(self, dimension_separator=None): -# path = mktemp(suffix=".anydbm") -# atexit.register(atexit_rmglob, path + "*") -# # create store using default dbm implementation -# store = DBMStore(path, flag="n", dimension_separator=dimension_separator) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" 
-# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - -# class TestDBMStoreDumb(TestDBMStore): -# def create_store(self, **kwargs): -# path = mktemp(suffix=".dumbdbm") -# atexit.register(atexit_rmglob, path + "*") - -# import dbm.dumb as dumbdbm - -# store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) -# return store - - -# class TestDBMStoreGnu(TestDBMStore): -# def create_store(self, **kwargs): -# gdbm = pytest.importorskip("dbm.gnu") -# path = mktemp(suffix=".gdbm") # pragma: no cover -# atexit.register(os.remove, path) # pragma: no cover -# store = DBMStore( -# path, flag="n", open=gdbm.open, write_lock=False, **kwargs -# ) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreNDBM(TestDBMStore): -# def create_store(self, **kwargs): -# ndbm = pytest.importorskip("dbm.ndbm") -# path = mktemp(suffix=".ndbm") # pragma: no cover -# atexit.register(atexit_rmglob, path + "*") # pragma: no cover -# store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreBerkeleyDB(TestDBMStore): -# def create_store(self, **kwargs): -# bsddb3 = pytest.importorskip("bsddb3") -# path = mktemp(suffix=".dbm") -# atexit.register(os.remove, path) -# store = DBMStore(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) -# return store - - -# class TestLMDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("lmdb") -# path = mktemp(suffix=".lmdb") -# atexit.register(atexit_rmtree, path) -# buffers = True -# store = LMDBStore(path, buffers=buffers, **kwargs) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - -# class TestSQLiteStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path, **kwargs) -# return store - -# def test_underscore_in_name(self): -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path) -# store["a"] = b"aaa" -# store["a_b"] = b"aa_bb" -# store.rmdir("a") -# assert "a_b" in store - - -# class TestSQLiteStoreInMemory(TestSQLiteStore): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# store = SQLiteStore(":memory:", **kwargs) -# return store - -# def test_pickle(self): - -# # setup store -# store = self.create_store() -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"quux" - -# # round-trip through pickle -# with pytest.raises(PicklingError): -# pickle.dumps(store) - - -# @skip_test_env_var("ZARR_TEST_MONGO") -# class TestMongoDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("pymongo") -# store = MongoDBStore( -# host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs -# ) -# # start with an empty store -# store.clear() -# return store - - -# @skip_test_env_var("ZARR_TEST_REDIS") -# class TestRedisStore(StoreTests): -# def create_store(self, **kwargs): -# # TODO: this is the default host for Redis on Travis, -# # we probably want to generalize this though -# pytest.importorskip("redis") -# store = RedisStore(host="localhost", port=6379, **kwargs) -# # start with an empty store -# store.clear() -# return store - - # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict From 84a882cabe5d86d88ed64dec6d41e1eacb2c2942 Mon 
Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 13:08:49 +0200 Subject: [PATCH 12/22] Revert "chore: prune out n5, abs, sqlite, zip, redis, mongodb, dbm, lmdb stores" This reverts commit f3902c49b190f911838340aaaff6dc63db2875be. --- src/zarr/__init__.py | 8 + src/zarr/_storage/absstore.py | 224 ++++++++ src/zarr/n5.py | 896 +++++++++++++++++++++++++++++ src/zarr/storage.py | 1009 ++++++++++++++++++++++++++++++++- tests/test_core.py | 389 ++++++++++++- tests/test_creation.py | 35 ++ tests/test_hierarchy.py | 88 ++- tests/test_meta_array.py | 6 +- tests/test_n5.py | 53 ++ tests/test_storage.py | 492 ++++++++++++++++ 10 files changed, 3193 insertions(+), 7 deletions(-) create mode 100644 src/zarr/_storage/absstore.py create mode 100644 src/zarr/n5.py create mode 100644 tests/test_n5.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 725ad0a783..601b1295ab 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -30,14 +30,22 @@ ) from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group +from zarr.n5 import N5Store, N5FSStore from zarr.storage import ( + ABSStore, + DBMStore, DictStore, DirectoryStore, KVStore, + LMDBStore, LRUStoreCache, MemoryStore, + MongoDBStore, NestedDirectoryStore, + RedisStore, + SQLiteStore, TempStore, + ZipStore, ) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer from zarr._version import version as __version__ diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py new file mode 100644 index 0000000000..d8e292535c --- /dev/null +++ b/src/zarr/_storage/absstore.py @@ -0,0 +1,224 @@ +"""This module contains storage classes related to Azure Blob Storage (ABS)""" + +import warnings +from numcodecs.compat import ensure_bytes +from zarr.util import normalize_storage_path +from zarr._storage.store import Store + +__doctest_requires__ = { + ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], +} + + +class ABSStore(Store): + """Storage class using Azure Blob Storage (ABS). + + Parameters + ---------- + container : string + The name of the ABS container to use. + + .. deprecated:: + Use ``client`` instead. + + prefix : string + Location of the "directory" to use as the root of the storage hierarchy + within the container. + + account_name : string + The Azure blob storage account name. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + account_key : string + The Azure blob storage account access key. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + blob_service_kwargs : dictionary + Extra arguments to be passed into the azure blob client, for e.g. when + using the emulator, pass in blob_service_kwargs={'is_emulated': True}. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + client : azure.storage.blob.ContainerClient, optional + And ``azure.storage.blob.ContainerClient`` to connect with. See + `here `_ # noqa + for more. + + .. versionadded:: 2.8.3 + + Notes + ----- + In order to use this store, you must install the Microsoft Azure Storage SDK for Python, + ``azure-storage-blob>=12.5.0``. 
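# Illustrative sketch (not part of this changeset): the non-deprecated way to build an
# ABSStore is to pass an azure.storage.blob.ContainerClient via the ``client``
# argument. The connection string and container name below are placeholders.
from azure.storage.blob import ContainerClient
import zarr

client = ContainerClient.from_connection_string(
    "<your-connection-string>", container_name="zarr-demo"
)
store = zarr.ABSStore(client=client, prefix="example.zarr")
root = zarr.group(store=store, overwrite=True)
root.zeros("a", shape=(10, 10), chunks=(5, 5))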
+ """ # noqa: E501 + + def __init__( + self, + container=None, + prefix="", + account_name=None, + account_key=None, + blob_service_kwargs=None, + dimension_separator=None, + client=None, + ): + self._dimension_separator = dimension_separator + self.prefix = normalize_storage_path(prefix) + if client is None: + # deprecated option, try to construct the client for them + msg = ( + "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs'" + "is deprecated. Provide and instance of 'azure.storage.blob.ContainerClient' " + "'client' instead." + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + from azure.storage.blob import ContainerClient + + blob_service_kwargs = blob_service_kwargs or {} + client = ContainerClient( + "https://{}.blob.core.windows.net/".format(account_name), + container, + credential=account_key, + **blob_service_kwargs, + ) + + self.client = client + self._container = container + self._account_name = account_name + self._account_key = account_key + + @staticmethod + def _warn_deprecated(property_): + msg = ( + "The {} property is deprecated and will be removed in a future " + "version. Get the property from 'ABSStore.client' instead." + ) + warnings.warn(msg.format(property_), FutureWarning, stacklevel=3) + + @property + def container(self): + self._warn_deprecated("container") + return self._container + + @property + def account_name(self): + self._warn_deprecated("account_name") + return self._account_name + + @property + def account_key(self): + self._warn_deprecated("account_key") + return self._account_key + + def _append_path_to_prefix(self, path): + if self.prefix == "": + return normalize_storage_path(path) + else: + return "/".join([self.prefix, normalize_storage_path(path)]) + + @staticmethod + def _strip_prefix_from_path(path, prefix): + # normalized things will not have any leading or trailing slashes + path_norm = normalize_storage_path(path) + prefix_norm = normalize_storage_path(prefix) + if prefix: + return path_norm[(len(prefix_norm) + 1) :] + else: + return path_norm + + def __getitem__(self, key): + from azure.core.exceptions import ResourceNotFoundError + + blob_name = self._append_path_to_prefix(key) + try: + return self.client.download_blob(blob_name).readall() + except ResourceNotFoundError: + raise KeyError("Blob %s not found" % blob_name) + + def __setitem__(self, key, value): + value = ensure_bytes(value) + blob_name = self._append_path_to_prefix(key) + self.client.upload_blob(blob_name, value, overwrite=True) + + def __delitem__(self, key): + from azure.core.exceptions import ResourceNotFoundError + + try: + self.client.delete_blob(self._append_path_to_prefix(key)) + except ResourceNotFoundError: + raise KeyError("Blob %s not found" % key) + + def __eq__(self, other): + return ( + isinstance(other, ABSStore) + and self.client == other.client + and self.prefix == other.prefix + ) + + def keys(self): + return list(self.__iter__()) + + def __iter__(self): + if self.prefix: + list_blobs_prefix = self.prefix + "/" + else: + list_blobs_prefix = None + for blob in self.client.list_blobs(list_blobs_prefix): + yield self._strip_prefix_from_path(blob.name, self.prefix) + + def __len__(self): + return len(self.keys()) + + def __contains__(self, key): + blob_name = self._append_path_to_prefix(key) + return self.client.get_blob_client(blob_name).exists() + + def listdir(self, path=None): + dir_path = normalize_storage_path(self._append_path_to_prefix(path)) + if dir_path: + dir_path += "/" + items = [ + 
self._strip_prefix_from_path(blob.name, dir_path) + for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") + ] + return items + + def rmdir(self, path=None): + dir_path = normalize_storage_path(self._append_path_to_prefix(path)) + if dir_path: + dir_path += "/" + for blob in self.client.list_blobs(name_starts_with=dir_path): + self.client.delete_blob(blob) + + def getsize(self, path=None): + store_path = normalize_storage_path(path) + fs_path = self._append_path_to_prefix(store_path) + if fs_path: + blob_client = self.client.get_blob_client(fs_path) + else: + blob_client = None + + if blob_client and blob_client.exists(): + return blob_client.get_blob_properties().size + else: + size = 0 + if fs_path == "": + fs_path = None + elif not fs_path.endswith("/"): + fs_path += "/" + for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): + blob_client = self.client.get_blob_client(blob) + if blob_client.exists(): + size += blob_client.get_blob_properties().size + return size + + def clear(self): + self.rmdir() diff --git a/src/zarr/n5.py b/src/zarr/n5.py new file mode 100644 index 0000000000..79bab20576 --- /dev/null +++ b/src/zarr/n5.py @@ -0,0 +1,896 @@ +"""This module contains a storage class and codec to support the N5 format. +""" +import os +import struct +import sys +from typing import Any, Dict, Optional, cast +import warnings + +import numpy as np +from numcodecs.abc import Codec +from numcodecs.compat import ndarray_copy +from numcodecs.registry import get_codec, register_codec + +from .meta import ZARR_FORMAT, json_dumps, json_loads +from .storage import FSStore +from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path +from .storage import array_meta_key as zarr_array_meta_key +from .storage import attrs_key as zarr_attrs_key +from .storage import group_meta_key as zarr_group_meta_key + +N5_FORMAT = "2.0.0" + +zarr_to_n5_keys = [ + ("chunks", "blockSize"), + ("dtype", "dataType"), + ("compressor", "compression"), + ("shape", "dimensions"), +] +n5_attrs_key = "attributes.json" +n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] + + +class N5Store(NestedDirectoryStore): + """Storage class using directories and files on a standard file system, + following the N5 format (https://github.com/saalfeldlab/n5). + + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.N5Store('data/array.n5') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Store a group:: + + >>> store = zarr.N5Store('data/group.n5') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + Notes + ----- + + This is an experimental feature. + + Safe to write in multiple threads or processes. 
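# Illustrative sketch (not part of this changeset): N5Store maps Zarr metadata keys to
# per-directory N5 "attributes.json" files, so the tree written below is plain N5 on
# disk. The path is a placeholder.
import os
import zarr

store = zarr.N5Store("data/example.n5")
z = zarr.zeros((4, 4), chunks=(2, 2), store=store, overwrite=True)
z[...] = 1

# a single attributes.json replaces .zarray/.zgroup/.zattrs at each level
assert os.path.exists("data/example.n5/attributes.json")
assert not os.path.exists("data/example.n5/.zarray")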
+ + """ + + def __getitem__(self, key: str) -> bytes: + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) + + return json_dumps(value) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + top_level = key == zarr_array_meta_key + value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) + return json_dumps(value) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + value = attrs_to_zarr(self._load_n5_attrs(key_new)) + + if len(value) == 0: + raise KeyError(key_new) + else: + return json_dumps(value) + + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + + else: + key_new = key + + return super().__getitem__(key_new) + + def __setitem__(self, key: str, value: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + top_level = key == zarr_array_meta_key + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + zarr_attrs = json_loads(value) + + for k in n5_keywords: + if k in zarr_attrs: + warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) + + # remove previous user attributes + for k in list(n5_attrs.keys()): + if k not in n5_keywords: + del n5_attrs[k] + + # add new user attributes + n5_attrs.update(**zarr_attrs) + + value = json_dumps(n5_attrs) + + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + + else: + key_new = key + + super().__setitem__(key_new, value) + + def __delitem__(self, key: str): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + else: + key_new = key + + super().__delitem__(key_new) + + def __contains__(self, key): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + if key_new not in self: + return False + # group if not a dataset (attributes do not contain 'dimensions') + return "dimensions" not in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + # array if attributes contain 'dimensions' + return "dimensions" in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + return self._contains_attrs(key_new) + + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + else: + key_new = key + + return super().__contains__(key_new) + + def __eq__(self, other): + return isinstance(other, N5Store) and self.path == other.path + + def listdir(self, path: Optional[str] = None): + if path is not None: + path = invert_chunk_coords(path) + path = cast(str, path) + # We can't use 
NestedDirectoryStore's listdir, as it requires + # array_meta_key to be present in array directories, which this store + # doesn't provide. + children = super().listdir(path=path) + + if self._is_array(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(n5_attrs_key) + children.append(zarr_array_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + # special handling of directories containing an array to map + # inverted nested chunk keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and os.path.isdir(entry_path): + for dir_path, _, file_names in os.walk(entry_path): + for file_name in file_names: + file_path = os.path.join(dir_path, file_name) + rel_path = file_path.split(root_path + os.path.sep)[1] + new_child = rel_path.replace(os.path.sep, ".") + new_children.append(invert_chunk_coords(new_child)) + else: + new_children.append(entry) + + return sorted(new_children) + + elif self._is_group(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(n5_attrs_key) + children.append(zarr_group_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + return sorted(children) + + else: + return children + + def _load_n5_attrs(self, path: str) -> Dict[str, Any]: + try: + s = super().__getitem__(path) + return json_loads(s) + except KeyError: + return {} + + def _is_group(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + attrs_key = os.path.join(path, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(attrs_key) + return len(n5_attrs) > 0 and "dimensions" not in n5_attrs + + def _is_array(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + attrs_key = os.path.join(path, n5_attrs_key) + + return "dimensions" in self._load_n5_attrs(attrs_key) + + def _contains_attrs(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + if not path.endswith(n5_attrs_key): + attrs_key = os.path.join(path, n5_attrs_key) + else: + attrs_key = path + + attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) + return len(attrs) > 0 + + +class N5FSStore(FSStore): + """Implementation of the N5 format (https://github.com/saalfeldlab/n5) + using `fsspec`, which allows storage on a variety of filesystems. Based + on `zarr.N5Store`. + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Store a group:: + + >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + Notes + ----- + This is an experimental feature. + Safe to write in multiple threads or processes. 
+ + Be advised that the `_dimension_separator` property of this store + (and arrays it creates) is ".", but chunks saved by this store will + in fact be "/" separated, as proscribed by the N5 format. + + This is counter-intuitive (to say the least), but not arbitrary. + Chunks in N5 format are stored with reversed dimension order + relative to Zarr chunks: a chunk of a 3D Zarr array would be stored + on a file system as `/0/1/2`, but in N5 the same chunk would be + stored as `/2/1/0`. Therefore, stores targeting N5 must intercept + chunk keys and flip the order of the dimensions before writing to + storage, and this procedure requires chunk keys with "." separated + dimensions, hence the Zarr arrays targeting N5 have the deceptive + "." dimension separator. + """ + + _array_meta_key = "attributes.json" + _group_meta_key = "attributes.json" + _attrs_key = "attributes.json" + + def __init__(self, *args, **kwargs): + if "dimension_separator" in kwargs: + warnings.warn("Keyword argument `dimension_separator` will be ignored") + kwargs["dimension_separator"] = "." + super().__init__(*args, **kwargs) + + @staticmethod + def _swap_separator(key: str): + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + if _prog_ckey.match(last_segment): + coords = list(last_segment.split(".")) + last_segment = "/".join(coords[::-1]) + segments = segments[:-1] + [last_segment] + key = "/".join(segments) + return key + + def _normalize_key(self, key: str): + if is_chunk_key(key): + key = invert_chunk_coords(key) + + key = normalize_storage_path(key).lstrip("/") + if key: + *bits, end = key.split("/") + + if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): + end = end.replace(".", "/") + key = "/".join(bits + [end]) + return key.lower() if self.normalize_keys else key + + def __getitem__(self, key: str) -> bytes: + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) + + return json_dumps(value) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + top_level = key == zarr_array_meta_key + value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) + return json_dumps(value) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + value = attrs_to_zarr(self._load_n5_attrs(key_new)) + + if len(value) == 0: + raise KeyError(key_new) + else: + return json_dumps(value) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + + return super().__getitem__(key_new) + + def __setitem__(self, key: str, value: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + top_level = key == zarr_array_meta_key + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + zarr_attrs = json_loads(value) + + for k in n5_keywords: + if k in zarr_attrs.keys(): + 
warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) + + # replace previous user attributes + for k in list(n5_attrs.keys()): + if k not in n5_keywords: + del n5_attrs[k] + + # add new user attributes + n5_attrs.update(**zarr_attrs) + + value = json_dumps(n5_attrs) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + + super().__setitem__(key_new, value) + + def __delitem__(self, key: str): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + elif is_chunk_key(key): + key_new = self._swap_separator(key) + else: + key_new = key + super().__delitem__(key_new) + + def __contains__(self, key: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + if key_new not in self: + return False + # group if not a dataset (attributes do not contain 'dimensions') + return "dimensions" not in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + # array if attributes contain 'dimensions' + return "dimensions" in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + return self._contains_attrs(key_new) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + return super().__contains__(key_new) + + def __eq__(self, other: Any): + return isinstance(other, N5FSStore) and self.path == other.path + + def listdir(self, path: Optional[str] = None): + if path is not None: + path = invert_chunk_coords(path) + + # We can't use NestedDirectoryStore's listdir, as it requires + # array_meta_key to be present in array directories, which this store + # doesn't provide. 
+ children = super().listdir(path=path) + if self._is_array(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(self._array_meta_key) + children.append(zarr_array_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + # special handling of directories containing an array to map + # inverted nested chunk keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and self.fs.isdir(entry_path): + for file_name in self.fs.find(entry_path): + file_path = os.path.join(root_path, file_name) + rel_path = file_path.split(root_path)[1] + new_child = rel_path.lstrip("/").replace("/", ".") + new_children.append(invert_chunk_coords(new_child)) + else: + new_children.append(entry) + return sorted(new_children) + + elif self._is_group(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(self._group_meta_key) + children.append(zarr_group_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + return sorted(children) + else: + return children + + def _load_n5_attrs(self, path: str): + try: + s = super().__getitem__(path) + return json_loads(s) + except KeyError: + return {} + + def _is_group(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, self._attrs_key) + + n5_attrs = self._load_n5_attrs(attrs_key) + return len(n5_attrs) > 0 and "dimensions" not in n5_attrs + + def _is_array(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, self._attrs_key) + + return "dimensions" in self._load_n5_attrs(attrs_key) + + def _contains_attrs(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + if not path.endswith(self._attrs_key): + attrs_key = os.path.join(path, self._attrs_key) + else: + attrs_key = path + + attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) + return len(attrs) > 0 + + +def is_chunk_key(key: str): + rv = False + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + rv = bool(_prog_ckey.match(last_segment)) + return rv + + +def invert_chunk_coords(key: str): + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + if _prog_ckey.match(last_segment): + coords = list(last_segment.split(".")) + last_segment = "/".join(coords[::-1]) + segments = segments[:-1] + [last_segment] + key = "/".join(segments) + return key + + +def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Convert group metadata from zarr to N5 format.""" + del group_metadata["zarr_format"] + # TODO: This should only exist at the top-level + group_metadata["n5"] = N5_FORMAT + return group_metadata + + +def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Convert group metadata from N5 to zarr format.""" + # This only exists at the top level + group_metadata.pop("n5", None) + group_metadata["zarr_format"] = ZARR_FORMAT + return group_metadata + + +def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: + """Convert array metadata from zarr to N5 format. 
If the `top_level` keyword argument is True, + then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" + + for f, t in zarr_to_n5_keys: + array_metadata[t] = array_metadata.pop(f) + del array_metadata["zarr_format"] + if top_level: + array_metadata["n5"] = N5_FORMAT + try: + dtype = np.dtype(array_metadata["dataType"]) + except TypeError: + raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") + + array_metadata["dataType"] = dtype.name + array_metadata["dimensions"] = array_metadata["dimensions"][::-1] + array_metadata["blockSize"] = array_metadata["blockSize"][::-1] + + if "fill_value" in array_metadata: + if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: + raise ValueError( + f"""Received fill_value = {array_metadata['fill_value']}, + but N5 only supports fill_value = 0""" + ) + del array_metadata["fill_value"] + + if "order" in array_metadata: + if array_metadata["order"] != "C": + raise ValueError( + f"Received order = {array_metadata['order']}, but N5 only supports order = C" + ) + del array_metadata["order"] + + if "filters" in array_metadata: + if array_metadata["filters"] != [] and array_metadata["filters"] is not None: + raise ValueError("Received filters, but N5 storage does not support zarr filters") + del array_metadata["filters"] + + assert "compression" in array_metadata + compressor_config = array_metadata["compression"] + compressor_config = compressor_config_to_n5(compressor_config) + array_metadata["compression"] = compressor_config + + if "dimension_separator" in array_metadata: + del array_metadata["dimension_separator"] + + return array_metadata + + +def array_metadata_to_zarr( + array_metadata: Dict[str, Any], top_level: bool = False +) -> Dict[str, Any]: + """Convert array metadata from N5 to zarr format. + If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" + for t, f in zarr_to_n5_keys: + array_metadata[t] = array_metadata.pop(f) + if top_level: + array_metadata.pop("n5") + array_metadata["zarr_format"] = ZARR_FORMAT + + array_metadata["shape"] = array_metadata["shape"][::-1] + array_metadata["chunks"] = array_metadata["chunks"][::-1] + array_metadata["fill_value"] = 0 # also if None was requested + array_metadata["order"] = "C" + array_metadata["filters"] = [] + array_metadata["dimension_separator"] = "." 
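# Illustrative sketch (not part of this changeset): what the zarr -> N5 translation
# defined above produces for a minimal array description - the dimension order is
# reversed, fill_value/order/filters are dropped, and the compressor entry is mapped
# to an N5 "compression" dict ("raw" when there is no compressor).
from zarr.meta import ZARR_FORMAT
from zarr.n5 import array_metadata_to_n5

zarr_meta = {
    "zarr_format": ZARR_FORMAT,
    "shape": [100, 50],
    "chunks": [10, 5],
    "dtype": "<i4",
    "compressor": None,
    "fill_value": 0,
    "order": "C",
    "filters": [],
}
n5_meta = array_metadata_to_n5(zarr_meta, top_level=True)
assert n5_meta["dimensions"] == [50, 100]
assert n5_meta["blockSize"] == [5, 10]
assert n5_meta["dataType"] == "int32"
assert n5_meta["compression"] == {"type": "raw"}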
+ array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str + + compressor_config = array_metadata["compressor"] + compressor_config = compressor_config_to_zarr(compressor_config) + array_metadata["compressor"] = { + "id": N5ChunkWrapper.codec_id, + "compressor_config": compressor_config, + "dtype": array_metadata["dtype"], + "chunk_shape": array_metadata["chunks"], + } + + return array_metadata + + +def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: + """Get all zarr attributes from an N5 attributes dictionary (i.e., + all non-keyword attributes).""" + + # remove all N5 keywords + for n5_key in n5_keywords: + if n5_key in attrs: + del attrs[n5_key] + + return attrs + + +def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + if compressor_config is None: + return {"type": "raw"} + else: + _compressor_config = compressor_config + + # peel wrapper, if present + if _compressor_config["id"] == N5ChunkWrapper.codec_id: + _compressor_config = _compressor_config["compressor_config"] + + codec_id = _compressor_config["id"] + n5_config = {"type": codec_id} + + if codec_id == "bz2": + n5_config["type"] = "bzip2" + n5_config["blockSize"] = _compressor_config["level"] + + elif codec_id == "blosc": + n5_config["cname"] = _compressor_config["cname"] + n5_config["clevel"] = _compressor_config["clevel"] + n5_config["shuffle"] = _compressor_config["shuffle"] + n5_config["blocksize"] = _compressor_config["blocksize"] + + elif codec_id == "lzma": + # Switch to XZ for N5 if we are using the default XZ format. + # Note: 4 is the default, which is lzma.CHECK_CRC64. + if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: + n5_config["type"] = "xz" + else: + warnings.warn( + "Not all N5 implementations support lzma compression (yet). You " + "might not be able to open the dataset with another N5 library.", + RuntimeWarning, + ) + n5_config["format"] = _compressor_config["format"] + n5_config["check"] = _compressor_config["check"] + n5_config["filters"] = _compressor_config["filters"] + + # The default is lzma.PRESET_DEFAULT, which is 6. 
+ if _compressor_config["preset"]: + n5_config["preset"] = _compressor_config["preset"] + else: + n5_config["preset"] = 6 + + elif codec_id == "zlib": + n5_config["type"] = "gzip" + n5_config["level"] = _compressor_config["level"] + n5_config["useZlib"] = True + + elif codec_id == "gzip": + n5_config["type"] = "gzip" + n5_config["level"] = _compressor_config["level"] + n5_config["useZlib"] = False + + else: + n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) + + return n5_config + + +def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: + codec_id = compressor_config["type"] + zarr_config = {"id": codec_id} + + if codec_id == "bzip2": + zarr_config["id"] = "bz2" + zarr_config["level"] = compressor_config["blockSize"] + + elif codec_id == "blosc": + zarr_config["cname"] = compressor_config["cname"] + zarr_config["clevel"] = compressor_config["clevel"] + zarr_config["shuffle"] = compressor_config["shuffle"] + zarr_config["blocksize"] = compressor_config["blocksize"] + + elif codec_id == "lzma": + zarr_config["format"] = compressor_config["format"] + zarr_config["check"] = compressor_config["check"] + zarr_config["preset"] = compressor_config["preset"] + zarr_config["filters"] = compressor_config["filters"] + + elif codec_id == "xz": + zarr_config["id"] = "lzma" + zarr_config["format"] = 1 # lzma.FORMAT_XZ + zarr_config["check"] = -1 + zarr_config["preset"] = compressor_config["preset"] + zarr_config["filters"] = None + + elif codec_id == "gzip": + if "useZlib" in compressor_config and compressor_config["useZlib"]: + zarr_config["id"] = "zlib" + zarr_config["level"] = compressor_config["level"] + else: + zarr_config["id"] = "gzip" + zarr_config["level"] = compressor_config["level"] + + elif codec_id == "raw": + return None + + else: + zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) + + return zarr_config + + +class N5ChunkWrapper(Codec): + codec_id = "n5_wrapper" + + def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): + self.dtype = np.dtype(dtype) + self.chunk_shape = tuple(chunk_shape) + # is the dtype a little endian format? 
+ self._little_endian = self.dtype.byteorder == "<" or ( + self.dtype.byteorder == "=" and sys.byteorder == "little" + ) + + if compressor: + if compressor_config is not None: + raise ValueError("Only one of compressor_config or compressor should be given.") + compressor_config = compressor.get_config() + + if compressor_config is None and compressor is None or compressor_config["id"] == "raw": + self.compressor_config = None + self._compressor = None + else: + self._compressor = get_codec(compressor_config) + self.compressor_config = self._compressor.get_config() + + def get_config(self): + config = {"id": self.codec_id, "compressor_config": self.compressor_config} + return config + + def encode(self, chunk): + assert chunk.flags.c_contiguous + + header = self._create_header(chunk) + chunk = self._to_big_endian(chunk) + + if self._compressor: + return header + self._compressor.encode(chunk) + else: + return header + chunk.tobytes(order="A") + + def decode(self, chunk, out=None) -> bytes: + len_header, chunk_shape = self._read_header(chunk) + chunk = chunk[len_header:] + + if out is not None: + # out should only be used if we read a complete chunk + assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( + self.chunk_shape, chunk_shape + ) + + if self._compressor: + self._compressor.decode(chunk, out) + else: + ndarray_copy(chunk, out) + + # we can byteswap in-place + if self._little_endian: + out.byteswap(True) + + return out + + else: + if self._compressor: + chunk = self._compressor.decode(chunk) + + # more expensive byteswap + chunk = self._from_big_endian(chunk) + + # read partial chunk + if chunk_shape != self.chunk_shape: + chunk = np.frombuffer(chunk, dtype=self.dtype) + chunk = chunk.reshape(chunk_shape) + complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) + target_slices = tuple(slice(0, s) for s in chunk_shape) + complete_chunk[target_slices] = chunk + chunk = complete_chunk + + return chunk + + @staticmethod + def _create_header(chunk): + mode = struct.pack(">H", 0) + num_dims = struct.pack(">H", len(chunk.shape)) + shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) + + return mode + num_dims + shape + + @staticmethod + def _read_header(chunk): + num_dims = struct.unpack(">H", chunk[2:4])[0] + shape = tuple( + struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) + )[::-1] + + len_header = 4 + num_dims * 4 + + return len_header, shape + + def _to_big_endian(self, data): + # assumes data is ndarray + + if self._little_endian: + return data.byteswap() + return data + + def _from_big_endian(self, data): + # assumes data is byte array in big endian + + if not self._little_endian: + return data + + a = np.frombuffer(data, self.dtype.newbyteorder(">")) + return a.astype(self.dtype) + + +register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 7d4ae3a56c..a7bd22a6b9 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -17,19 +17,25 @@ import atexit import errno import glob +import multiprocessing +import operator import os import re import shutil +import sys import tempfile import warnings +import zipfile from collections import OrderedDict from collections.abc import MutableMapping from os import scandir -from threading import Lock +from pickle import PicklingError +from threading import Lock, RLock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any import uuid +import time -from numcodecs.compat import 
ensure_bytes, ensure_contiguous_ndarray_like +from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context @@ -45,6 +51,7 @@ from zarr.util import ( buffer_size, json_loads, + nolock, normalize_chunks, normalize_dimension_separator, normalize_dtype, @@ -56,6 +63,7 @@ ensure_contiguous_ndarray_or_bytes, ) +from zarr._storage.absstore import ABSStore # noqa: F401 from zarr._storage.store import ( # noqa: F401 _listdir_from_keys, _rename_from_keys, @@ -71,6 +79,13 @@ Store, ) +__doctest_requires__ = { + ("RedisStore", "RedisStore.*"): ["redis"], + ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], + ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], +} + + try: # noinspection PyUnresolvedReferences from zarr.codecs import Blosc @@ -127,6 +142,12 @@ def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore return FSStore(store, mode=mode, **(storage_options or {})) elif storage_options: raise ValueError("storage_options passed with non-fsspec path") + if store.endswith(".zip"): + return ZipStore(store, mode=mode) + elif store.endswith(".n5"): + from zarr.n5 import N5Store + + return N5Store(store) else: return DirectoryStore(store) else: @@ -1487,6 +1508,258 @@ def __eq__(self, other): return isinstance(other, NestedDirectoryStore) and self.path == other.path +# noinspection PyPep8Naming +class ZipStore(Store): + """Storage class using a Zip file. + + Parameters + ---------- + path : string + Location of file. + compression : integer, optional + Compression method to use when writing to the archive. + allowZip64 : bool, optional + If True (the default) will create ZIP files that use the ZIP64 + extensions when the zipfile is larger than 2 GiB. If False + will raise an exception when the ZIP file would require ZIP64 + extensions. + mode : string, optional + One of 'r' to read an existing file, 'w' to truncate and write a new + file, 'a' to append to an existing file, or 'x' to exclusively create + and write a new file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.ZipStore('data/array.zip', mode='w') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.ZipStore('data/group.zip', mode='w') + >>> root = zarr.group(store=store) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + >>> store.close() # don't forget to call this when you're done + + After modifying a ZipStore, the ``close()`` method must be called, otherwise + essential data will not be written to the underlying Zip file. The ZipStore + class also supports the context manager protocol, which ensures the ``close()`` + method is called on leaving the context, e.g.:: + + >>> with zarr.ZipStore('data/array.zip', mode='w') as store: + ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store) + ... z[...] = 42 + ... # no need to call store.close() + + Notes + ----- + Each chunk of an array is stored as a separate entry in the Zip file. Note + that Zip files do not provide any way to remove or replace existing entries. + If an attempt is made to replace an entry, then a warning is generated by + the Python standard library about a duplicate Zip file entry. 
This can be + triggered if you attempt to write data to a Zarr array more than once, + e.g.:: + + >>> store = zarr.ZipStore('data/example.zip', mode='w') + >>> z = zarr.zeros(100, chunks=10, store=store) + >>> # first write OK + ... z[...] = 42 + >>> # second write generates warnings + ... z[...] = 42 # doctest: +SKIP + >>> store.close() + + This can also happen in a more subtle situation, where data are written only + once to a Zarr array, but the write operations are not aligned with chunk + boundaries, e.g.:: + + >>> store = zarr.ZipStore('data/example.zip', mode='w') + >>> z = zarr.zeros(100, chunks=10, store=store) + >>> z[5:15] = 42 + >>> # write overlaps chunk previously written, generates warnings + ... z[15:25] = 42 # doctest: +SKIP + + To avoid creating duplicate entries, only write data once, and align writes + with chunk boundaries. This alignment is done automatically if you call + ``z[...] = ...`` or create an array from existing data via :func:`zarr.array`. + + Alternatively, use a :class:`DirectoryStore` when writing the data, then + manually Zip the directory and use the Zip file for subsequent reads. + Take note that the files in the Zip file must be relative to the root of the + Zarr archive. You may find it easier to create such a Zip file with ``7z``, e.g.:: + + 7z a -tzip archive.zarr.zip archive.zarr/. + + Safe to write in multiple threads but not in multiple processes. + + """ + + _erasable = False + + def __init__( + self, + path, + compression=zipfile.ZIP_STORED, + allowZip64=True, + mode="a", + dimension_separator=None, + ): + # store properties + path = os.path.abspath(path) + self.path = path + self.compression = compression + self.allowZip64 = allowZip64 + self.mode = mode + self._dimension_separator = dimension_separator + + # Current understanding is that zipfile module in stdlib is not thread-safe, + # and so locking is required for both read and write. However, this has not + # been investigated in detail, perhaps no lock is needed if mode='r'. 
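# Illustrative sketch (not part of this changeset): the workflow the Notes above
# recommend - write with a DirectoryStore, zip the directory contents relative to the
# store root, then reopen the archive read-only through ZipStore. Paths are
# placeholders.
import os
import zipfile
import zarr

z = zarr.open("data/example.zarr", mode="w", shape=(100,), chunks=(10,), dtype="i4")
z[:] = 42

with zipfile.ZipFile("data/example.zarr.zip", mode="w") as zf:
    for dirpath, _, files in os.walk("data/example.zarr"):
        for name in files:
            full = os.path.join(dirpath, name)
            zf.write(full, arcname=os.path.relpath(full, "data/example.zarr"))

store = zarr.ZipStore("data/example.zarr.zip", mode="r")
assert zarr.open(store, mode="r")[0] == 42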
+ self.mutex = RLock() + + # open zip file + self.zf = zipfile.ZipFile(path, mode=mode, compression=compression, allowZip64=allowZip64) + + def __getstate__(self): + self.flush() + return self.path, self.compression, self.allowZip64, self.mode + + def __setstate__(self, state): + path, compression, allowZip64, mode = state + # if initially opened with mode 'w' or 'x', re-open in mode 'a' so file doesn't + # get clobbered + if mode in "wx": + mode = "a" + self.__init__(path=path, compression=compression, allowZip64=allowZip64, mode=mode) + + def close(self): + """Closes the underlying zip file, ensuring all records are written.""" + with self.mutex: + self.zf.close() + + def flush(self): + """Closes the underlying zip file, ensuring all records are written, + then re-opens the file for further modifications.""" + if self.mode != "r": + with self.mutex: + self.zf.close() + # N.B., re-open with mode 'a' regardless of initial mode so we don't wipe + # what's been written + self.zf = zipfile.ZipFile( + self.path, mode="a", compression=self.compression, allowZip64=self.allowZip64 + ) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __getitem__(self, key): + with self.mutex: + with self.zf.open(key) as f: # will raise KeyError + return f.read() + + def __setitem__(self, key, value): + if self.mode == "r": + raise ReadOnlyError() + value = ensure_contiguous_ndarray_like(value).view("u1") + with self.mutex: + # writestr(key, value) writes with default permissions from + # zipfile (600) that are too restrictive, build ZipInfo for + # the key to work around limitation + keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) + keyinfo.compress_type = self.compression + if keyinfo.filename[-1] == os.sep: + keyinfo.external_attr = 0o40775 << 16 # drwxrwxr-x + keyinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- + + self.zf.writestr(keyinfo, value) + + def __delitem__(self, key): + raise NotImplementedError + + def __eq__(self, other): + return ( + isinstance(other, ZipStore) + and self.path == other.path + and self.compression == other.compression + and self.allowZip64 == other.allowZip64 + ) + + def keylist(self): + with self.mutex: + return sorted(self.zf.namelist()) + + def keys(self): + yield from self.keylist() + + def __iter__(self): + return self.keys() + + def __len__(self): + return sum(1 for _ in self.keys()) + + def __contains__(self, key): + try: + with self.mutex: + self.zf.getinfo(key) + except KeyError: + return False + else: + return True + + def listdir(self, path=None): + path = normalize_storage_path(path) + return _listdir_from_keys(self, path) + + def getsize(self, path=None): + path = normalize_storage_path(path) + with self.mutex: + children = self.listdir(path) + if children: + size = 0 + for child in children: + if path: + name = path + "/" + child + else: + name = child + try: + info = self.zf.getinfo(name) + except KeyError: + pass + else: + size += info.compress_size + return size + elif path: + try: + info = self.zf.getinfo(path) + return info.compress_size + except KeyError: + return 0 + else: + return 0 + + def clear(self): + if self.mode == "r": + raise ReadOnlyError() + with self.mutex: + self.close() + os.remove(self.path) + self.zf = zipfile.ZipFile( + self.path, mode=self.mode, compression=self.compression, allowZip64=self.allowZip64 + ) + + def migrate_1to2(store): """Migrate array metadata in `store` from Zarr format version 1 to 
version 2. @@ -1540,6 +1813,386 @@ def migrate_1to2(store): del store["attrs"] +# noinspection PyShadowingBuiltins +class DBMStore(Store): + """Storage class using a DBM-style database. + + Parameters + ---------- + path : string + Location of database file. + flag : string, optional + Flags for opening the database file. + mode : int + File mode used if a new file is created. + open : function, optional + Function to open the database file. If not provided, :func:`dbm.open` will be + used on Python 3, and :func:`anydbm.open` will be used on Python 2. + write_lock: bool, optional + Use a lock to prevent concurrent writes from multiple threads (True by default). + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk.e + **open_kwargs + Keyword arguments to pass the `open` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.DBMStore('data/array.db') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.DBMStore('data/group.db') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + >>> store.close() # don't forget to call this when you're done + + After modifying a DBMStore, the ``close()`` method must be called, otherwise + essential data may not be written to the underlying database file. The + DBMStore class also supports the context manager protocol, which ensures the + ``close()`` method is called on leaving the context, e.g.:: + + >>> with zarr.DBMStore('data/array.db') as store: + ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + ... z[...] = 42 + ... # no need to call store.close() + + A different database library can be used by passing a different function to + the `open` parameter. For example, if the `bsddb3 + `_ package is installed, a + Berkeley DB database can be used:: + + >>> import bsddb3 + >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen) + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() + + Notes + ----- + Please note that, by default, this class will use the Python standard + library `dbm.open` function to open the database file (or `anydbm.open` on + Python 2). There are up to three different implementations of DBM-style + databases available in any Python installation, and which one is used may + vary from one system to another. Database file formats are not compatible + between these different implementations. Also, some implementations are + more efficient than others. In particular, the "dumb" implementation will be + the fall-back on many systems, and has very poor performance for some usage + scenarios. If you want to ensure a specific implementation is used, pass the + corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM + library. + + Safe to write in multiple threads. May be safe to write in multiple processes, + depending on which DBM implementation is being used, although this has not been + tested. 
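# Illustrative sketch (not part of this changeset): pinning a specific DBM backend, as
# the Notes above suggest, by passing its ``open`` function explicitly - here the
# stdlib dbm.dumb module so the example runs anywhere. The path is a placeholder.
import dbm.dumb
import zarr

store = zarr.DBMStore("data/example.dumbdbm", flag="n", open=dbm.dumb.open)
root = zarr.group(store=store, overwrite=True)
a = root.zeros("a", shape=(10,), chunks=(5,), dtype="i4")
a[:] = 7
store.close()  # flush everything to the underlying database file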
+ + """ + + def __init__( + self, + path, + flag="c", + mode=0o666, + open=None, + write_lock=True, + dimension_separator=None, + **open_kwargs, + ): + if open is None: + import dbm + + open = dbm.open + path = os.path.abspath(path) + # noinspection PyArgumentList + self.db = open(path, flag, mode, **open_kwargs) + self.path = path + self.flag = flag + self.mode = mode + self.open = open + self.write_lock = write_lock + if write_lock: + # This may not be required as some dbm implementations manage their own + # locks, but err on the side of caution. + self.write_mutex = Lock() + else: + self.write_mutex = nolock + self.open_kwargs = open_kwargs + self._dimension_separator = dimension_separator + + def __getstate__(self): + try: + self.flush() # needed for ndbm + except Exception: + # flush may fail if db has already been closed + pass + return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) + + def __setstate__(self, state): + path, flag, mode, open, write_lock, open_kws = state + if flag[0] == "n": + flag = "c" + flag[1:] # don't clobber an existing database + self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) + + def close(self): + """Closes the underlying database file.""" + if hasattr(self.db, "close"): + with self.write_mutex: + self.db.close() + + def flush(self): + """Synchronizes data to the underlying database file.""" + if self.flag[0] != "r": + with self.write_mutex: + if hasattr(self.db, "sync"): + self.db.sync() + else: # pragma: no cover + # we don't cover this branch anymore as ndbm (oracle) is not packaged + # by conda-forge on non-mac OS: + # https://github.com/conda-forge/staged-recipes/issues/4476 + # fall-back, close and re-open, needed for ndbm + flag = self.flag + if flag[0] == "n": + flag = "c" + flag[1:] # don't clobber an existing database + self.db.close() + # noinspection PyArgumentList + self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __getitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + return self.db[key] + + def __setitem__(self, key, value): + if isinstance(key, str): + key = key.encode("ascii") + value = ensure_bytes(value) + with self.write_mutex: + self.db[key] = value + + def __delitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.write_mutex: + del self.db[key] + + def __eq__(self, other): + return ( + isinstance(other, DBMStore) + and self.path == other.path + and + # allow flag and mode to differ + self.open == other.open + and self.open_kwargs == other.open_kwargs + ) + + def keys(self): + return (ensure_text(k, "ascii") for k in iter(self.db.keys())) + + def __iter__(self): + return self.keys() + + def __len__(self): + return sum(1 for _ in self.keys()) + + def __contains__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + return key in self.db + + def rmdir(self, path: str = "") -> None: + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + + +class LMDBStore(Store): + """Storage class using LMDB. Requires the `lmdb `_ + package to be installed. + + + Parameters + ---------- + path : string + Location of database file. + buffers : bool, optional + If True (default) use support for buffers, which should increase performance by + reducing memory copies. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. 
+ **kwargs + Keyword arguments passed through to the `lmdb.open` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.LMDBStore('data/array.mdb') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.LMDBStore('data/group.mdb') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + >>> store.close() # don't forget to call this when you're done + + After modifying a DBMStore, the ``close()`` method must be called, otherwise + essential data may not be written to the underlying database file. The + DBMStore class also supports the context manager protocol, which ensures the + ``close()`` method is called on leaving the context, e.g.:: + + >>> with zarr.LMDBStore('data/array.mdb') as store: + ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + ... z[...] = 42 + ... # no need to call store.close() + + Notes + ----- + By default writes are not immediately flushed to disk to increase performance. You + can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods. + + Should be safe to write in multiple threads or processes due to the synchronization + support within LMDB, although writing from multiple processes has not been tested. + + """ + + def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): + import lmdb + + # set default memory map size to something larger than the lmdb default, which is + # very likely to be too small for any moderate array (logic copied from zict) + map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 + kwargs.setdefault("map_size", map_size) + + # don't initialize buffers to zero by default, shouldn't be necessary + kwargs.setdefault("meminit", False) + + # decide whether to use the writemap option based on the operating system's + # support for sparse files - writemap requires sparse file support otherwise + # the whole# `map_size` may be reserved up front on disk (logic copied from zict) + writemap = sys.platform.startswith("linux") + kwargs.setdefault("writemap", writemap) + + # decide options for when data are flushed to disk - choose to delay syncing + # data to filesystem, otherwise pay a large performance penalty (zict also does + # this) + kwargs.setdefault("metasync", False) + kwargs.setdefault("sync", False) + kwargs.setdefault("map_async", False) + + # set default option for number of cached transactions + max_spare_txns = multiprocessing.cpu_count() + kwargs.setdefault("max_spare_txns", max_spare_txns) + + # normalize path + path = os.path.abspath(path) + + # open database + self.db = lmdb.open(path, **kwargs) + + # store properties + self.buffers = buffers + self.path = path + self.kwargs = kwargs + self._dimension_separator = dimension_separator + + def __getstate__(self): + try: + self.flush() # just in case + except Exception: + # flush may fail if db has already been closed + pass + return self.path, self.buffers, self.kwargs + + def __setstate__(self, state): + path, buffers, kwargs = state + self.__init__(path=path, buffers=buffers, **kwargs) + + def close(self): + """Closes the underlying database.""" + self.db.close() + + def flush(self): + """Synchronizes data to the file system.""" + self.db.sync() + + def __enter__(self): + return self + + def __exit__(self, *args): + 
self.close() + + def __getitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + # use the buffers option, should avoid a memory copy + with self.db.begin(buffers=self.buffers) as txn: + value = txn.get(key) + if value is None: + raise KeyError(key) + return value + + def __setitem__(self, key, value): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(write=True, buffers=self.buffers) as txn: + txn.put(key, value) + + def __delitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(write=True) as txn: + if not txn.delete(key): + raise KeyError(key) + + def __contains__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + return cursor.set_key(key) + + def items(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + for k, v in cursor.iternext(keys=True, values=True): + yield ensure_text(k, "ascii"), v + + def keys(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + for k in cursor.iternext(keys=True, values=False): + yield ensure_text(k, "ascii") + + def values(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + yield from cursor.iternext(keys=False, values=True) + + def __iter__(self): + return self.keys() + + def __len__(self): + return self.db.stat()["entries"] + + class LRUStoreCache(Store): """Storage class that implements a least-recently-used (LRU) cache layer over some other store. Intended primarily for use with stores that can be slow to @@ -1739,6 +2392,358 @@ def __delitem__(self, key): self._invalidate_value(key) +class SQLiteStore(Store): + """Storage class using SQLite. + + Parameters + ---------- + path : string + Location of database file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `sqlite3.connect` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.SQLiteStore('data/array.sqldb') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.SQLiteStore('data/group.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + >>> store.close() # don't forget to call this when you're done + """ + + def __init__(self, path, dimension_separator=None, **kwargs): + import sqlite3 + + self._dimension_separator = dimension_separator + + # normalize path + if path != ":memory:": + path = os.path.abspath(path) + + # store properties + self.path = path + self.kwargs = kwargs + + # allow threading if SQLite connections are thread-safe + # + # ref: https://www.sqlite.org/releaselog/3_3_1.html + # ref: https://github.com/python/cpython/issues/71377 + check_same_thread = True + if sqlite3.sqlite_version_info >= (3, 3, 1): + check_same_thread = False + + # keep a lock for serializing mutable operations + self.lock = Lock() + + # open database + self.db = sqlite3.connect( + self.path, + detect_types=0, + isolation_level=None, + check_same_thread=check_same_thread, + **self.kwargs, + ) + + # handle keys as `str`s + self.db.text_factory = str + + # get a cursor to read/write to the database + self.cursor = self.db.cursor() + + # initialize database with our table if missing + with self.lock: + self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") + + def __getstate__(self): + if self.path == ":memory:": + raise PicklingError("Cannot pickle in-memory SQLite databases") + return self.path, self.kwargs + + def __setstate__(self, state): + path, kwargs = state + self.__init__(path=path, **kwargs) + + def close(self): + """Closes the underlying database.""" + + # close cursor and db objects + self.cursor.close() + self.db.close() + + def __getitem__(self, key): + value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) + for (v,) in value: + return v + raise KeyError(key) + + def __setitem__(self, key, value): + self.update({key: value}) + + def __delitem__(self, key): + with self.lock: + self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) + if self.cursor.rowcount < 1: + raise KeyError(key) + + def __contains__(self, key): + cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) + for (has,) in cs: + has = bool(has) + return has + + def items(self): + kvs = self.cursor.execute("SELECT k, v FROM zarr") + yield from kvs + + def keys(self): + ks = self.cursor.execute("SELECT k FROM zarr") + for (k,) in ks: + yield k + + def values(self): + vs = self.cursor.execute("SELECT v FROM zarr") + for (v,) in vs: + yield v + + def __iter__(self): + return self.keys() + + def __len__(self): + cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") + for (c,) in cs: + return c + + def update(self, *args, **kwargs): + args += (kwargs,) + + kv_list = [] + for dct in args: + for k, v in dct.items(): + v = ensure_contiguous_ndarray_like(v) + + # Accumulate key-value pairs for storage + kv_list.append((k, v)) + + with self.lock: + self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) + + def listdir(self, path=None): + path = normalize_storage_path(path) + sep = "_" if path == "" else "/" + keys = self.cursor.execute( + """ + SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( + SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m + FROM zarr WHERE k LIKE (? || "{sep}%") + ) ORDER BY l ASC + """.format(sep=sep), + (path, path), + ) + keys = list(map(operator.itemgetter(0), keys)) + return keys + + def getsize(self, path=None): + path = normalize_storage_path(path) + size = self.cursor.execute( + """ + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) 
+ 1), "/"), "/") + """, + (path, path), + ) + for (s,) in size: + return s + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + with self.lock: + self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (path,)) + else: + self.clear() + + def clear(self): + with self.lock: + self.cursor.executescript( + """ + BEGIN TRANSACTION; + DROP TABLE zarr; + CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); + COMMIT TRANSACTION; + """ + ) + + +class MongoDBStore(Store): + """Storage class using MongoDB. + + .. note:: This is an experimental feature. + + Requires the `pymongo `_ + package to be installed. + + Parameters + ---------- + database : string + Name of database + collection : string + Name of collection + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `pymongo.MongoClient` function. + + Notes + ----- + The maximum chunksize in MongoDB documents is 16 MB. + + """ + + _key = "key" + _value = "value" + + def __init__( + self, + database="mongodb_zarr", + collection="zarr_collection", + dimension_separator=None, + **kwargs, + ): + import pymongo + + self._database = database + self._collection = collection + self._dimension_separator = dimension_separator + self._kwargs = kwargs + + self.client = pymongo.MongoClient(**self._kwargs) + self.db = self.client.get_database(self._database) + self.collection = self.db.get_collection(self._collection) + + def __getitem__(self, key): + doc = self.collection.find_one({self._key: key}) + + if doc is None: + raise KeyError(key) + else: + return doc[self._value] + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.collection.replace_one( + {self._key: key}, {self._key: key, self._value: value}, upsert=True + ) + + def __delitem__(self, key): + result = self.collection.delete_many({self._key: key}) + if not result.deleted_count == 1: + raise KeyError(key) + + def __iter__(self): + for f in self.collection.find({}): + yield f[self._key] + + def __len__(self): + return self.collection.count_documents({}) + + def __getstate__(self): + return self._database, self._collection, self._kwargs + + def __setstate__(self, state): + database, collection, kwargs = state + self.__init__(database=database, collection=collection, **kwargs) + + def close(self): + """Cleanup client resources and disconnect from MongoDB.""" + self.client.close() + + def clear(self): + """Remove all items from store.""" + self.collection.delete_many({}) + + +class RedisStore(Store): + """Storage class using Redis. + + .. note:: This is an experimental feature. + + Requires the `redis `_ + package to be installed. + + Parameters + ---------- + prefix : string + Name of prefix for Redis keys + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `redis.Redis` function. 
+ + """ + + def __init__(self, prefix="zarr", dimension_separator=None, **kwargs): + import redis + + self._prefix = prefix + self._kwargs = kwargs + self._dimension_separator = dimension_separator + + self.client = redis.Redis(**kwargs) + + def _key(self, key): + return "{prefix}:{key}".format(prefix=self._prefix, key=key) + + def __getitem__(self, key): + return self.client[self._key(key)] + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.client[self._key(key)] = value + + def __delitem__(self, key): + count = self.client.delete(self._key(key)) + if not count: + raise KeyError(key) + + def keylist(self): + offset = len(self._key("")) # length of prefix + return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] + + def keys(self): + yield from self.keylist() + + def __iter__(self): + yield from self.keys() + + def __len__(self): + return len(self.keylist()) + + def __getstate__(self): + return self._prefix, self._kwargs + + def __setstate__(self, state): + prefix, kwargs = state + self.__init__(prefix=prefix, **kwargs) + + def clear(self): + for key in self.keys(): + del self[key] + + class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_core.py b/tests/test_core.py index d996af5563..6303371793 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,4 +1,6 @@ import atexit +import os +import sys import pickle import shutil from typing import Any, Literal, Optional, Tuple, Union @@ -35,19 +37,26 @@ from zarr.core import Array from zarr.meta import json_loads +from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( + ABSStore, + DBMStore, DirectoryStore, FSStore, KVStore, + LMDBStore, LRUStoreCache, NestedDirectoryStore, + SQLiteStore, + atexit_rmglob, + atexit_rmtree, init_array, init_group, normalize_store_arg, ) from zarr.util import buffer_size -from .util import have_fsspec +from .util import abs_container, skip_test_env_var, have_fsspec, mktemp # noinspection PyMethodMayBeStatic @@ -1646,6 +1655,24 @@ def test_array_init_from_dict(): assert isinstance(a.store, KVStore) +@skip_test_env_var("ZARR_TEST_ABS") +class TestArrayWithABSStore(TestArray): + def create_store(self): + client = abs_container() + store = ABSStore(client=client) + store.rmdir() + return store + + @pytest.mark.xfail + def test_nbytes_stored(self): + return super().test_nbytes_stored() + + @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") + def test_pickle(self): + # internal attribute on ContainerClient isn't serializable for py36 and earlier + super().test_pickle() + + class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): def create_store(self): path = mkdtemp() @@ -1663,6 +1690,366 @@ def expected(self): ] +class TestArrayWithN5Store(TestArrayWithDirectoryStore): + def create_store(self): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = N5Store(path) + return store + + def test_array_0d(self): + # test behaviour for array with 0 dimensions + + # setup + a = np.zeros(()) + z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) + + # check properties + assert a.ndim == z.ndim + assert a.shape == z.shape + assert a.size == z.size + assert a.dtype == z.dtype + assert a.nbytes == z.nbytes + with pytest.raises(TypeError): + len(z) + assert () == z.chunks + assert 1 == z.nchunks + assert (1,) == z.cdata_shape + # compressor always None - no point in compressing a 
single value + assert z.compressor.compressor_config is None + + # check __getitem__ + b = z[...] + assert isinstance(b, np.ndarray) + assert a.shape == b.shape + assert a.dtype == b.dtype + assert_array_equal(a, np.array(z)) + assert_array_equal(a, z[...]) + assert a[()] == z[()] + with pytest.raises(IndexError): + z[0] + with pytest.raises(IndexError): + z[:] + + # check __setitem__ + z[...] = 42 + assert 42 == z[()] + z[()] = 43 + assert 43 == z[()] + with pytest.raises(IndexError): + z[0] = 42 + with pytest.raises(IndexError): + z[:] = 42 + with pytest.raises(ValueError): + z[...] = np.array([1, 2, 3]) + + def test_array_1d_fill_value(self): + nvalues = 1050 + dtype = np.int32 + for fill_value in 0, None: + a = np.arange(nvalues, dtype=dtype) + f = np.empty_like(a) + f.fill(fill_value or 0) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, fill_value=fill_value) + z[190:310] = a[190:310] + + assert_array_equal(f[:190], z[:190]) + assert_array_equal(a[190:310], z[190:310]) + assert_array_equal(f[310:], z[310:]) + + with pytest.raises(ValueError): + z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) + + def test_nchunks_initialized(self): + fill_value = 0 + dtype = "int" + z = self.create_array( + shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=True + ) + + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store["foo"] = b"bar" + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == z.nchunks_initialized + + # second round of similar tests with write_empty_chunks set to + # False + z = self.create_array( + shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=False + ) + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == z.nchunks_initialized + z[:] = z.fill_value + assert 0 == z.nchunks_initialized + + def test_array_order(self): + # N5 only supports 'C' at the moment + with pytest.raises(ValueError): + self.create_array(shape=(10, 11), chunks=(10, 11), dtype="i8", order="F") + + # 1D + a = np.arange(1050) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, order="C") + assert z.order == "C" + assert z[:].flags.c_contiguous + z[:] = a + assert_array_equal(a, z[:]) + + # 2D + a = np.arange(10000).reshape((100, 100)) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype, order="C") + + assert z.order == "C" + assert z[:].flags.c_contiguous + z[:] = a + actual = z[:] + assert_array_equal(a, actual) + + def test_structured_array(self): + d = np.array( + [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], + dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], + ) + fill_values = None, b"", (b"zzz", 42, 16.8) + with pytest.raises(TypeError): + self.check_structured_array(d, fill_values) + + def test_structured_array_subshapes(self): + d = np.array( + [ + (0, ((0, 1, 2), (1, 2, 3)), b"aaa"), + (1, ((1, 2, 3), (2, 3, 4)), b"bbb"), + (2, ((2, 3, 4), (3, 4, 5)), b"ccc"), + ], + dtype=[("foo", "i8"), ("bar", "(2, 3)f4"), ("baz", "S3")], + ) + fill_values = None, b"", (0, ((0, 0, 0), (1, 1, 1)), b"zzz") + with pytest.raises(TypeError): + self.check_structured_array(d, fill_values) + + def test_structured_array_nested(self): + d = np.array( + [ + (0, (0, ((0, 1), (1, 2), (2, 3)), 0), b"aaa"), + (1, (1, ((1, 
2), (2, 3), (3, 4)), 1), b"bbb"), + (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b"ccc"), + ], + dtype=[ + ("foo", "i8"), + ("bar", [("foo", "i4"), ("bar", "(3, 2)f4"), ("baz", "u1")]), + ("baz", "S3"), + ], + ) + fill_values = None, b"", (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b"zzz") + with pytest.raises(TypeError): + self.check_structured_array(d, fill_values) + + def test_dtypes(self): + # integers + for dtype in "u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8": + z = self.create_array(shape=10, chunks=3, dtype=dtype) + assert z.dtype == np.dtype(dtype) + a = np.arange(z.shape[0], dtype=dtype) + z[:] = a + assert_array_equal(a, z[:]) + + # floats + for dtype in "f2", "f4", "f8": + z = self.create_array(shape=10, chunks=3, dtype=dtype) + assert z.dtype == np.dtype(dtype) + a = np.linspace(0, 1, z.shape[0], dtype=dtype) + z[:] = a + assert_array_almost_equal(a, z[:]) + + # check that datetime generic units are not allowed + with pytest.raises(ValueError): + self.create_array(shape=100, dtype="M8") + with pytest.raises(ValueError): + self.create_array(shape=100, dtype="m8") + + def test_object_arrays(self): + # an object_codec is required for object arrays + with pytest.raises(ValueError): + self.create_array(shape=10, chunks=3, dtype=object) + + # an object_codec is required for object arrays, but allow to be provided via + # filters to maintain API backwards compatibility + with pytest.raises(ValueError): + with pytest.warns(FutureWarning): + self.create_array(shape=10, chunks=3, dtype=object, filters=[MsgPack()]) + + # create an object array using an object codec + with pytest.raises(ValueError): + self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack()) + + def test_object_arrays_vlen_text(self): + data = np.array(greetings * 1000, dtype=object) + + with pytest.raises(ValueError): + self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) + + # convenience API + with pytest.raises(ValueError): + self.create_array(shape=data.shape, dtype=str) + + def test_object_arrays_vlen_bytes(self): + greetings_bytes = [g.encode("utf8") for g in greetings] + data = np.array(greetings_bytes * 1000, dtype=object) + + with pytest.raises(ValueError): + self.create_array(shape=data.shape, dtype=object, object_codec=VLenBytes()) + + # convenience API + with pytest.raises(ValueError): + self.create_array(shape=data.shape, dtype=bytes) + + def test_object_arrays_vlen_array(self): + data = np.array( + [np.array([1, 3, 7]), np.array([5]), np.array([2, 8, 12])] * 1000, dtype=object + ) + + codecs = VLenArray(int), VLenArray("> 16) +# assert perm == "0o644" +# info = z.getinfo(baz_key) +# perm = oct(info.external_attr >> 16) +# # only for posix platforms +# if os.name == "posix": +# if self.version == 2: +# assert perm == "0o40775" +# else: +# # baz/ on v2, but baz on v3, so not a directory +# assert perm == "0o644" +# z.close() + +# def test_store_and_retrieve_ndarray(self): +# store = ZipStore("data/store.zip") +# x = np.array([[1, 2], [3, 4]]) +# store["foo"] = x +# y = np.frombuffer(store["foo"], dtype=x.dtype).reshape(x.shape) +# assert np.array_equiv(y, x) + + +# class TestDBMStore(StoreTests): +# def create_store(self, dimension_separator=None): +# path = mktemp(suffix=".anydbm") +# atexit.register(atexit_rmglob, path + "*") +# # create store using default dbm implementation +# store = DBMStore(path, flag="n", dimension_separator=dimension_separator) +# return store + +# def test_context_manager(self): +# with self.create_store() as store: +# store[self.root + "foo"] = b"bar" 
+# store[self.root + "baz"] = b"qux" +# assert 2 == len(store) + + +# class TestDBMStoreDumb(TestDBMStore): +# def create_store(self, **kwargs): +# path = mktemp(suffix=".dumbdbm") +# atexit.register(atexit_rmglob, path + "*") + +# import dbm.dumb as dumbdbm + +# store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) +# return store + + +# class TestDBMStoreGnu(TestDBMStore): +# def create_store(self, **kwargs): +# gdbm = pytest.importorskip("dbm.gnu") +# path = mktemp(suffix=".gdbm") # pragma: no cover +# atexit.register(os.remove, path) # pragma: no cover +# store = DBMStore( +# path, flag="n", open=gdbm.open, write_lock=False, **kwargs +# ) # pragma: no cover +# return store # pragma: no cover + + +# class TestDBMStoreNDBM(TestDBMStore): +# def create_store(self, **kwargs): +# ndbm = pytest.importorskip("dbm.ndbm") +# path = mktemp(suffix=".ndbm") # pragma: no cover +# atexit.register(atexit_rmglob, path + "*") # pragma: no cover +# store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover +# return store # pragma: no cover + + +# class TestDBMStoreBerkeleyDB(TestDBMStore): +# def create_store(self, **kwargs): +# bsddb3 = pytest.importorskip("bsddb3") +# path = mktemp(suffix=".dbm") +# atexit.register(os.remove, path) +# store = DBMStore(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) +# return store + + +# class TestLMDBStore(StoreTests): +# def create_store(self, **kwargs): +# pytest.importorskip("lmdb") +# path = mktemp(suffix=".lmdb") +# atexit.register(atexit_rmtree, path) +# buffers = True +# store = LMDBStore(path, buffers=buffers, **kwargs) +# return store + +# def test_context_manager(self): +# with self.create_store() as store: +# store[self.root + "foo"] = b"bar" +# store[self.root + "baz"] = b"qux" +# assert 2 == len(store) + + +# class TestSQLiteStore(StoreTests): +# def create_store(self, **kwargs): +# pytest.importorskip("sqlite3") +# path = mktemp(suffix=".db") +# atexit.register(atexit_rmtree, path) +# store = SQLiteStore(path, **kwargs) +# return store + +# def test_underscore_in_name(self): +# path = mktemp(suffix=".db") +# atexit.register(atexit_rmtree, path) +# store = SQLiteStore(path) +# store["a"] = b"aaa" +# store["a_b"] = b"aa_bb" +# store.rmdir("a") +# assert "a_b" in store + + +# class TestSQLiteStoreInMemory(TestSQLiteStore): +# def create_store(self, **kwargs): +# pytest.importorskip("sqlite3") +# store = SQLiteStore(":memory:", **kwargs) +# return store + +# def test_pickle(self): + +# # setup store +# store = self.create_store() +# store[self.root + "foo"] = b"bar" +# store[self.root + "baz"] = b"quux" + +# # round-trip through pickle +# with pytest.raises(PicklingError): +# pickle.dumps(store) + + +# @skip_test_env_var("ZARR_TEST_MONGO") +# class TestMongoDBStore(StoreTests): +# def create_store(self, **kwargs): +# pytest.importorskip("pymongo") +# store = MongoDBStore( +# host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs +# ) +# # start with an empty store +# store.clear() +# return store + + +# @skip_test_env_var("ZARR_TEST_REDIS") +# class TestRedisStore(StoreTests): +# def create_store(self, **kwargs): +# # TODO: this is the default host for Redis on Travis, +# # we probably want to generalize this though +# pytest.importorskip("redis") +# store = RedisStore(host="localhost", port=6379, **kwargs) +# # start with an empty store +# store.clear() +# return store + + # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict From ff406dd118bc66cb957c68081607bd4a4cecf40f Mon 
Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 13:16:55 +0200 Subject: [PATCH 13/22] chore: remove n5 storage --- docs/api.rst | 1 - docs/tutorial.rst | 14 - src/zarr/__init__.py | 1 - src/zarr/n5.py | 896 ----------------------------------------- src/zarr/storage.py | 4 - tests/test_core.py | 298 -------------- tests/test_creation.py | 35 -- tests/test_n5.py | 53 --- tests/test_storage.py | 272 +------------ 9 files changed, 1 insertion(+), 1573 deletions(-) delete mode 100644 src/zarr/n5.py delete mode 100644 tests/test_n5.py diff --git a/docs/api.rst b/docs/api.rst index e200dd908d..b784f0d006 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -8,7 +8,6 @@ API reference api/core api/hierarchy api/storage - api/n5 api/convenience api/codecs api/attrs diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 4099bac1c8..e259fdf079 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -845,20 +845,6 @@ with `MongoDB `_ (an object oriented NoSQL database). respectively require the `redis-py `_ and `pymongo `_ packages to be installed. -For compatibility with the `N5 `_ data format, Zarr also provides -an N5 backend (this is currently an experimental feature). Similar to the zip storage class, an -:class:`zarr.n5.N5Store` can be instantiated directly:: - - >>> store = zarr.N5Store('data/example.n5') - >>> root = zarr.group(store=store) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - -For convenience, the N5 backend will automatically be chosen when the filename -ends with `.n5`:: - - >>> root = zarr.open('data/example.n5', mode='w') - Distributed/cloud storage ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 601b1295ab..9647f2dc0b 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -30,7 +30,6 @@ ) from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group -from zarr.n5 import N5Store, N5FSStore from zarr.storage import ( ABSStore, DBMStore, diff --git a/src/zarr/n5.py b/src/zarr/n5.py deleted file mode 100644 index 79bab20576..0000000000 --- a/src/zarr/n5.py +++ /dev/null @@ -1,896 +0,0 @@ -"""This module contains a storage class and codec to support the N5 format. -""" -import os -import struct -import sys -from typing import Any, Dict, Optional, cast -import warnings - -import numpy as np -from numcodecs.abc import Codec -from numcodecs.compat import ndarray_copy -from numcodecs.registry import get_codec, register_codec - -from .meta import ZARR_FORMAT, json_dumps, json_loads -from .storage import FSStore -from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path -from .storage import array_meta_key as zarr_array_meta_key -from .storage import attrs_key as zarr_attrs_key -from .storage import group_meta_key as zarr_group_meta_key - -N5_FORMAT = "2.0.0" - -zarr_to_n5_keys = [ - ("chunks", "blockSize"), - ("dtype", "dataType"), - ("compressor", "compression"), - ("shape", "dimensions"), -] -n5_attrs_key = "attributes.json" -n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] - - -class N5Store(NestedDirectoryStore): - """Storage class using directories and files on a standard file system, - following the N5 format (https://github.com/saalfeldlab/n5). - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. 
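The tutorial paragraph removed above described the `.n5` filename convenience; the matching special case in `normalize_store_arg` is dropped in the src/zarr/storage.py hunk further down. A minimal sketch of the behaviour once this patch is applied (the path is illustrative only)::

    from zarr.storage import DirectoryStore, normalize_store_arg

    # with the ".n5" branch removed, an .n5 path falls through to a DirectoryStore
    store = normalize_store_arg("data/example.n5", mode="w")
    assert isinstance(store, DirectoryStore)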
- normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5Store('data/array.n5') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5Store('data/group.n5') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - - This is an experimental feature. - - Safe to write in multiple threads or processes. - - """ - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs: - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # remove previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - super().__delitem__(key_new) - - def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) - if key_new not in self: - return False - 
# group if not a dataset (attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = invert_chunk_coords(key) - else: - key_new = key - - return super().__contains__(key_new) - - def __eq__(self, other): - return isinstance(other, N5Store) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - path = cast(str, path) - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. - children = super().listdir(path=path) - - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and os.path.isdir(entry_path): - for dir_path, _, file_names in os.walk(entry_path): - for file_name in file_names: - file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path + os.path.sep)[1] - new_child = rel_path.replace(os.path.sep, ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(n5_attrs_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - return sorted(children) - - else: - return children - - def _load_n5_attrs(self, path: str) -> Dict[str, Any]: - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - attrs_key = os.path.join(path, n5_attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: str): - if path is None: - attrs_key = n5_attrs_key - else: - if not path.endswith(n5_attrs_key): - attrs_key = os.path.join(path, n5_attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -class N5FSStore(FSStore): - """Implementation of the N5 format (https://github.com/saalfeldlab/n5) - using `fsspec`, which allows storage on a variety of filesystems. Based - on `zarr.N5Store`. - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 
'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - - Store a group:: - - >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - - Notes - ----- - This is an experimental feature. - Safe to write in multiple threads or processes. - - Be advised that the `_dimension_separator` property of this store - (and arrays it creates) is ".", but chunks saved by this store will - in fact be "/" separated, as proscribed by the N5 format. - - This is counter-intuitive (to say the least), but not arbitrary. - Chunks in N5 format are stored with reversed dimension order - relative to Zarr chunks: a chunk of a 3D Zarr array would be stored - on a file system as `/0/1/2`, but in N5 the same chunk would be - stored as `/2/1/0`. Therefore, stores targeting N5 must intercept - chunk keys and flip the order of the dimensions before writing to - storage, and this procedure requires chunk keys with "." separated - dimensions, hence the Zarr arrays targeting N5 have the deceptive - "." dimension separator. - """ - - _array_meta_key = "attributes.json" - _group_meta_key = "attributes.json" - _attrs_key = "attributes.json" - - def __init__(self, *args, **kwargs): - if "dimension_separator" in kwargs: - warnings.warn("Keyword argument `dimension_separator` will be ignored") - kwargs["dimension_separator"] = "." 
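The Notes above explain why chunk keys have their dimension order flipped on the way to storage. A standalone sketch of that flip; the regex here is an assumption standing in for the `_prog_ckey` pattern imported from `zarr.storage`::

    import re

    _ckey = re.compile(r"^\d+(\.\d+)*$")  # assumed stand-in for _prog_ckey

    def flip_chunk_key(key: str) -> str:
        """Reverse the dimension order of a '.'-separated chunk key, N5-style."""
        prefix, _, last = key.rpartition("/")
        if _ckey.match(last):
            last = "/".join(reversed(last.split(".")))
        return f"{prefix}/{last}" if prefix else last

    assert flip_chunk_key("foo/bar/0.1.2") == "foo/bar/2/1/0"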
- super().__init__(*args, **kwargs) - - @staticmethod - def _swap_separator(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return key - - def _normalize_key(self, key: str): - if is_chunk_key(key): - key = invert_chunk_coords(key) - - key = normalize_storage_path(key).lstrip("/") - if key: - *bits, end = key.split("/") - - if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): - end = end.replace(".", "/") - key = "/".join(bits + [end]) - return key.lower() if self.normalize_keys else key - - def __getitem__(self, key: str) -> bytes: - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) - - return json_dumps(value) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) - return json_dumps(value) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - value = attrs_to_zarr(self._load_n5_attrs(key_new)) - - if len(value) == 0: - raise KeyError(key_new) - else: - return json_dumps(value) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - return super().__getitem__(key_new) - - def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**group_metadata_to_n5(json_loads(value))) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - top_level = key == zarr_array_meta_key - n5_attrs = self._load_n5_attrs(key_new) - n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) - - value = json_dumps(n5_attrs) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - - n5_attrs = self._load_n5_attrs(key_new) - zarr_attrs = json_loads(value) - - for k in n5_keywords: - if k in zarr_attrs.keys(): - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) - - # replace previous user attributes - for k in list(n5_attrs.keys()): - if k not in n5_keywords: - del n5_attrs[k] - - # add new user attributes - n5_attrs.update(**zarr_attrs) - - value = json_dumps(n5_attrs) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - - super().__setitem__(key_new, value) - - def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - elif is_chunk_key(key): - key_new = self._swap_separator(key) - else: - key_new = key - super().__delitem__(key_new) - - def __contains__(self, key: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) - if key_new not in self: - return False - # group if not a dataset 
(attributes do not contain 'dimensions') - return "dimensions" not in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) - # array if attributes contain 'dimensions' - return "dimensions" in self._load_n5_attrs(key_new) - - elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) - return self._contains_attrs(key_new) - - elif is_chunk_key(key): - key_new = self._swap_separator(key) - - else: - key_new = key - return super().__contains__(key_new) - - def __eq__(self, other: Any): - return isinstance(other, N5FSStore) and self.path == other.path - - def listdir(self, path: Optional[str] = None): - if path is not None: - path = invert_chunk_coords(path) - - # We can't use NestedDirectoryStore's listdir, as it requires - # array_meta_key to be present in array directories, which this store - # doesn't provide. - children = super().listdir(path=path) - if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._array_meta_key) - children.append(zarr_array_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - - # special handling of directories containing an array to map - # inverted nested chunk keys back to standard chunk keys - new_children = [] - root_path = self.dir_path(path) - for entry in children: - entry_path = os.path.join(root_path, entry) - if _prog_number.match(entry) and self.fs.isdir(entry_path): - for file_name in self.fs.find(entry_path): - file_path = os.path.join(root_path, file_name) - rel_path = file_path.split(root_path)[1] - new_child = rel_path.lstrip("/").replace("/", ".") - new_children.append(invert_chunk_coords(new_child)) - else: - new_children.append(entry) - return sorted(new_children) - - elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files - children.remove(self._group_meta_key) - children.append(zarr_group_meta_key) - if self._contains_attrs(path): - children.append(zarr_attrs_key) - return sorted(children) - else: - return children - - def _load_n5_attrs(self, path: str): - try: - s = super().__getitem__(path) - return json_loads(s) - except KeyError: - return {} - - def _is_group(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - n5_attrs = self._load_n5_attrs(attrs_key) - return len(n5_attrs) > 0 and "dimensions" not in n5_attrs - - def _is_array(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - attrs_key = os.path.join(path, self._attrs_key) - - return "dimensions" in self._load_n5_attrs(attrs_key) - - def _contains_attrs(self, path: Optional[str]): - if path is None: - attrs_key = self._attrs_key - else: - if not path.endswith(self._attrs_key): - attrs_key = os.path.join(path, self._attrs_key) - else: - attrs_key = path - - attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) - return len(attrs) > 0 - - -def is_chunk_key(key: str): - rv = False - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - rv = bool(_prog_ckey.match(last_segment)) - return rv - - -def invert_chunk_coords(key: str): - segments = list(key.split("/")) - if segments: - last_segment = segments[-1] - if _prog_ckey.match(last_segment): - coords = list(last_segment.split(".")) - last_segment = "/".join(coords[::-1]) - segments = segments[:-1] + [last_segment] - key = "/".join(segments) - return 
key - - -def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from zarr to N5 format.""" - del group_metadata["zarr_format"] - # TODO: This should only exist at the top-level - group_metadata["n5"] = N5_FORMAT - return group_metadata - - -def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: - """Convert group metadata from N5 to zarr format.""" - # This only exists at the top level - group_metadata.pop("n5", None) - group_metadata["zarr_format"] = ZARR_FORMAT - return group_metadata - - -def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: - """Convert array metadata from zarr to N5 format. If the `top_level` keyword argument is True, - then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" - - for f, t in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - del array_metadata["zarr_format"] - if top_level: - array_metadata["n5"] = N5_FORMAT - try: - dtype = np.dtype(array_metadata["dataType"]) - except TypeError: - raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") - - array_metadata["dataType"] = dtype.name - array_metadata["dimensions"] = array_metadata["dimensions"][::-1] - array_metadata["blockSize"] = array_metadata["blockSize"][::-1] - - if "fill_value" in array_metadata: - if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: - raise ValueError( - f"""Received fill_value = {array_metadata['fill_value']}, - but N5 only supports fill_value = 0""" - ) - del array_metadata["fill_value"] - - if "order" in array_metadata: - if array_metadata["order"] != "C": - raise ValueError( - f"Received order = {array_metadata['order']}, but N5 only supports order = C" - ) - del array_metadata["order"] - - if "filters" in array_metadata: - if array_metadata["filters"] != [] and array_metadata["filters"] is not None: - raise ValueError("Received filters, but N5 storage does not support zarr filters") - del array_metadata["filters"] - - assert "compression" in array_metadata - compressor_config = array_metadata["compression"] - compressor_config = compressor_config_to_n5(compressor_config) - array_metadata["compression"] = compressor_config - - if "dimension_separator" in array_metadata: - del array_metadata["dimension_separator"] - - return array_metadata - - -def array_metadata_to_zarr( - array_metadata: Dict[str, Any], top_level: bool = False -) -> Dict[str, Any]: - """Convert array metadata from N5 to zarr format. - If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" - for t, f in zarr_to_n5_keys: - array_metadata[t] = array_metadata.pop(f) - if top_level: - array_metadata.pop("n5") - array_metadata["zarr_format"] = ZARR_FORMAT - - array_metadata["shape"] = array_metadata["shape"][::-1] - array_metadata["chunks"] = array_metadata["chunks"][::-1] - array_metadata["fill_value"] = 0 # also if None was requested - array_metadata["order"] = "C" - array_metadata["filters"] = [] - array_metadata["dimension_separator"] = "." 
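To make the conversions above concrete: `zarr_to_n5_keys` renames chunks/dtype/compressor/shape to blockSize/dataType/compression/dimensions, and the shape and chunk axes are reversed on the N5 side. Roughly, for a toy array (values chosen for illustration only)::

    zarr_meta = {
        "zarr_format": 2,
        "shape": [100, 200, 300],
        "chunks": [10, 20, 30],
        "dtype": "<f4",
        "compressor": None,
        "fill_value": 0,
        "order": "C",
        "filters": [],
    }
    # array_metadata_to_n5(zarr_meta, top_level=True) yields, roughly:
    n5_attrs = {
        "dimensions": [300, 200, 100],   # shape, reversed
        "blockSize": [30, 20, 10],       # chunks, reversed
        "dataType": "float32",           # np.dtype("<f4").name
        "compression": {"type": "raw"},  # compressor=None maps to raw
        "n5": "2.0.0",                   # only added at the top level
    }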
- array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str - - compressor_config = array_metadata["compressor"] - compressor_config = compressor_config_to_zarr(compressor_config) - array_metadata["compressor"] = { - "id": N5ChunkWrapper.codec_id, - "compressor_config": compressor_config, - "dtype": array_metadata["dtype"], - "chunk_shape": array_metadata["chunks"], - } - - return array_metadata - - -def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: - """Get all zarr attributes from an N5 attributes dictionary (i.e., - all non-keyword attributes).""" - - # remove all N5 keywords - for n5_key in n5_keywords: - if n5_key in attrs: - del attrs[n5_key] - - return attrs - - -def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: - return {"type": "raw"} - else: - _compressor_config = compressor_config - - # peel wrapper, if present - if _compressor_config["id"] == N5ChunkWrapper.codec_id: - _compressor_config = _compressor_config["compressor_config"] - - codec_id = _compressor_config["id"] - n5_config = {"type": codec_id} - - if codec_id == "bz2": - n5_config["type"] = "bzip2" - n5_config["blockSize"] = _compressor_config["level"] - - elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] - n5_config["clevel"] = _compressor_config["clevel"] - n5_config["shuffle"] = _compressor_config["shuffle"] - n5_config["blocksize"] = _compressor_config["blocksize"] - - elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. - # Note: 4 is the default, which is lzma.CHECK_CRC64. - if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: - n5_config["type"] = "xz" - else: - warnings.warn( - "Not all N5 implementations support lzma compression (yet). You " - "might not be able to open the dataset with another N5 library.", - RuntimeWarning, - ) - n5_config["format"] = _compressor_config["format"] - n5_config["check"] = _compressor_config["check"] - n5_config["filters"] = _compressor_config["filters"] - - # The default is lzma.PRESET_DEFAULT, which is 6. 
- if _compressor_config["preset"]: - n5_config["preset"] = _compressor_config["preset"] - else: - n5_config["preset"] = 6 - - elif codec_id == "zlib": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = True - - elif codec_id == "gzip": - n5_config["type"] = "gzip" - n5_config["level"] = _compressor_config["level"] - n5_config["useZlib"] = False - - else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) - - return n5_config - - -def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] - zarr_config = {"id": codec_id} - - if codec_id == "bzip2": - zarr_config["id"] = "bz2" - zarr_config["level"] = compressor_config["blockSize"] - - elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] - zarr_config["clevel"] = compressor_config["clevel"] - zarr_config["shuffle"] = compressor_config["shuffle"] - zarr_config["blocksize"] = compressor_config["blocksize"] - - elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] - zarr_config["check"] = compressor_config["check"] - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = compressor_config["filters"] - - elif codec_id == "xz": - zarr_config["id"] = "lzma" - zarr_config["format"] = 1 # lzma.FORMAT_XZ - zarr_config["check"] = -1 - zarr_config["preset"] = compressor_config["preset"] - zarr_config["filters"] = None - - elif codec_id == "gzip": - if "useZlib" in compressor_config and compressor_config["useZlib"]: - zarr_config["id"] = "zlib" - zarr_config["level"] = compressor_config["level"] - else: - zarr_config["id"] = "gzip" - zarr_config["level"] = compressor_config["level"] - - elif codec_id == "raw": - return None - - else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) - - return zarr_config - - -class N5ChunkWrapper(Codec): - codec_id = "n5_wrapper" - - def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) - self.chunk_shape = tuple(chunk_shape) - # is the dtype a little endian format? 
- self._little_endian = self.dtype.byteorder == "<" or ( - self.dtype.byteorder == "=" and sys.byteorder == "little" - ) - - if compressor: - if compressor_config is not None: - raise ValueError("Only one of compressor_config or compressor should be given.") - compressor_config = compressor.get_config() - - if compressor_config is None and compressor is None or compressor_config["id"] == "raw": - self.compressor_config = None - self._compressor = None - else: - self._compressor = get_codec(compressor_config) - self.compressor_config = self._compressor.get_config() - - def get_config(self): - config = {"id": self.codec_id, "compressor_config": self.compressor_config} - return config - - def encode(self, chunk): - assert chunk.flags.c_contiguous - - header = self._create_header(chunk) - chunk = self._to_big_endian(chunk) - - if self._compressor: - return header + self._compressor.encode(chunk) - else: - return header + chunk.tobytes(order="A") - - def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) - chunk = chunk[len_header:] - - if out is not None: - # out should only be used if we read a complete chunk - assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( - self.chunk_shape, chunk_shape - ) - - if self._compressor: - self._compressor.decode(chunk, out) - else: - ndarray_copy(chunk, out) - - # we can byteswap in-place - if self._little_endian: - out.byteswap(True) - - return out - - else: - if self._compressor: - chunk = self._compressor.decode(chunk) - - # more expensive byteswap - chunk = self._from_big_endian(chunk) - - # read partial chunk - if chunk_shape != self.chunk_shape: - chunk = np.frombuffer(chunk, dtype=self.dtype) - chunk = chunk.reshape(chunk_shape) - complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) - target_slices = tuple(slice(0, s) for s in chunk_shape) - complete_chunk[target_slices] = chunk - chunk = complete_chunk - - return chunk - - @staticmethod - def _create_header(chunk): - mode = struct.pack(">H", 0) - num_dims = struct.pack(">H", len(chunk.shape)) - shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) - - return mode + num_dims + shape - - @staticmethod - def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] - shape = tuple( - struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) - )[::-1] - - len_header = 4 + num_dims * 4 - - return len_header, shape - - def _to_big_endian(self, data): - # assumes data is ndarray - - if self._little_endian: - return data.byteswap() - return data - - def _from_big_endian(self, data): - # assumes data is byte array in big endian - - if not self._little_endian: - return data - - a = np.frombuffer(data, self.dtype.newbyteorder(">")) - return a.astype(self.dtype) - - -register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index a7bd22a6b9..315eb0bbd1 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -144,10 +144,6 @@ def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore raise ValueError("storage_options passed with non-fsspec path") if store.endswith(".zip"): return ZipStore(store, mode=mode) - elif store.endswith(".n5"): - from zarr.n5 import N5Store - - return N5Store(store) else: return DirectoryStore(store) else: diff --git a/tests/test_core.py b/tests/test_core.py index 6303371793..6b2095c0c9 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -37,7 +37,6 @@ from zarr.core 
import Array from zarr.meta import json_loads -from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( ABSStore, DBMStore, @@ -1690,303 +1689,6 @@ def expected(self): ] -class TestArrayWithN5Store(TestArrayWithDirectoryStore): - def create_store(self): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = N5Store(path) - return store - - def test_array_0d(self): - # test behaviour for array with 0 dimensions - - # setup - a = np.zeros(()) - z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) - - # check properties - assert a.ndim == z.ndim - assert a.shape == z.shape - assert a.size == z.size - assert a.dtype == z.dtype - assert a.nbytes == z.nbytes - with pytest.raises(TypeError): - len(z) - assert () == z.chunks - assert 1 == z.nchunks - assert (1,) == z.cdata_shape - # compressor always None - no point in compressing a single value - assert z.compressor.compressor_config is None - - # check __getitem__ - b = z[...] - assert isinstance(b, np.ndarray) - assert a.shape == b.shape - assert a.dtype == b.dtype - assert_array_equal(a, np.array(z)) - assert_array_equal(a, z[...]) - assert a[()] == z[()] - with pytest.raises(IndexError): - z[0] - with pytest.raises(IndexError): - z[:] - - # check __setitem__ - z[...] = 42 - assert 42 == z[()] - z[()] = 43 - assert 43 == z[()] - with pytest.raises(IndexError): - z[0] = 42 - with pytest.raises(IndexError): - z[:] = 42 - with pytest.raises(ValueError): - z[...] = np.array([1, 2, 3]) - - def test_array_1d_fill_value(self): - nvalues = 1050 - dtype = np.int32 - for fill_value in 0, None: - a = np.arange(nvalues, dtype=dtype) - f = np.empty_like(a) - f.fill(fill_value or 0) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, fill_value=fill_value) - z[190:310] = a[190:310] - - assert_array_equal(f[:190], z[:190]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(f[310:], z[310:]) - - with pytest.raises(ValueError): - z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) - - def test_nchunks_initialized(self): - fill_value = 0 - dtype = "int" - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=True - ) - - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store["foo"] = b"bar" - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - - # second round of similar tests with write_empty_chunks set to - # False - z = self.create_array( - shape=100, chunks=10, fill_value=fill_value, dtype=dtype, write_empty_chunks=False - ) - z[:] = 42 - assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[z._chunk_key((0,))] - assert 9 == z.nchunks_initialized - z[:] = z.fill_value - assert 0 == z.nchunks_initialized - - def test_array_order(self): - # N5 only supports 'C' at the moment - with pytest.raises(ValueError): - self.create_array(shape=(10, 11), chunks=(10, 11), dtype="i8", order="F") - - # 1D - a = np.arange(1050) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, order="C") - assert z.order == "C" - assert z[:].flags.c_contiguous - z[:] = a - assert_array_equal(a, z[:]) - - # 2D - a = np.arange(10000).reshape((100, 100)) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype, order="C") - - assert z.order == "C" - assert z[:].flags.c_contiguous 
- z[:] = a - actual = z[:] - assert_array_equal(a, actual) - - def test_structured_array(self): - d = np.array( - [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], - dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], - ) - fill_values = None, b"", (b"zzz", 42, 16.8) - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_subshapes(self): - d = np.array( - [ - (0, ((0, 1, 2), (1, 2, 3)), b"aaa"), - (1, ((1, 2, 3), (2, 3, 4)), b"bbb"), - (2, ((2, 3, 4), (3, 4, 5)), b"ccc"), - ], - dtype=[("foo", "i8"), ("bar", "(2, 3)f4"), ("baz", "S3")], - ) - fill_values = None, b"", (0, ((0, 0, 0), (1, 1, 1)), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_structured_array_nested(self): - d = np.array( - [ - (0, (0, ((0, 1), (1, 2), (2, 3)), 0), b"aaa"), - (1, (1, ((1, 2), (2, 3), (3, 4)), 1), b"bbb"), - (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b"ccc"), - ], - dtype=[ - ("foo", "i8"), - ("bar", [("foo", "i4"), ("bar", "(3, 2)f4"), ("baz", "u1")]), - ("baz", "S3"), - ], - ) - fill_values = None, b"", (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b"zzz") - with pytest.raises(TypeError): - self.check_structured_array(d, fill_values) - - def test_dtypes(self): - # integers - for dtype in "u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.arange(z.shape[0], dtype=dtype) - z[:] = a - assert_array_equal(a, z[:]) - - # floats - for dtype in "f2", "f4", "f8": - z = self.create_array(shape=10, chunks=3, dtype=dtype) - assert z.dtype == np.dtype(dtype) - a = np.linspace(0, 1, z.shape[0], dtype=dtype) - z[:] = a - assert_array_almost_equal(a, z[:]) - - # check that datetime generic units are not allowed - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="M8") - with pytest.raises(ValueError): - self.create_array(shape=100, dtype="m8") - - def test_object_arrays(self): - # an object_codec is required for object arrays - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object) - - # an object_codec is required for object arrays, but allow to be provided via - # filters to maintain API backwards compatibility - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - self.create_array(shape=10, chunks=3, dtype=object, filters=[MsgPack()]) - - # create an object array using an object codec - with pytest.raises(ValueError): - self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack()) - - def test_object_arrays_vlen_text(self): - data = np.array(greetings * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=str) - - def test_object_arrays_vlen_bytes(self): - greetings_bytes = [g.encode("utf8") for g in greetings] - data = np.array(greetings_bytes * 1000, dtype=object) - - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=object, object_codec=VLenBytes()) - - # convenience API - with pytest.raises(ValueError): - self.create_array(shape=data.shape, dtype=bytes) - - def test_object_arrays_vlen_array(self): - data = np.array( - [np.array([1, 3, 7]), np.array([5]), np.array([2, 8, 12])] * 1000, dtype=object - ) - - codecs = VLenArray(int), VLenArray(" Date: Mon, 22 Apr 2024 13:22:53 +0200 Subject: [PATCH 14/22] chore: remove redis storage --- 
docs/api/storage.rst | 1 - docs/api/v3.rst | 1 - docs/tutorial.rst | 7 ---- src/zarr/__init__.py | 1 - src/zarr/storage.py | 69 ---------------------------------------- tests/test_storage.py | 13 -------- tests/test_storage_v3.py | 14 -------- 7 files changed, 106 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 4321837449..713a25a8fa 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -26,7 +26,6 @@ Storage (``zarr.storage``) .. automethod:: close .. autoclass:: MongoDBStore -.. autoclass:: RedisStore .. autoclass:: LRUStoreCache .. automethod:: invalidate diff --git a/docs/api/v3.rst b/docs/api/v3.rst index 7665b2ddd1..7e444f876b 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -50,7 +50,6 @@ You can also check ``Store type`` here (which indicates Zarr V3). .. autoclass:: MemoryStoreV3 .. autoclass:: DirectoryStoreV3 .. autoclass:: ZipStoreV3 -.. autoclass:: RedisStoreV3 .. autoclass:: MongoDBStoreV3 .. autoclass:: DBMStoreV3 .. autoclass:: LMDBStoreV3 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e259fdf079..cfd7dd7e07 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -838,13 +838,6 @@ Python is built with SQLite support):: >>> z[:] = 42 >>> store.close() -Also added in Zarr version 2.3 are two storage classes for interfacing with server-client -databases. The :class:`zarr.storage.RedisStore` class interfaces `Redis `_ -(an in memory data structure store), and the :class:`zarr.storage.MongoDB` class interfaces -with `MongoDB `_ (an object oriented NoSQL database). These stores -respectively require the `redis-py `_ and -`pymongo `_ packages to be installed. - Distributed/cloud storage ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 9647f2dc0b..04a94286f7 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -41,7 +41,6 @@ MemoryStore, MongoDBStore, NestedDirectoryStore, - RedisStore, SQLiteStore, TempStore, ZipStore, diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 315eb0bbd1..5301b7cf30 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -80,7 +80,6 @@ ) __doctest_requires__ = { - ("RedisStore", "RedisStore.*"): ["redis"], ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], } @@ -2672,74 +2671,6 @@ def clear(self): self.collection.delete_many({}) -class RedisStore(Store): - """Storage class using Redis. - - .. note:: This is an experimental feature. - - Requires the `redis `_ - package to be installed. - - Parameters - ---------- - prefix : string - Name of prefix for Redis keys - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `redis.Redis` function. 
- - """ - - def __init__(self, prefix="zarr", dimension_separator=None, **kwargs): - import redis - - self._prefix = prefix - self._kwargs = kwargs - self._dimension_separator = dimension_separator - - self.client = redis.Redis(**kwargs) - - def _key(self, key): - return "{prefix}:{key}".format(prefix=self._prefix, key=key) - - def __getitem__(self, key): - return self.client[self._key(key)] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.client[self._key(key)] = value - - def __delitem__(self, key): - count = self.client.delete(self._key(key)) - if not count: - raise KeyError(key) - - def keylist(self): - offset = len(self._key("")) # length of prefix - return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] - - def keys(self): - yield from self.keylist() - - def __iter__(self): - yield from self.keys() - - def __len__(self): - return len(self.keylist()) - - def __getstate__(self): - return self._prefix, self._kwargs - - def __setstate__(self, state): - prefix, kwargs = state - self.__init__(prefix=prefix, **kwargs) - - def clear(self): - for key in self.keys(): - del self[key] - - class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_storage.py b/tests/test_storage.py index 3bc6c4bf6a..24d6a03895 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -42,7 +42,6 @@ # MemoryStore, # MongoDBStore, # NestedDirectoryStore, - # RedisStore, # SQLiteStore, # Store, # TempStore, @@ -1725,18 +1724,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # return store -# @skip_test_env_var("ZARR_TEST_REDIS") -# class TestRedisStore(StoreTests): -# def create_store(self, **kwargs): -# # TODO: this is the default host for Redis on Travis, -# # we probably want to generalize this though -# pytest.importorskip("redis") -# store = RedisStore(host="localhost", port=6379, **kwargs) -# # start with an empty store -# store.clear() -# return store - - # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index 3d8024de70..f1450b8d40 100644 --- a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -41,7 +41,6 @@ # LRUStoreCacheV3, # MemoryStoreV3, # MongoDBStoreV3, -# RedisStoreV3, # SQLiteStoreV3, # StoreV3, # ZipStoreV3, @@ -506,18 +505,6 @@ # return store -# @skip_test_env_var("ZARR_TEST_REDIS") -# class TestRedisStoreV3(StoreV3Tests): -# def create_store(self, **kwargs): -# # TODO: this is the default host for Redis on Travis, -# # we probably want to generalize this though -# pytest.importorskip("redis") -# store = RedisStoreV3(host="localhost", port=6379, **kwargs) -# # start with an empty store -# store.clear() -# return store - - # @pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") # class TestStorageTransformerV3(TestMappingStoreV3): # def create_store(self, **kwargs): @@ -663,7 +650,6 @@ # "LRUStoreCacheV3", # "MemoryStoreV3", # "MongoDBStoreV3", -# "RedisStoreV3", # "SQLiteStoreV3", # "ZipStoreV3", # ]: From 3da088c33446cb4f50cc80ffb2425ed8f0247628 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 13:26:44 +0200 Subject: [PATCH 15/22] chore: remove mongo storage --- docs/api/storage.rst | 1 - docs/api/v3.rst | 1 - docs/contributing.rst | 5 --- src/zarr/__init__.py | 1 - src/zarr/storage.py | 89 ---------------------------------------- tests/test_storage.py | 13 ------ 
tests/test_storage_v3.py | 14 ------- 7 files changed, 124 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 713a25a8fa..71b1ae74c2 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -25,7 +25,6 @@ Storage (``zarr.storage``) .. automethod:: close -.. autoclass:: MongoDBStore .. autoclass:: LRUStoreCache .. automethod:: invalidate diff --git a/docs/api/v3.rst b/docs/api/v3.rst index 7e444f876b..7f56b49468 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -50,7 +50,6 @@ You can also check ``Store type`` here (which indicates Zarr V3). .. autoclass:: MemoryStoreV3 .. autoclass:: DirectoryStoreV3 .. autoclass:: ZipStoreV3 -.. autoclass:: MongoDBStoreV3 .. autoclass:: DBMStoreV3 .. autoclass:: LMDBStoreV3 .. autoclass:: SQLiteStoreV3 diff --git a/docs/contributing.rst b/docs/contributing.rst index a65b3d104d..9bc835a9ce 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -168,11 +168,6 @@ storage emulator (e.g., azurite) and set the environment variable docker run --rm -p 10000:10000 mcr.microsoft.com/azure-storage/azurite azurite-blob --loose --blobHost 0.0.0.0 -To run the Mongo DB storage tests, run a Mongo -server locally and set the environment variable ``ZARR_TEST_MONGO=1``. -To run the Redis storage tests, run a Redis server locally on port -6379 and set the environment variable ``ZARR_TEST_REDIS=1``. - All tests are automatically run via GitHub Actions for every pull request and must pass before code can be accepted. Test coverage is also collected automatically via the Codecov service, and total diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 04a94286f7..114fc3aa64 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -39,7 +39,6 @@ LMDBStore, LRUStoreCache, MemoryStore, - MongoDBStore, NestedDirectoryStore, SQLiteStore, TempStore, diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 5301b7cf30..3acb6c0b89 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -80,7 +80,6 @@ ) __doctest_requires__ = { - ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], } @@ -2583,94 +2582,6 @@ def clear(self): ) -class MongoDBStore(Store): - """Storage class using MongoDB. - - .. note:: This is an experimental feature. - - Requires the `pymongo `_ - package to be installed. - - Parameters - ---------- - database : string - Name of database - collection : string - Name of collection - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `pymongo.MongoClient` function. - - Notes - ----- - The maximum chunksize in MongoDB documents is 16 MB. 
- - """ - - _key = "key" - _value = "value" - - def __init__( - self, - database="mongodb_zarr", - collection="zarr_collection", - dimension_separator=None, - **kwargs, - ): - import pymongo - - self._database = database - self._collection = collection - self._dimension_separator = dimension_separator - self._kwargs = kwargs - - self.client = pymongo.MongoClient(**self._kwargs) - self.db = self.client.get_database(self._database) - self.collection = self.db.get_collection(self._collection) - - def __getitem__(self, key): - doc = self.collection.find_one({self._key: key}) - - if doc is None: - raise KeyError(key) - else: - return doc[self._value] - - def __setitem__(self, key, value): - value = ensure_bytes(value) - self.collection.replace_one( - {self._key: key}, {self._key: key, self._value: value}, upsert=True - ) - - def __delitem__(self, key): - result = self.collection.delete_many({self._key: key}) - if not result.deleted_count == 1: - raise KeyError(key) - - def __iter__(self): - for f in self.collection.find({}): - yield f[self._key] - - def __len__(self): - return self.collection.count_documents({}) - - def __getstate__(self): - return self._database, self._collection, self._kwargs - - def __setstate__(self, state): - database, collection, kwargs = state - self.__init__(database=database, collection=collection, **kwargs) - - def close(self): - """Cleanup client resources and disconnect from MongoDB.""" - self.client.close() - - def clear(self): - """Remove all items from store.""" - self.collection.delete_many({}) - - class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_storage.py b/tests/test_storage.py index 24d6a03895..2afda351f1 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -40,7 +40,6 @@ # LMDBStore, # LRUStoreCache, # MemoryStore, - # MongoDBStore, # NestedDirectoryStore, # SQLiteStore, # Store, @@ -1712,18 +1711,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # pickle.dumps(store) -# @skip_test_env_var("ZARR_TEST_MONGO") -# class TestMongoDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("pymongo") -# store = MongoDBStore( -# host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs -# ) -# # start with an empty store -# store.clear() -# return store - - # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index f1450b8d40..82ee1b5b94 100644 --- a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -40,7 +40,6 @@ # LMDBStoreV3, # LRUStoreCacheV3, # MemoryStoreV3, -# MongoDBStoreV3, # SQLiteStoreV3, # StoreV3, # ZipStoreV3, @@ -493,18 +492,6 @@ # return store -# @skip_test_env_var("ZARR_TEST_MONGO") -# class TestMongoDBStoreV3(StoreV3Tests): -# def create_store(self, **kwargs): -# pytest.importorskip("pymongo") -# store = MongoDBStoreV3( -# host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs -# ) -# # start with an empty store -# store.clear() -# return store - - # @pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") # class TestStorageTransformerV3(TestMappingStoreV3): # def create_store(self, **kwargs): @@ -649,7 +636,6 @@ # "LMDBStoreV3", # "LRUStoreCacheV3", # "MemoryStoreV3", -# "MongoDBStoreV3", # "SQLiteStoreV3", # "ZipStoreV3", # ]: From 6e956730f323871a606f4250692c52c3e518c514 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: 
Mon, 22 Apr 2024 13:38:08 +0200 Subject: [PATCH 16/22] chore: remove abs storage --- .github/workflows/windows-testing.yml | 1 - docs/api/storage.rst | 2 - docs/contributing.rst | 7 - docs/tutorial.rst | 17 -- src/zarr/__init__.py | 1 - src/zarr/_storage/absstore.py | 224 -------------------------- src/zarr/storage.py | 1 - tests/test_core.py | 22 +-- tests/test_hierarchy.py | 19 +-- tests/test_storage.py | 73 --------- tests/test_storage_v3.py | 9 -- 11 files changed, 2 insertions(+), 374 deletions(-) delete mode 100644 src/zarr/_storage/absstore.py diff --git a/.github/workflows/windows-testing.yml b/.github/workflows/windows-testing.yml index 78945e97aa..7d36730538 100644 --- a/.github/workflows/windows-testing.yml +++ b/.github/workflows/windows-testing.yml @@ -50,7 +50,6 @@ jobs: azurite -l ~/blob_emulator --debug debug.log 2>&1 > stdouterr.log & pytest -sv --timeout=300 env: - ZARR_TEST_ABS: 1 ZARR_V3_EXPERIMENTAL_API: 1 ZARR_V3_SHARDING: 1 - name: Conda info diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 71b1ae74c2..cc96f0418e 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -31,8 +31,6 @@ Storage (``zarr.storage``) .. automethod:: invalidate_values .. automethod:: invalidate_keys -.. autoclass:: ABSStore - .. autoclass:: FSStore .. autoclass:: ConsolidatedMetadataStore diff --git a/docs/contributing.rst b/docs/contributing.rst index 9bc835a9ce..d078a82d9b 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -161,13 +161,6 @@ optional dependencies to be installed), run:: $ python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst -Note that some tests also require storage services to be running -locally. To run the Azure Blob Service storage tests, run an Azure -storage emulator (e.g., azurite) and set the environment variable -``ZARR_TEST_ABS=1``. If you're using Docker to run azurite, start the service with:: - - docker run --rm -p 10000:10000 mcr.microsoft.com/azure-storage/azurite azurite-blob --loose --blobHost 0.0.0.0 - All tests are automatically run via GitHub Actions for every pull request and must pass before code can be accepted. Test coverage is also collected automatically via the Codecov service, and total diff --git a/docs/tutorial.rst b/docs/tutorial.rst index cfd7dd7e07..2d6d42cffc 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -881,23 +881,6 @@ Here is an example using S3Map to read an array created previously:: >>> z[:].tobytes() b'Hello from the cloud!' -Zarr now also has a builtin storage backend for Azure Blob Storage. -The class is :class:`zarr.storage.ABSStore` (requires -`azure-storage-blob `_ -to be installed):: - - >>> import azure.storage.blob - >>> container_client = azure.storage.blob.ContainerClient(...) # doctest: +SKIP - >>> store = zarr.ABSStore(client=container_client, prefix='zarr-testing') # doctest: +SKIP - >>> root = zarr.group(store=store, overwrite=True) # doctest: +SKIP - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') # doctest: +SKIP - >>> z[:] = 42 # doctest: +SKIP - -When using an actual storage account, provide ``account_name`` and -``account_key`` arguments to :class:`zarr.storage.ABSStore`, the -above client is just testing against the emulator. Please also note -that this is an experimental feature. 
- Note that retrieving data from a remote service via the network can be significantly slower than retrieving data from a local file system, and will depend on network latency and bandwidth between the client and server systems. If you are experiencing poor diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 114fc3aa64..c40b86e7a5 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -31,7 +31,6 @@ from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group from zarr.storage import ( - ABSStore, DBMStore, DictStore, DirectoryStore, diff --git a/src/zarr/_storage/absstore.py b/src/zarr/_storage/absstore.py deleted file mode 100644 index d8e292535c..0000000000 --- a/src/zarr/_storage/absstore.py +++ /dev/null @@ -1,224 +0,0 @@ -"""This module contains storage classes related to Azure Blob Storage (ABS)""" - -import warnings -from numcodecs.compat import ensure_bytes -from zarr.util import normalize_storage_path -from zarr._storage.store import Store - -__doctest_requires__ = { - ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], -} - - -class ABSStore(Store): - """Storage class using Azure Blob Storage (ABS). - - Parameters - ---------- - container : string - The name of the ABS container to use. - - .. deprecated:: - Use ``client`` instead. - - prefix : string - Location of the "directory" to use as the root of the storage hierarchy - within the container. - - account_name : string - The Azure blob storage account name. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - account_key : string - The Azure blob storage account access key. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - blob_service_kwargs : dictionary - Extra arguments to be passed into the azure blob client, for e.g. when - using the emulator, pass in blob_service_kwargs={'is_emulated': True}. - - .. deprecated:: 2.8.3 - Use ``client`` instead. - - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - - client : azure.storage.blob.ContainerClient, optional - And ``azure.storage.blob.ContainerClient`` to connect with. See - `here `_ # noqa - for more. - - .. versionadded:: 2.8.3 - - Notes - ----- - In order to use this store, you must install the Microsoft Azure Storage SDK for Python, - ``azure-storage-blob>=12.5.0``. - """ # noqa: E501 - - def __init__( - self, - container=None, - prefix="", - account_name=None, - account_key=None, - blob_service_kwargs=None, - dimension_separator=None, - client=None, - ): - self._dimension_separator = dimension_separator - self.prefix = normalize_storage_path(prefix) - if client is None: - # deprecated option, try to construct the client for them - msg = ( - "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs'" - "is deprecated. Provide and instance of 'azure.storage.blob.ContainerClient' " - "'client' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - from azure.storage.blob import ContainerClient - - blob_service_kwargs = blob_service_kwargs or {} - client = ContainerClient( - "https://{}.blob.core.windows.net/".format(account_name), - container, - credential=account_key, - **blob_service_kwargs, - ) - - self.client = client - self._container = container - self._account_name = account_name - self._account_key = account_key - - @staticmethod - def _warn_deprecated(property_): - msg = ( - "The {} property is deprecated and will be removed in a future " - "version. Get the property from 'ABSStore.client' instead." 
- ) - warnings.warn(msg.format(property_), FutureWarning, stacklevel=3) - - @property - def container(self): - self._warn_deprecated("container") - return self._container - - @property - def account_name(self): - self._warn_deprecated("account_name") - return self._account_name - - @property - def account_key(self): - self._warn_deprecated("account_key") - return self._account_key - - def _append_path_to_prefix(self, path): - if self.prefix == "": - return normalize_storage_path(path) - else: - return "/".join([self.prefix, normalize_storage_path(path)]) - - @staticmethod - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - if prefix: - return path_norm[(len(prefix_norm) + 1) :] - else: - return path_norm - - def __getitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - blob_name = self._append_path_to_prefix(key) - try: - return self.client.download_blob(blob_name).readall() - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % blob_name) - - def __setitem__(self, key, value): - value = ensure_bytes(value) - blob_name = self._append_path_to_prefix(key) - self.client.upload_blob(blob_name, value, overwrite=True) - - def __delitem__(self, key): - from azure.core.exceptions import ResourceNotFoundError - - try: - self.client.delete_blob(self._append_path_to_prefix(key)) - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % key) - - def __eq__(self, other): - return ( - isinstance(other, ABSStore) - and self.client == other.client - and self.prefix == other.prefix - ) - - def keys(self): - return list(self.__iter__()) - - def __iter__(self): - if self.prefix: - list_blobs_prefix = self.prefix + "/" - else: - list_blobs_prefix = None - for blob in self.client.list_blobs(list_blobs_prefix): - yield self._strip_prefix_from_path(blob.name, self.prefix) - - def __len__(self): - return len(self.keys()) - - def __contains__(self, key): - blob_name = self._append_path_to_prefix(key) - return self.client.get_blob_client(blob_name).exists() - - def listdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - items = [ - self._strip_prefix_from_path(blob.name, dir_path) - for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") - ] - return items - - def rmdir(self, path=None): - dir_path = normalize_storage_path(self._append_path_to_prefix(path)) - if dir_path: - dir_path += "/" - for blob in self.client.list_blobs(name_starts_with=dir_path): - self.client.delete_blob(blob) - - def getsize(self, path=None): - store_path = normalize_storage_path(path) - fs_path = self._append_path_to_prefix(store_path) - if fs_path: - blob_client = self.client.get_blob_client(fs_path) - else: - blob_client = None - - if blob_client and blob_client.exists(): - return blob_client.get_blob_properties().size - else: - size = 0 - if fs_path == "": - fs_path = None - elif not fs_path.endswith("/"): - fs_path += "/" - for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): - blob_client = self.client.get_blob_client(blob) - if blob_client.exists(): - size += blob_client.get_blob_properties().size - return size - - def clear(self): - self.rmdir() diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 3acb6c0b89..424811fcf5 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -63,7 +63,6 @@ 
ensure_contiguous_ndarray_or_bytes, ) -from zarr._storage.absstore import ABSStore # noqa: F401 from zarr._storage.store import ( # noqa: F401 _listdir_from_keys, _rename_from_keys, diff --git a/tests/test_core.py b/tests/test_core.py index 6b2095c0c9..4f83aad0ff 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,5 @@ import atexit import os -import sys import pickle import shutil from typing import Any, Literal, Optional, Tuple, Union @@ -38,7 +37,6 @@ from zarr.core import Array from zarr.meta import json_loads from zarr.storage import ( - ABSStore, DBMStore, DirectoryStore, FSStore, @@ -55,7 +53,7 @@ ) from zarr.util import buffer_size -from .util import abs_container, skip_test_env_var, have_fsspec, mktemp +from .util import have_fsspec, mktemp # noinspection PyMethodMayBeStatic @@ -1654,24 +1652,6 @@ def test_array_init_from_dict(): assert isinstance(a.store, KVStore) -@skip_test_env_var("ZARR_TEST_ABS") -class TestArrayWithABSStore(TestArray): - def create_store(self): - client = abs_container() - store = ABSStore(client=client) - store.rmdir() - return store - - @pytest.mark.xfail - def test_nbytes_stored(self): - return super().test_nbytes_stored() - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): def create_store(self): path = mkdtemp() diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 8cd51cc940..0c101f6f26 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -1,6 +1,5 @@ import atexit import os -import sys import pickle import shutil import tempfile @@ -23,7 +22,6 @@ from zarr.creation import open_array from zarr.hierarchy import Group, group, open_group from zarr.storage import ( - ABSStore, DBMStore, KVStore, DirectoryStore, @@ -43,7 +41,7 @@ ) from zarr.util import InfoReporter -from .util import skip_test_env_var, have_fsspec, abs_container, mktemp +from .util import have_fsspec, mktemp # noinspection PyStatementEffect @@ -1029,21 +1027,6 @@ def create_store(): return store, None -@skip_test_env_var("ZARR_TEST_ABS") -class TestGroupWithABSStore(TestGroup): - @staticmethod - def create_store(): - container_client = abs_container() - store = ABSStore(client=container_client) - store.rmdir() - return store, None - - @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") - def test_pickle(self): - # internal attribute on ContainerClient isn't serializable for py36 and earlier - super().test_pickle() - - class TestGroupWithNestedDirectoryStore(TestGroup): @staticmethod def create_store(): diff --git a/tests/test_storage.py b/tests/test_storage.py index 2afda351f1..3c16ae3ead 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -31,7 +31,6 @@ # from zarr.meta import ZARR_FORMAT, decode_array_metadata from zarr.storage import ( - # ABSStore, # ConsolidatedMetadataStore, # DBMStore, # DictStore, @@ -2124,78 +2123,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # assert compressor.get_config() == z.compressor.get_config() -# @skip_test_env_var("ZARR_TEST_ABS") -# class TestABSStore(StoreTests): - -# ABSStoreClass = ABSStore - -# def create_store(self, prefix=None, **kwargs): -# container_client = abs_container() -# store = self.ABSStoreClass( -# prefix=prefix, -# client=container_client, -# **kwargs, -# ) -# 
store.rmdir() -# return store - -# def test_non_client_deprecated(self): -# with pytest.warns(FutureWarning, match="Providing"): -# store = self.ABSStoreClass( -# "container", account_name="account_name", account_key="account_key" -# ) - -# for attr in ["container", "account_name", "account_key"]: -# with pytest.warns(FutureWarning, match=attr): -# result = getattr(store, attr) -# assert result == attr - -# def test_iterators_with_prefix(self): -# prefixes = ["test_prefix", "/test_prefix", "test_prefix/", "test/prefix"] - -# if self.version < 3: -# # empty prefix not allowed in v3 -# prefixes += ["", None] - -# for prefix in prefixes: -# store = self.create_store(prefix=prefix) - -# # test iterator methods on empty store -# assert 0 == len(store) -# assert set() == set(store) -# assert set() == set(store.keys()) -# assert set() == set(store.values()) -# assert set() == set(store.items()) - -# prefix = meta_root if self.version > 2 else "" -# # setup some values -# store[prefix + "a"] = b"aaa" -# store[prefix + "b"] = b"bbb" -# store[prefix + "c/d"] = b"ddd" -# store[prefix + "c/e/f"] = b"fff" - -# # test iterators on store with data -# assert 4 == len(store) -# keys = [prefix + "a", prefix + "b", prefix + "c/d", prefix + "c/e/f"] -# values = [b"aaa", b"bbb", b"ddd", b"fff"] -# items = list(zip(keys, values)) -# assert set(keys) == set(store) -# assert set(keys) == set(store.keys()) -# assert set(values) == set(store.values()) -# assert set(items) == set(store.items()) - -# def test_getsize(self): -# return super().test_getsize() - -# def test_hierarchy(self): -# return super().test_hierarchy() - -# @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") -# def test_pickle(self): -# # internal attribute on ContainerClient isn't serializable for py36 and earlier -# super().test_pickle() - - # class TestConsolidatedMetadataStore: # version = 2 diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index 82ee1b5b94..2c78cbe99e 100644 --- a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -30,7 +30,6 @@ # normalize_store_arg, # ) # from zarr._storage.v3 import ( -# ABSStoreV3, # ConsolidatedMetadataStoreV3, # DBMStoreV3, # DirectoryStoreV3, @@ -48,7 +47,6 @@ # # pytest will fail to run if the following fixtures aren't imported here # from .test_storage import StoreTests as _StoreTests -# from .test_storage import TestABSStore as _TestABSStore # from .test_storage import TestConsolidatedMetadataStore as _TestConsolidatedMetadataStore # from .test_storage import TestDBMStore as _TestDBMStore # from .test_storage import TestDBMStoreBerkeleyDB as _TestDBMStoreBerkeleyDB @@ -540,12 +538,6 @@ # LRUStoreClass = LRUStoreCacheV3 -# @skip_test_env_var("ZARR_TEST_ABS") -# class TestABSStoreV3(_TestABSStore, StoreV3Tests): - -# ABSStoreClass = ABSStoreV3 - - # def test_normalize_store_arg_v3(tmpdir): # fn = tmpdir.join("store.zip") @@ -629,7 +621,6 @@ # def test_top_level_imports(): # for store_name in [ -# "ABSStoreV3", # "DBMStoreV3", # "KVStoreV3", # "DirectoryStoreV3", From fb7629fd597f33413561e395c82d037fd22043ed Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 13:43:35 +0200 Subject: [PATCH 17/22] chore: remove lmdb storage --- docs/api/storage.rst | 5 - docs/api/v3.rst | 1 - docs/tutorial.rst | 10 -- notebooks/store_benchmark.ipynb | 2 - src/zarr/__init__.py | 1 - src/zarr/storage.py | 178 -------------------------------- tests/test_core.py | 28 ----- tests/test_hierarchy.py | 11 -- tests/test_storage.py | 17 --- 
tests/test_storage_v3.py | 13 --- 10 files changed, 266 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index cc96f0418e..4d8fd774e9 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -16,11 +16,6 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush -.. autoclass:: LMDBStore - - .. automethod:: close - .. automethod:: flush - .. autoclass:: SQLiteStore .. automethod:: close diff --git a/docs/api/v3.rst b/docs/api/v3.rst index 7f56b49468..8d1c006952 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -51,7 +51,6 @@ You can also check ``Store type`` here (which indicates Zarr V3). .. autoclass:: DirectoryStoreV3 .. autoclass:: ZipStoreV3 .. autoclass:: DBMStoreV3 -.. autoclass:: LMDBStoreV3 .. autoclass:: SQLiteStoreV3 .. autoclass:: LRUStoreCacheV3 .. autoclass:: ConsolidatedMetadataStoreV3 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 2d6d42cffc..4b6938efde 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -818,16 +818,6 @@ database for storage (requires `bsddb3 >>> z[:] = 42 >>> store.close() -Also added in Zarr version 2.2 is the :class:`zarr.storage.LMDBStore` class which -enables the lightning memory-mapped database (LMDB) to be used for storing an array or -group (requires `lmdb `_ to be installed):: - - >>> store = zarr.LMDBStore('data/example.lmdb') - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - In Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class which enables the SQLite database to be used for storing an array or group (requires Python is built with SQLite support):: diff --git a/notebooks/store_benchmark.ipynb b/notebooks/store_benchmark.ipynb index 869e7df608..42128a834b 100644 --- a/notebooks/store_benchmark.ipynb +++ b/notebooks/store_benchmark.ipynb @@ -119,7 +119,6 @@ " clean()\n", " fdict_root = zarr.group(store=dict())\n", " hdict_root = zarr.group(store=zarr.DictStore())\n", - " lmdb_root = zarr.group(store=zarr.LMDBStore(os.path.join(bench_dir, 'lmdb')))\n", " gdbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'gdbm'), open=dbm.gnu.open))\n", " ndbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'ndbm'), open=dbm.ndbm.open))\n", " bdbm_btree_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdbm_btree'), open=bsddb3.btopen))\n", @@ -129,7 +128,6 @@ "\n", " fdict_z = fdict_root.empty_like(name, a)\n", " hdict_z = hdict_root.empty_like(name, a)\n", - " lmdb_z = lmdb_root.empty_like(name, a)\n", " gdbm_z = gdbm_root.empty_like(name, a)\n", " ndbm_z = ndbm_root.empty_like(name, a)\n", " bdbm_btree_z = bdbm_btree_root.empty_like(name, a)\n", diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index c40b86e7a5..387464d589 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -35,7 +35,6 @@ DictStore, DirectoryStore, KVStore, - LMDBStore, LRUStoreCache, MemoryStore, NestedDirectoryStore, diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 424811fcf5..452f59e56a 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -17,12 +17,10 @@ import atexit import errno import glob -import multiprocessing import operator import os import re import shutil -import sys import tempfile import warnings import zipfile @@ -2010,182 +2008,6 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys(self, path) -class LMDBStore(Store): - """Storage class using LMDB. 
Requires the `lmdb `_ - package to be installed. - - - Parameters - ---------- - path : string - Location of database file. - buffers : bool, optional - If True (default) use support for buffers, which should increase performance by - reducing memory copies. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `lmdb.open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.LMDBStore('data/array.mdb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.LMDBStore('data/group.mdb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.LMDBStore('data/array.mdb') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - Notes - ----- - By default writes are not immediately flushed to disk to increase performance. You - can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods. - - Should be safe to write in multiple threads or processes due to the synchronization - support within LMDB, although writing from multiple processes has not been tested. 
- - """ - - def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): - import lmdb - - # set default memory map size to something larger than the lmdb default, which is - # very likely to be too small for any moderate array (logic copied from zict) - map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 - kwargs.setdefault("map_size", map_size) - - # don't initialize buffers to zero by default, shouldn't be necessary - kwargs.setdefault("meminit", False) - - # decide whether to use the writemap option based on the operating system's - # support for sparse files - writemap requires sparse file support otherwise - # the whole# `map_size` may be reserved up front on disk (logic copied from zict) - writemap = sys.platform.startswith("linux") - kwargs.setdefault("writemap", writemap) - - # decide options for when data are flushed to disk - choose to delay syncing - # data to filesystem, otherwise pay a large performance penalty (zict also does - # this) - kwargs.setdefault("metasync", False) - kwargs.setdefault("sync", False) - kwargs.setdefault("map_async", False) - - # set default option for number of cached transactions - max_spare_txns = multiprocessing.cpu_count() - kwargs.setdefault("max_spare_txns", max_spare_txns) - - # normalize path - path = os.path.abspath(path) - - # open database - self.db = lmdb.open(path, **kwargs) - - # store properties - self.buffers = buffers - self.path = path - self.kwargs = kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # just in case - except Exception: - # flush may fail if db has already been closed - pass - return self.path, self.buffers, self.kwargs - - def __setstate__(self, state): - path, buffers, kwargs = state - self.__init__(path=path, buffers=buffers, **kwargs) - - def close(self): - """Closes the underlying database.""" - self.db.close() - - def flush(self): - """Synchronizes data to the file system.""" - self.db.sync() - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - # use the buffers option, should avoid a memory copy - with self.db.begin(buffers=self.buffers) as txn: - value = txn.get(key) - if value is None: - raise KeyError(key) - return value - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True, buffers=self.buffers) as txn: - txn.put(key, value) - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(write=True) as txn: - if not txn.delete(key): - raise KeyError(key) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - return cursor.set_key(key) - - def items(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k, v in cursor.iternext(keys=True, values=True): - yield ensure_text(k, "ascii"), v - - def keys(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - for k in cursor.iternext(keys=True, values=False): - yield ensure_text(k, "ascii") - - def values(self): - with self.db.begin(buffers=self.buffers) as txn: - with txn.cursor() as cursor: - yield from cursor.iternext(keys=False, values=True) - - def __iter__(self): - return self.keys() - - def __len__(self): - return self.db.stat()["entries"] - - class 
LRUStoreCache(Store): """Storage class that implements a least-recently-used (LRU) cache layer over some other store. Intended primarily for use with stores that can be slow to diff --git a/tests/test_core.py b/tests/test_core.py index 4f83aad0ff..3f979a2a84 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -41,7 +41,6 @@ DirectoryStore, FSStore, KVStore, - LMDBStore, LRUStoreCache, NestedDirectoryStore, SQLiteStore, @@ -1693,33 +1692,6 @@ def test_nbytes_stored(self): pass # not implemented -class TestArrayWithLMDBStore(TestArray): - def create_store(self): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStore(path, buffers=True) - return store - - def test_store_has_bytes_values(self): - pass # returns values as memoryviews/buffers instead of bytes - - def test_nbytes_stored(self): - pass # not implemented - - -class TestArrayWithLMDBStoreNoBuffers(TestArray): - def create_store(self): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStore(path, buffers=False) - return store - - def test_nbytes_stored(self): - pass # not implemented - - class TestArrayWithSQLiteStore(TestArray): def create_store(self): pytest.importorskip("sqlite3") diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 0c101f6f26..0ce3422eeb 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -26,7 +26,6 @@ KVStore, DirectoryStore, FSStore, - LMDBStore, LRUStoreCache, MemoryStore, NestedDirectoryStore, @@ -1124,16 +1123,6 @@ def create_store(): return store, None -class TestGroupWithLMDBStore(TestGroup): - @staticmethod - def create_store(): - pytest.importorskip("lmdb") - path = mktemp(suffix=".lmdb") - atexit.register(atexit_rmtree, path) - store = LMDBStore(path) - return store, None - - class TestGroupWithSQLiteStore(TestGroup): def create_store(self): pytest.importorskip("sqlite3") diff --git a/tests/test_storage.py b/tests/test_storage.py index 3c16ae3ead..69755bf4b9 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -36,7 +36,6 @@ # DictStore, # DirectoryStore, # KVStore, - # LMDBStore, # LRUStoreCache, # MemoryStore, # NestedDirectoryStore, @@ -1658,22 +1657,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # return store -# class TestLMDBStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("lmdb") -# path = mktemp(suffix=".lmdb") -# atexit.register(atexit_rmtree, path) -# buffers = True -# store = LMDBStore(path, buffers=buffers, **kwargs) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - # class TestSQLiteStore(StoreTests): # def create_store(self, **kwargs): # pytest.importorskip("sqlite3") diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index 2c78cbe99e..c0cf6531e8 100644 --- a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -36,7 +36,6 @@ # FSStoreV3, # KVStore, # KVStoreV3, -# LMDBStoreV3, # LRUStoreCacheV3, # MemoryStoreV3, # SQLiteStoreV3, @@ -55,7 +54,6 @@ # from .test_storage import TestDBMStoreNDBM as _TestDBMStoreNDBM # from .test_storage import TestDirectoryStore as _TestDirectoryStore # from .test_storage import TestFSStore as _TestFSStore -# from .test_storage import TestLMDBStore as _TestLMDBStore # from .test_storage import TestLRUStoreCache as _TestLRUStoreCache # from 
.test_storage import TestMemoryStore as _TestMemoryStore # from .test_storage import TestSQLiteStore as _TestSQLiteStore @@ -464,16 +462,6 @@ # return store -# class TestLMDBStoreV3(_TestLMDBStore, StoreV3Tests): -# def create_store(self, **kwargs): -# pytest.importorskip("lmdb") -# path = mktemp(suffix=".lmdb") -# atexit.register(atexit_rmtree, path) -# buffers = True -# store = LMDBStoreV3(path, buffers=buffers, **kwargs) -# return store - - # class TestSQLiteStoreV3(_TestSQLiteStore, StoreV3Tests): # def create_store(self, **kwargs): # pytest.importorskip("sqlite3") @@ -624,7 +612,6 @@ # "DBMStoreV3", # "KVStoreV3", # "DirectoryStoreV3", -# "LMDBStoreV3", # "LRUStoreCacheV3", # "MemoryStoreV3", # "SQLiteStoreV3", From 6a8a92c27249af755e1f64b70a225682fbc772ad Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 13:51:50 +0200 Subject: [PATCH 18/22] chore: remove dbm storage --- docs/api/storage.rst | 5 - docs/api/v3.rst | 1 - docs/tutorial.rst | 17 +- notebooks/store_benchmark.ipynb | 585 -------------------------------- src/zarr/__init__.py | 1 - src/zarr/storage.py | 207 +---------- tests/test_core.py | 27 -- tests/test_hierarchy.py | 21 -- tests/test_storage.py | 56 --- tests/test_storage_v3.py | 56 --- 10 files changed, 3 insertions(+), 973 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 4d8fd774e9..9a4c9c7b4a 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -11,11 +11,6 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush -.. autoclass:: DBMStore - - .. automethod:: close - .. automethod:: flush - .. autoclass:: SQLiteStore .. automethod:: close diff --git a/docs/api/v3.rst b/docs/api/v3.rst index 8d1c006952..b880c1859c 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -50,7 +50,6 @@ You can also check ``Store type`` here (which indicates Zarr V3). .. autoclass:: MemoryStoreV3 .. autoclass:: DirectoryStoreV3 .. autoclass:: ZipStoreV3 -.. autoclass:: DBMStoreV3 .. autoclass:: SQLiteStoreV3 .. autoclass:: LRUStoreCacheV3 .. autoclass:: ConsolidatedMetadataStoreV3 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 4b6938efde..95052f3667 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -805,19 +805,6 @@ boundaries. Note also that the ``close()`` method must be called after writing any data to the store, otherwise essential records will not be written to the underlying zip file. -Another storage alternative is the :class:`zarr.storage.DBMStore` class, added -in Zarr version 2.2. This class allows any DBM-style database to be used for -storing an array or group. Here is an example using a Berkeley DB B-tree -database for storage (requires `bsddb3 -`_ to be installed):: - - >>> import bsddb3 - >>> store = zarr.DBMStore('data/example.bdb', open=bsddb3.btopen) - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - In Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class which enables the SQLite database to be used for storing an array or group (requires Python is built with SQLite support):: @@ -1542,8 +1529,8 @@ storage. Note that if an array or group is backed by an in-memory store like a ``dict`` or :class:`zarr.storage.MemoryStore`, then when it is pickled all of the store data will be included in the pickled data. 
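For example, a minimal illustrative sketch of this behaviour (the shape and chunking below are arbitrary, and ``MemoryStore`` stands in for any in-memory store; this snippet assumes the zarr v2 creation API shown elsewhere in this tutorial)::

    >>> import pickle
    >>> import numpy as np
    >>> import zarr
    >>> z = zarr.zeros((100, 100), chunks=(10, 10), store=zarr.MemoryStore())
    >>> z[:] = 42
    >>> z2 = pickle.loads(pickle.dumps(z))  # the chunk data travel inside the pickle
    >>> np.array_equal(z[:], z2[:])
    True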
However, if an array or group is backed by a persistent -store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore` or -:class:`zarr.storage.DBMStore` then the store data **are not** pickled. The only thing +store like a :class:`zarr.storage.DirectoryStore` or :class:`zarr.storage.ZipStore` +then the store data **are not** pickled. The only thing that is pickled is the necessary parameters to allow the store to re-open any underlying files or databases upon being unpickled. diff --git a/notebooks/store_benchmark.ipynb b/notebooks/store_benchmark.ipynb index 42128a834b..014f895c3e 100644 --- a/notebooks/store_benchmark.ipynb +++ b/notebooks/store_benchmark.ipynb @@ -35,48 +35,6 @@ "zarr.__version__" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'6.2.5'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bsddb3\n", - "bsddb3.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.93'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import lmdb\n", - "lmdb.__version__" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -86,16 +44,6 @@ "import numpy as np" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import dbm.gnu\n", - "import dbm.ndbm" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -119,19 +67,11 @@ " clean()\n", " fdict_root = zarr.group(store=dict())\n", " hdict_root = zarr.group(store=zarr.DictStore())\n", - " gdbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'gdbm'), open=dbm.gnu.open))\n", - " ndbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'ndbm'), open=dbm.ndbm.open))\n", - " bdbm_btree_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdbm_btree'), open=bsddb3.btopen))\n", - " bdbm_hash_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdbm_hash'), open=bsddb3.hashopen))\n", " zip_root = zarr.group(store=zarr.ZipStore(os.path.join(bench_dir, 'zip'), mode='w'))\n", " dir_root = zarr.group(store=zarr.DirectoryStore(os.path.join(bench_dir, 'dir')))\n", "\n", " fdict_z = fdict_root.empty_like(name, a)\n", " hdict_z = hdict_root.empty_like(name, a)\n", - " gdbm_z = gdbm_root.empty_like(name, a)\n", - " ndbm_z = ndbm_root.empty_like(name, a)\n", - " bdbm_btree_z = bdbm_btree_root.empty_like(name, a)\n", - " bdbm_hash_z = bdbm_hash_root.empty_like(name, a)\n", " zip_z = zip_root.empty_like(name, a)\n", " dir_z = dir_root.empty_like(name, a)\n", "\n", @@ -252,91 +192,6 @@ "%timeit save(a, hdict_z)" ] }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "316 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit save(a, lmdb_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "938 ms ± 111 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit save(a, gdbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "406 ms ± 8.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit save(a, ndbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.43 s ± 156 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit save(a, bdbm_btree_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.24 s ± 260 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit save(a, bdbm_hash_z)" - ] - }, { "cell_type": "code", "execution_count": 16, @@ -414,91 +269,6 @@ "%timeit load(hdict_z, a)" ] }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "429 ms ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit load(lmdb_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "459 ms ± 10 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit load(gdbm_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "473 ms ± 5.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit load(ndbm_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "504 ms ± 8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit load(bdbm_btree_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "519 ms ± 9.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit load(bdbm_hash_z, a)" - ] - }, { "cell_type": "code", "execution_count": 25, @@ -618,91 +388,6 @@ "%timeit -r3 save(a, hdict_z)" ] }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "846 ms ± 24 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 save(a, lmdb_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6.35 s ± 785 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 save(a, gdbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4.62 s ± 1.09 s per loop (mean ± std. dev. 
of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 save(a, ndbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.84 s ± 1.66 s per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 save(a, bdbm_btree_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6.49 s ± 808 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 save(a, bdbm_hash_z)" - ] - }, { "cell_type": "code", "execution_count": 36, @@ -778,91 +463,6 @@ "%timeit -r3 load(hdict_z, a)" ] }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "532 ms ± 16.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 load(lmdb_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.2 s ± 10.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 load(gdbm_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.18 s ± 13.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 load(ndbm_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.59 s ± 16.7 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 load(bdbm_btree_z, a)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.61 s ± 7.31 ms per loop (mean ± std. dev. 
of 3 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit -r3 load(bdbm_hash_z, a)" - ] - }, { "cell_type": "code", "execution_count": 46, @@ -978,96 +578,6 @@ "%time dask_op(hdict_z, fdict_z)" ] }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 15.1 s, sys: 524 ms, total: 15.6 s\n", - "Wall time: 3.02 s\n" - ] - } - ], - "source": [ - "%time dask_op(lmdb_z, fdict_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 16.5 s, sys: 712 ms, total: 17.2 s\n", - "Wall time: 3.13 s\n" - ] - } - ], - "source": [ - "%time dask_op(gdbm_z, fdict_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 16.3 s, sys: 604 ms, total: 16.9 s\n", - "Wall time: 3.22 s\n" - ] - } - ], - "source": [ - "%time dask_op(ndbm_z, fdict_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 19.6 s, sys: 1.24 s, total: 20.9 s\n", - "Wall time: 3.27 s\n" - ] - } - ], - "source": [ - "%time dask_op(bdbm_btree_z, fdict_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 20.3 s, sys: 1.08 s, total: 21.4 s\n", - "Wall time: 3.53 s\n" - ] - } - ], - "source": [ - "%time dask_op(bdbm_hash_z, fdict_z)" - ] - }, { "cell_type": "code", "execution_count": 83, @@ -1129,96 +639,6 @@ "%time dask_op(fdict_z, hdict_z)" ] }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 16.2 s, sys: 1.6 s, total: 17.8 s\n", - "Wall time: 2.71 s\n" - ] - } - ], - "source": [ - "%time dask_op(fdict_z, lmdb_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 16.8 s, sys: 3.05 s, total: 19.8 s\n", - "Wall time: 8.01 s\n" - ] - } - ], - "source": [ - "%time dask_op(fdict_z, gdbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 17.9 s, sys: 3.01 s, total: 20.9 s\n", - "Wall time: 5.46 s\n" - ] - } - ], - "source": [ - "%time dask_op(fdict_z, ndbm_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 13.8 s, sys: 3.39 s, total: 17.2 s\n", - "Wall time: 7.87 s\n" - ] - } - ], - "source": [ - "%time dask_op(fdict_z, bdbm_btree_z)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 13.9 s, sys: 3.27 s, total: 17.2 s\n", - "Wall time: 6.73 s\n" - ] - } - ], - "source": [ - "%time dask_op(fdict_z, bdbm_hash_z)" - ] - }, { "cell_type": "code", "execution_count": 57, @@ -1261,11 +681,6 @@ "metadata": {}, "outputs": [], "source": [ - "lmdb_z.store.close()\n", - "gdbm_z.store.close()\n", - "ndbm_z.store.close()\n", - 
"bdbm_btree_z.store.close()\n", - "bdbm_hash_z.store.close()\n", "zip_z.store.close()" ] }, diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 387464d589..1da0f95ce7 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -31,7 +31,6 @@ from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group from zarr.storage import ( - DBMStore, DictStore, DirectoryStore, KVStore, diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 452f59e56a..61686fa152 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -33,7 +33,7 @@ import uuid import time -from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context @@ -49,7 +49,6 @@ from zarr.util import ( buffer_size, json_loads, - nolock, normalize_chunks, normalize_dimension_separator, normalize_dtype, @@ -1804,210 +1803,6 @@ def migrate_1to2(store): del store["attrs"] -# noinspection PyShadowingBuiltins -class DBMStore(Store): - """Storage class using a DBM-style database. - - Parameters - ---------- - path : string - Location of database file. - flag : string, optional - Flags for opening the database file. - mode : int - File mode used if a new file is created. - open : function, optional - Function to open the database file. If not provided, :func:`dbm.open` will be - used on Python 3, and :func:`anydbm.open` will be used on Python 2. - write_lock: bool, optional - Use a lock to prevent concurrent writes from multiple threads (True by default). - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk.e - **open_kwargs - Keyword arguments to pass the `open` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.DBMStore('data/array.db') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.DBMStore('data/group.db') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] = 42 - >>> store.close() # don't forget to call this when you're done - - After modifying a DBMStore, the ``close()`` method must be called, otherwise - essential data may not be written to the underlying database file. The - DBMStore class also supports the context manager protocol, which ensures the - ``close()`` method is called on leaving the context, e.g.:: - - >>> with zarr.DBMStore('data/array.db') as store: - ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - ... z[...] = 42 - ... # no need to call store.close() - - A different database library can be used by passing a different function to - the `open` parameter. For example, if the `bsddb3 - `_ package is installed, a - Berkeley DB database can be used:: - - >>> import bsddb3 - >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen) - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() - - Notes - ----- - Please note that, by default, this class will use the Python standard - library `dbm.open` function to open the database file (or `anydbm.open` on - Python 2). 
There are up to three different implementations of DBM-style - databases available in any Python installation, and which one is used may - vary from one system to another. Database file formats are not compatible - between these different implementations. Also, some implementations are - more efficient than others. In particular, the "dumb" implementation will be - the fall-back on many systems, and has very poor performance for some usage - scenarios. If you want to ensure a specific implementation is used, pass the - corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM - library. - - Safe to write in multiple threads. May be safe to write in multiple processes, - depending on which DBM implementation is being used, although this has not been - tested. - - """ - - def __init__( - self, - path, - flag="c", - mode=0o666, - open=None, - write_lock=True, - dimension_separator=None, - **open_kwargs, - ): - if open is None: - import dbm - - open = dbm.open - path = os.path.abspath(path) - # noinspection PyArgumentList - self.db = open(path, flag, mode, **open_kwargs) - self.path = path - self.flag = flag - self.mode = mode - self.open = open - self.write_lock = write_lock - if write_lock: - # This may not be required as some dbm implementations manage their own - # locks, but err on the side of caution. - self.write_mutex = Lock() - else: - self.write_mutex = nolock - self.open_kwargs = open_kwargs - self._dimension_separator = dimension_separator - - def __getstate__(self): - try: - self.flush() # needed for ndbm - except Exception: - # flush may fail if db has already been closed - pass - return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) - - def __setstate__(self, state): - path, flag, mode, open, write_lock, open_kws = state - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) - - def close(self): - """Closes the underlying database file.""" - if hasattr(self.db, "close"): - with self.write_mutex: - self.db.close() - - def flush(self): - """Synchronizes data to the underlying database file.""" - if self.flag[0] != "r": - with self.write_mutex: - if hasattr(self.db, "sync"): - self.db.sync() - else: # pragma: no cover - # we don't cover this branch anymore as ndbm (oracle) is not packaged - # by conda-forge on non-mac OS: - # https://github.com/conda-forge/staged-recipes/issues/4476 - # fall-back, close and re-open, needed for ndbm - flag = self.flag - if flag[0] == "n": - flag = "c" + flag[1:] # don't clobber an existing database - self.db.close() - # noinspection PyArgumentList - self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __getitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return self.db[key] - - def __setitem__(self, key, value): - if isinstance(key, str): - key = key.encode("ascii") - value = ensure_bytes(value) - with self.write_mutex: - self.db[key] = value - - def __delitem__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - with self.write_mutex: - del self.db[key] - - def __eq__(self, other): - return ( - isinstance(other, DBMStore) - and self.path == other.path - and - # allow flag and mode to differ - self.open == other.open - and self.open_kwargs == other.open_kwargs - ) - - def keys(self): - return (ensure_text(k, "ascii") for k in 
iter(self.db.keys())) - - def __iter__(self): - return self.keys() - - def __len__(self): - return sum(1 for _ in self.keys()) - - def __contains__(self, key): - if isinstance(key, str): - key = key.encode("ascii") - return key in self.db - - def rmdir(self, path: str = "") -> None: - path = normalize_storage_path(path) - _rmdir_from_keys(self, path) - - class LRUStoreCache(Store): """Storage class that implements a least-recently-used (LRU) cache layer over some other store. Intended primarily for use with stores that can be slow to diff --git a/tests/test_core.py b/tests/test_core.py index 3f979a2a84..8f64be81be 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,5 +1,4 @@ import atexit -import os import pickle import shutil from typing import Any, Literal, Optional, Tuple, Union @@ -37,14 +36,12 @@ from zarr.core import Array from zarr.meta import json_loads from zarr.storage import ( - DBMStore, DirectoryStore, FSStore, KVStore, LRUStoreCache, NestedDirectoryStore, SQLiteStore, - atexit_rmglob, atexit_rmtree, init_array, init_group, @@ -1668,30 +1665,6 @@ def expected(self): ] -class TestArrayWithDBMStore(TestArray): - def create_store(self): - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - store = DBMStore(path, flag="n") - return store - - def test_nbytes_stored(self): - pass # not implemented - - -@pytest.mark.skip(reason="can't get bsddb3 to work on CI right now") -class TestArrayWithDBMStoreBerkeleyDB(TestArray): - def create_store(self): - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStore(path, flag="n", open=bsddb3.btopen) - return store - - def test_nbytes_stored(self): - pass # not implemented - - class TestArrayWithSQLiteStore(TestArray): def create_store(self): pytest.importorskip("sqlite3") diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 0ce3422eeb..5e36288413 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -22,7 +22,6 @@ from zarr.creation import open_array from zarr.hierarchy import Group, group, open_group from zarr.storage import ( - DBMStore, KVStore, DirectoryStore, FSStore, @@ -32,7 +31,6 @@ SQLiteStore, ZipStore, array_meta_key, - atexit_rmglob, atexit_rmtree, group_meta_key, init_array, @@ -1104,25 +1102,6 @@ def test_move(self): pass -class TestGroupWithDBMStore(TestGroup): - @staticmethod - def create_store(): - path = mktemp(suffix=".anydbm") - atexit.register(atexit_rmglob, path + "*") - store = DBMStore(path, flag="n") - return store, None - - -class TestGroupWithDBMStoreBerkeleyDB(TestGroup): - @staticmethod - def create_store(): - bsddb3 = pytest.importorskip("bsddb3") - path = mktemp(suffix=".dbm") - atexit.register(os.remove, path) - store = DBMStore(path, flag="n", open=bsddb3.btopen) - return store, None - - class TestGroupWithSQLiteStore(TestGroup): def create_store(self): pytest.importorskip("sqlite3") diff --git a/tests/test_storage.py b/tests/test_storage.py index 69755bf4b9..7895955376 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -32,7 +32,6 @@ from zarr.storage import ( # ConsolidatedMetadataStore, - # DBMStore, # DictStore, # DirectoryStore, # KVStore, @@ -1602,61 +1601,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # assert np.array_equiv(y, x) -# class TestDBMStore(StoreTests): -# def create_store(self, dimension_separator=None): -# path = mktemp(suffix=".anydbm") -# atexit.register(atexit_rmglob, path + "*") -# # create store 
using default dbm implementation -# store = DBMStore(path, flag="n", dimension_separator=dimension_separator) -# return store - -# def test_context_manager(self): -# with self.create_store() as store: -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"qux" -# assert 2 == len(store) - - -# class TestDBMStoreDumb(TestDBMStore): -# def create_store(self, **kwargs): -# path = mktemp(suffix=".dumbdbm") -# atexit.register(atexit_rmglob, path + "*") - -# import dbm.dumb as dumbdbm - -# store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) -# return store - - -# class TestDBMStoreGnu(TestDBMStore): -# def create_store(self, **kwargs): -# gdbm = pytest.importorskip("dbm.gnu") -# path = mktemp(suffix=".gdbm") # pragma: no cover -# atexit.register(os.remove, path) # pragma: no cover -# store = DBMStore( -# path, flag="n", open=gdbm.open, write_lock=False, **kwargs -# ) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreNDBM(TestDBMStore): -# def create_store(self, **kwargs): -# ndbm = pytest.importorskip("dbm.ndbm") -# path = mktemp(suffix=".ndbm") # pragma: no cover -# atexit.register(atexit_rmglob, path + "*") # pragma: no cover -# store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreBerkeleyDB(TestDBMStore): -# def create_store(self, **kwargs): -# bsddb3 = pytest.importorskip("bsddb3") -# path = mktemp(suffix=".dbm") -# atexit.register(os.remove, path) -# store = DBMStore(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) -# return store - - # class TestSQLiteStore(StoreTests): # def create_store(self, **kwargs): # pytest.importorskip("sqlite3") diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index c0cf6531e8..d9001d3d78 100644 --- a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -31,7 +31,6 @@ # ) # from zarr._storage.v3 import ( # ConsolidatedMetadataStoreV3, -# DBMStoreV3, # DirectoryStoreV3, # FSStoreV3, # KVStore, @@ -47,11 +46,6 @@ # # pytest will fail to run if the following fixtures aren't imported here # from .test_storage import StoreTests as _StoreTests # from .test_storage import TestConsolidatedMetadataStore as _TestConsolidatedMetadataStore -# from .test_storage import TestDBMStore as _TestDBMStore -# from .test_storage import TestDBMStoreBerkeleyDB as _TestDBMStoreBerkeleyDB -# from .test_storage import TestDBMStoreDumb as _TestDBMStoreDumb -# from .test_storage import TestDBMStoreGnu as _TestDBMStoreGnu -# from .test_storage import TestDBMStoreNDBM as _TestDBMStoreNDBM # from .test_storage import TestDirectoryStore as _TestDirectoryStore # from .test_storage import TestFSStore as _TestFSStore # from .test_storage import TestLRUStoreCache as _TestLRUStoreCache @@ -413,55 +407,6 @@ # return store -# class TestDBMStoreV3(_TestDBMStore, StoreV3Tests): -# def create_store(self, dimension_separator=None): -# path = mktemp(suffix=".anydbm") -# atexit.register(atexit_rmglob, path + "*") -# # create store using default dbm implementation -# store = DBMStoreV3(path, flag="n", dimension_separator=dimension_separator) -# return store - - -# class TestDBMStoreV3Dumb(_TestDBMStoreDumb, StoreV3Tests): -# def create_store(self, **kwargs): -# path = mktemp(suffix=".dumbdbm") -# atexit.register(atexit_rmglob, path + "*") - -# import dbm.dumb as dumbdbm - -# store = DBMStoreV3(path, flag="n", open=dumbdbm.open, **kwargs) -# return store - - -# class TestDBMStoreV3Gnu(_TestDBMStoreGnu, StoreV3Tests): -# def 
create_store(self, **kwargs): -# gdbm = pytest.importorskip("dbm.gnu") -# path = mktemp(suffix=".gdbm") # pragma: no cover -# atexit.register(os.remove, path) # pragma: no cover -# store = DBMStoreV3( -# path, flag="n", open=gdbm.open, write_lock=False, **kwargs -# ) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreV3NDBM(_TestDBMStoreNDBM, StoreV3Tests): -# def create_store(self, **kwargs): -# ndbm = pytest.importorskip("dbm.ndbm") -# path = mktemp(suffix=".ndbm") # pragma: no cover -# atexit.register(atexit_rmglob, path + "*") # pragma: no cover -# store = DBMStoreV3(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover -# return store # pragma: no cover - - -# class TestDBMStoreV3BerkeleyDB(_TestDBMStoreBerkeleyDB, StoreV3Tests): -# def create_store(self, **kwargs): -# bsddb3 = pytest.importorskip("bsddb3") -# path = mktemp(suffix=".dbm") -# atexit.register(os.remove, path) -# store = DBMStoreV3(path, flag="n", open=bsddb3.btopen, write_lock=False, **kwargs) -# return store - - # class TestSQLiteStoreV3(_TestSQLiteStore, StoreV3Tests): # def create_store(self, **kwargs): # pytest.importorskip("sqlite3") @@ -609,7 +554,6 @@ # def test_top_level_imports(): # for store_name in [ -# "DBMStoreV3", # "KVStoreV3", # "DirectoryStoreV3", # "LRUStoreCacheV3", From df179c39c424d71a898f14ae00a70fa55d4242d8 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 14:06:17 +0200 Subject: [PATCH 19/22] chore: remove sqlite storage --- docs/api/storage.rst | 4 - docs/api/v3.rst | 1 - docs/tutorial.rst | 10 -- src/zarr/__init__.py | 1 - src/zarr/storage.py | 198 --------------------------------------- tests/test_core.py | 16 +--- tests/test_hierarchy.py | 10 -- tests/test_storage.py | 37 -------- tests/test_storage_v3.py | 20 ---- 9 files changed, 1 insertion(+), 296 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 9a4c9c7b4a..7df93c4c8c 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -11,10 +11,6 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush -.. autoclass:: SQLiteStore - - .. automethod:: close - .. autoclass:: LRUStoreCache .. automethod:: invalidate diff --git a/docs/api/v3.rst b/docs/api/v3.rst index b880c1859c..dce07ace5f 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -50,7 +50,6 @@ You can also check ``Store type`` here (which indicates Zarr V3). .. autoclass:: MemoryStoreV3 .. autoclass:: DirectoryStoreV3 .. autoclass:: ZipStoreV3 -.. autoclass:: SQLiteStoreV3 .. autoclass:: LRUStoreCacheV3 .. autoclass:: ConsolidatedMetadataStoreV3 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 95052f3667..1d20b73966 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -805,16 +805,6 @@ boundaries. Note also that the ``close()`` method must be called after writing any data to the store, otherwise essential records will not be written to the underlying zip file. 
-In Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class which -enables the SQLite database to be used for storing an array or group (requires -Python is built with SQLite support):: - - >>> store = zarr.SQLiteStore('data/example.sqldb') - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - Distributed/cloud storage ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 1da0f95ce7..cbbdfdaf27 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -37,7 +37,6 @@ LRUStoreCache, MemoryStore, NestedDirectoryStore, - SQLiteStore, TempStore, ZipStore, ) diff --git a/src/zarr/storage.py b/src/zarr/storage.py index 61686fa152..9b2e9db92e 100644 --- a/src/zarr/storage.py +++ b/src/zarr/storage.py @@ -17,7 +17,6 @@ import atexit import errno import glob -import operator import os import re import shutil @@ -27,7 +26,6 @@ from collections import OrderedDict from collections.abc import MutableMapping from os import scandir -from pickle import PicklingError from threading import Lock, RLock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any import uuid @@ -2002,202 +2000,6 @@ def __delitem__(self, key): self._invalidate_value(key) -class SQLiteStore(Store): - """Storage class using SQLite. - - Parameters - ---------- - path : string - Location of database file. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - **kwargs - Keyword arguments passed through to the `sqlite3.connect` function. - - Examples - -------- - Store a single array:: - - >>> import zarr - >>> store = zarr.SQLiteStore('data/array.sqldb') - >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) - >>> z[...] = 42 - >>> store.close() # don't forget to call this when you're done - - Store a group:: - - >>> store = zarr.SQLiteStore('data/group.sqldb') - >>> root = zarr.group(store=store, overwrite=True) - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) - >>> bar[...] 
= 42 - >>> store.close() # don't forget to call this when you're done - """ - - def __init__(self, path, dimension_separator=None, **kwargs): - import sqlite3 - - self._dimension_separator = dimension_separator - - # normalize path - if path != ":memory:": - path = os.path.abspath(path) - - # store properties - self.path = path - self.kwargs = kwargs - - # allow threading if SQLite connections are thread-safe - # - # ref: https://www.sqlite.org/releaselog/3_3_1.html - # ref: https://github.com/python/cpython/issues/71377 - check_same_thread = True - if sqlite3.sqlite_version_info >= (3, 3, 1): - check_same_thread = False - - # keep a lock for serializing mutable operations - self.lock = Lock() - - # open database - self.db = sqlite3.connect( - self.path, - detect_types=0, - isolation_level=None, - check_same_thread=check_same_thread, - **self.kwargs, - ) - - # handle keys as `str`s - self.db.text_factory = str - - # get a cursor to read/write to the database - self.cursor = self.db.cursor() - - # initialize database with our table if missing - with self.lock: - self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") - - def __getstate__(self): - if self.path == ":memory:": - raise PicklingError("Cannot pickle in-memory SQLite databases") - return self.path, self.kwargs - - def __setstate__(self, state): - path, kwargs = state - self.__init__(path=path, **kwargs) - - def close(self): - """Closes the underlying database.""" - - # close cursor and db objects - self.cursor.close() - self.db.close() - - def __getitem__(self, key): - value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) - for (v,) in value: - return v - raise KeyError(key) - - def __setitem__(self, key, value): - self.update({key: value}) - - def __delitem__(self, key): - with self.lock: - self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) - if self.cursor.rowcount < 1: - raise KeyError(key) - - def __contains__(self, key): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) - for (has,) in cs: - has = bool(has) - return has - - def items(self): - kvs = self.cursor.execute("SELECT k, v FROM zarr") - yield from kvs - - def keys(self): - ks = self.cursor.execute("SELECT k FROM zarr") - for (k,) in ks: - yield k - - def values(self): - vs = self.cursor.execute("SELECT v FROM zarr") - for (v,) in vs: - yield v - - def __iter__(self): - return self.keys() - - def __len__(self): - cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") - for (c,) in cs: - return c - - def update(self, *args, **kwargs): - args += (kwargs,) - - kv_list = [] - for dct in args: - for k, v in dct.items(): - v = ensure_contiguous_ndarray_like(v) - - # Accumulate key-value pairs for storage - kv_list.append((k, v)) - - with self.lock: - self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) - - def listdir(self, path=None): - path = normalize_storage_path(path) - sep = "_" if path == "" else "/" - keys = self.cursor.execute( - """ - SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( - SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m - FROM zarr WHERE k LIKE (? || "{sep}%") - ) ORDER BY l ASC - """.format(sep=sep), - (path, path), - ) - keys = list(map(operator.itemgetter(0), keys)) - return keys - - def getsize(self, path=None): - path = normalize_storage_path(path) - size = self.cursor.execute( - """ - SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) 
+ 1), "/"), "/") - """, - (path, path), - ) - for (s,) in size: - return s - - def rmdir(self, path=None): - path = normalize_storage_path(path) - if path: - with self.lock: - self.cursor.execute('DELETE FROM zarr WHERE k LIKE (? || "/%")', (path,)) - else: - self.clear() - - def clear(self): - with self.lock: - self.cursor.executescript( - """ - BEGIN TRANSACTION; - DROP TABLE zarr; - CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); - COMMIT TRANSACTION; - """ - ) - - class ConsolidatedMetadataStore(Store): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/tests/test_core.py b/tests/test_core.py index 8f64be81be..d996af5563 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -41,15 +41,13 @@ KVStore, LRUStoreCache, NestedDirectoryStore, - SQLiteStore, - atexit_rmtree, init_array, init_group, normalize_store_arg, ) from zarr.util import buffer_size -from .util import have_fsspec, mktemp +from .util import have_fsspec # noinspection PyMethodMayBeStatic @@ -1665,18 +1663,6 @@ def expected(self): ] -class TestArrayWithSQLiteStore(TestArray): - def create_store(self): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStore(path) - return store - - def test_nbytes_stored(self): - pass # not implemented - - class TestArrayWithNoCompressor(TestArray): compressor = None diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 5e36288413..f85056f5ff 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -28,7 +28,6 @@ LRUStoreCache, MemoryStore, NestedDirectoryStore, - SQLiteStore, ZipStore, array_meta_key, atexit_rmtree, @@ -1102,15 +1101,6 @@ def test_move(self): pass -class TestGroupWithSQLiteStore(TestGroup): - def create_store(self): - pytest.importorskip("sqlite3") - path = mktemp(suffix=".db") - atexit.register(atexit_rmtree, path) - store = SQLiteStore(path) - return store, None - - class TestGroupWithChunkStore(TestGroup): @staticmethod def create_store(): diff --git a/tests/test_storage.py b/tests/test_storage.py index 7895955376..f6f9f6d911 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -38,7 +38,6 @@ # LRUStoreCache, # MemoryStore, # NestedDirectoryStore, - # SQLiteStore, # Store, # TempStore, # ZipStore, @@ -1601,42 +1600,6 @@ def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): # assert np.array_equiv(y, x) -# class TestSQLiteStore(StoreTests): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path, **kwargs) -# return store - -# def test_underscore_in_name(self): -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStore(path) -# store["a"] = b"aaa" -# store["a_b"] = b"aa_bb" -# store.rmdir("a") -# assert "a_b" in store - - -# class TestSQLiteStoreInMemory(TestSQLiteStore): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# store = SQLiteStore(":memory:", **kwargs) -# return store - -# def test_pickle(self): - -# # setup store -# store = self.create_store() -# store[self.root + "foo"] = b"bar" -# store[self.root + "baz"] = b"quux" - -# # round-trip through pickle -# with pytest.raises(PicklingError): -# pickle.dumps(store) - - # class TestLRUStoreCache(StoreTests): # CountingClass = CountingDict diff --git a/tests/test_storage_v3.py b/tests/test_storage_v3.py index d9001d3d78..671bfeee2a 100644 --- 
a/tests/test_storage_v3.py +++ b/tests/test_storage_v3.py @@ -37,7 +37,6 @@ # KVStoreV3, # LRUStoreCacheV3, # MemoryStoreV3, -# SQLiteStoreV3, # StoreV3, # ZipStoreV3, # ) @@ -50,8 +49,6 @@ # from .test_storage import TestFSStore as _TestFSStore # from .test_storage import TestLRUStoreCache as _TestLRUStoreCache # from .test_storage import TestMemoryStore as _TestMemoryStore -# from .test_storage import TestSQLiteStore as _TestSQLiteStore -# from .test_storage import TestSQLiteStoreInMemory as _TestSQLiteStoreInMemory # from .test_storage import TestZipStore as _TestZipStore # from .test_storage import dimension_separator_fixture, s3, skip_if_nested_chunks # noqa @@ -407,22 +404,6 @@ # return store -# class TestSQLiteStoreV3(_TestSQLiteStore, StoreV3Tests): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# path = mktemp(suffix=".db") -# atexit.register(atexit_rmtree, path) -# store = SQLiteStoreV3(path, **kwargs) -# return store - - -# class TestSQLiteStoreV3InMemory(_TestSQLiteStoreInMemory, StoreV3Tests): -# def create_store(self, **kwargs): -# pytest.importorskip("sqlite3") -# store = SQLiteStoreV3(":memory:", **kwargs) -# return store - - # @pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") # class TestStorageTransformerV3(TestMappingStoreV3): # def create_store(self, **kwargs): @@ -558,7 +539,6 @@ # "DirectoryStoreV3", # "LRUStoreCacheV3", # "MemoryStoreV3", -# "SQLiteStoreV3", # "ZipStoreV3", # ]: # if v3_api_available: From 35130901875eb47b86cfbc1468ffc8c2edd198d1 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 14:22:20 +0200 Subject: [PATCH 20/22] docs: add release notes --- docs/release.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index 3ed47ff9f5..73ffc3d628 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -18,6 +18,12 @@ Release notes Unreleased (v3) --------------- +Refactoring +~~~~~~~~~~~ + +* Remove storage classes for the following backends: N5, Redis, MongoDB, ABS, LMDB, DBM, and SQLite. +The intention is that these storage classes will be developed external to ``zarr-python``. + Maintenance ~~~~~~~~~~~ From a55f8d6edec072d11483d773cb72816906fdbd27 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 14:23:07 +0200 Subject: [PATCH 21/22] docs: edit release notes --- docs/release.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index 73ffc3d628..33302e14ae 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -22,7 +22,8 @@ Refactoring ~~~~~~~~~~~ * Remove storage classes for the following backends: N5, Redis, MongoDB, ABS, LMDB, DBM, and SQLite. -The intention is that these storage classes will be developed external to ``zarr-python``. + The intention is that these storage classes will be developed external to ``zarr-python``. + By :user:`Davis Bennett ` :issue:`1791`. Maintenance ~~~~~~~~~~~ From 85732032fb389d4573a8802de270cd7d129d99b1 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 22 Apr 2024 16:19:01 +0200 Subject: [PATCH 22/22] docs: remove n5.rst --- docs/api/n5.rst | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/api/n5.rst diff --git a/docs/api/n5.rst b/docs/api/n5.rst deleted file mode 100644 index b6a8d8c61e..0000000000 --- a/docs/api/n5.rst +++ /dev/null @@ -1,5 +0,0 @@ -N5 (``zarr.n5``) -================ -.. automodule:: zarr.n5 - -.. autoclass:: N5Store
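
Note on the removed backends: because the zarr-python 2.x store API accepts any
``MutableMapping``, the storage classes removed in this series (DBM, SQLite, LMDB,
Redis, MongoDB, ABS, N5) can be maintained as external packages without changes to
``zarr-python`` itself. The sketch below is illustrative only and is not part of this
patch series; the class name ``ExternalDBMStore`` is hypothetical, and the example
assumes the standard-library ``dbm`` module and ``numcodecs.compat.ensure_bytes``,
mirroring what the removed ``DBMStore`` did internally::

    from collections.abc import MutableMapping

    from numcodecs.compat import ensure_bytes


    class ExternalDBMStore(MutableMapping):
        """Minimal, externally maintained stand-in for the removed DBMStore."""

        def __init__(self, path, flag="c"):
            import dbm

            # dbm.open picks whichever DBM implementation is available locally
            self.db = dbm.open(path, flag)

        @staticmethod
        def _key(key):
            # dbm requires bytes keys; zarr passes str keys such as ".zarray"
            return key.encode("ascii") if isinstance(key, str) else key

        def __getitem__(self, key):
            return self.db[self._key(key)]

        def __setitem__(self, key, value):
            # zarr may hand us array-like values; coerce to bytes before storing
            self.db[self._key(key)] = ensure_bytes(value)

        def __delitem__(self, key):
            del self.db[self._key(key)]

        def __iter__(self):
            return (k.decode("ascii") for k in self.db.keys())

        def __len__(self):
            return sum(1 for _ in self.db.keys())

        def close(self):
            self.db.close()

Such a store can then be passed anywhere zarr 2.x accepts a store object::

    >>> import zarr
    >>> store = ExternalDBMStore('data/example.db', flag='n')
    >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
    >>> z[...] = 42
    >>> store.close()

Existing data held in one of the removed stores can be copied to a still-supported
store (for example a ``DirectoryStore``) with ``zarr.copy_store`` while running a
2.x release that still ships the old class.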