From 0c445fd8184da71630a6813c5aa507887e02cf75 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 17 Mar 2024 16:13:54 -0400 Subject: [PATCH 01/32] change entries property to a structured array, add from_dict --- virtualizarr/manifests/manifest.py | 104 ++++++++++++++---- .../tests/test_manifests/test_manifest.py | 16 +-- 2 files changed, 90 insertions(+), 30 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 0e54394e..937d7d8f 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,9 +1,9 @@ import itertools import re -from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast +from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast import numpy as np -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict from ..types import ChunkKey @@ -14,6 +14,11 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) +ChunkDict = NewType( + "ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]] +) # just the .zattrs (for one array or for the whole store/group) + + class ChunkEntry(BaseModel): """ Information for a single chunk in the manifest. @@ -42,11 +47,20 @@ def to_kerchunk(self) -> List[Union[str, int]]: return [self.path, self.offset, self.length] +# TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 +# See https://numpy.org/neps/nep-0055-string_dtype.html +MANIFEST_STRUCTURED_ARRAY_DTYPES = np.dtype( + [("path", " Mapping[ChunkKey, ChunkEntry]: - validate_chunk_keys(list(entries.keys())) + def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + # TODO do some input validation here first? - # TODO what if pydantic adjusts anything during validation? - return entries + # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? + shape = get_chunk_grid_shape(chunks.keys()) + + # Initializing to empty implies that entries with path='' are treated as missing chunks + entries = np.empty(shape=shape, dtype=MANIFEST_STRUCTURED_ARRAY_DTYPES) + + # populate the array + for key, entry in chunks.items(): + entries[split(key)] = tuple(entry.values()) + + return ChunkManifest(entries=entries) @property def ndim_chunk_grid(self) -> int: @@ -81,7 +108,7 @@ def ndim_chunk_grid(self) -> int: Not the same as the dimension of an array backed by this chunk manifest. """ - return get_ndim_from_key(list(self.entries.keys())[0]) + return self.entries.ndim @property def shape_chunk_grid(self) -> Tuple[int, ...]: @@ -90,23 +117,56 @@ def shape_chunk_grid(self) -> Tuple[int, ...]: Not the same as the shape of an array backed by this chunk manifest. 
""" - return get_chunk_grid_shape(list(self.entries.keys())) + return self.entries.shape def __repr__(self) -> str: return f"ChunkManifest" def __getitem__(self, key: ChunkKey) -> ChunkEntry: - return self.chunks[key] + indices = split(key) + return ChunkEntry(self.entries[indices]) def __iter__(self) -> Iterator[ChunkKey]: return iter(self.chunks.keys()) def __len__(self) -> int: - return len(self.chunks) + return self.entries.size + + def dict(self) -> ChunkDict: + """ + Converts the entire manifest to a nested dictionary, of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + """ - def dict(self) -> dict[str, dict[str, Union[str, int]]]: - """Converts the entire manifest to a nested dictionary.""" - return {k: dict(entry) for k, entry in self.entries.items()} + def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: + return { + "path": entry[0], + "offset": entry[1], + "length": entry[2], + } + + # print(self.entries.dtype) + + coord_vectors = np.mgrid[ + tuple(slice(None, length) for length in self.shape_chunk_grid) + ] + # print(coord_vectors) + # print(self.entries) + + # TODO don't include entry if path='' ? + return cast( + ChunkDict, + { + join(inds): _entry_to_dict(entry.item()) + for *inds, entry in np.nditer([*coord_vectors, self.entries]) + }, + ) @staticmethod def from_zarr_json(filepath: str) -> "ChunkManifest": @@ -125,12 +185,12 @@ def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": return ChunkManifest(entries=chunkentries) -def split(key: ChunkKey) -> List[int]: - return list(int(i) for i in key.split(".")) +def split(key: ChunkKey) -> Tuple[int, ...]: + return tuple(int(i) for i in key.split(".")) -def join(inds: Iterable[int]) -> ChunkKey: - return cast(ChunkKey, ".".join(str(i) for i in inds)) +def join(inds: Iterable[Any]) -> ChunkKey: + return cast(ChunkKey, ".".join(str(i) for i in list(inds))) def get_ndim_from_key(key: str) -> int: diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 4a977837..50149692 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -9,7 +9,7 @@ def test_create_manifest(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.dict() == chunks chunks = { @@ -18,7 +18,7 @@ def test_create_manifest(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.dict() == chunks def test_invalid_chunk_entries(self): @@ -26,7 +26,7 @@ def test_invalid_chunk_entries(self): "0.0.0": {"path": "s3://bucket/foo.nc"}, } with pytest.raises(ValidationError, match="missing"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0.0": { @@ -36,21 +36,21 @@ def test_invalid_chunk_entries(self): }, } with pytest.raises(ValidationError, match="should be a valid integer"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) def 
test_invalid_chunk_keys(self): chunks = { "0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="Invalid format for chunk key: '0.0.'"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, "0": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, } with pytest.raises(ValueError, match="Inconsistent number of dimensions"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -58,13 +58,13 @@ def test_invalid_chunk_keys(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, } with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "1": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) class TestProperties: From 3bc483f04cf91bb04cb131ecb83649bbe839dfc7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 17 Mar 2024 17:05:11 -0400 Subject: [PATCH 02/32] fix validation --- virtualizarr/manifests/manifest.py | 32 ++++------- .../tests/test_manifests/test_manifest.py | 53 +++++++------------ 2 files changed, 28 insertions(+), 57 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 937d7d8f..bfe393f2 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,4 +1,3 @@ -import itertools import re from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast @@ -88,6 +87,7 @@ class ChunkManifest(BaseModel): @classmethod def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # TODO do some input validation here first? + validate_chunk_keys(chunks.keys()) # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(chunks.keys()) @@ -97,7 +97,14 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # populate the array for key, entry in chunks.items(): - entries[split(key)] = tuple(entry.values()) + try: + entries[split(key)] = tuple(entry.values()) + except (ValueError, TypeError) as e: + msg = ( + "Each chunk entry must be of the form dict(path=, offset=, length=), " + f"but got {entry}" + ) + raise ValueError(msg) from e return ChunkManifest(entries=entries) @@ -214,9 +221,6 @@ def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): f"Inconsistent number of dimensions between chunk key {key} and {first_key}: {other_ndim} vs {ndim}" ) - # Check that the keys collectively form a complete grid - check_keys_form_grid(chunk_keys) - def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: # find max chunk index along each dimension @@ -227,24 +231,6 @@ def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: return chunk_grid_shape -def check_keys_form_grid(chunk_keys: Iterable[ChunkKey]): - """Check that the chunk keys collectively form a complete grid""" - - chunk_grid_shape = get_chunk_grid_shape(chunk_keys) - - # create every possible combination - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - all_required_chunk_keys: set[ChunkKey] = set( - join(inds) for inds in all_possible_combos - ) - - # check that every possible combination is represented once in the list of chunk keys - if set(chunk_keys) != all_required_chunk_keys: - raise ValueError("Chunk keys do not form a complete grid") - - def concat_manifests(manifests: List["ChunkManifest"], axis: int) -> "ChunkManifest": """ Concatenate manifests along an existing dimension. diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 50149692..ba268472 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,5 +1,4 @@ import pytest -from pydantic import ValidationError from virtualizarr.manifests import ChunkManifest, concat_manifests, stack_manifests @@ -25,7 +24,7 @@ def test_invalid_chunk_entries(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc"}, } - with pytest.raises(ValidationError, match="missing"): + with pytest.raises(ValueError, match="must be of the form"): ChunkManifest.from_dict(chunks) chunks = { @@ -35,7 +34,7 @@ def test_invalid_chunk_entries(self): "length": 100, }, } - with pytest.raises(ValidationError, match="should be a valid integer"): + with pytest.raises(ValueError, match="must be of the form"): ChunkManifest.from_dict(chunks) def test_invalid_chunk_keys(self): @@ -52,20 +51,6 @@ def test_invalid_chunk_keys(self): with pytest.raises(ValueError, match="Inconsistent number of dimensions"): ChunkManifest.from_dict(chunks) - chunks = { - "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, - } - with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest.from_dict(chunks) - - chunks = { - "1": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - } - with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest.from_dict(chunks) - class TestProperties: def test_chunk_grid_info(self): @@ -75,21 +60,21 @@ def test_chunk_grid_info(self): "0.1.0": 
{"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.ndim_chunk_grid == 3 assert manifest.shape_chunk_grid == (1, 2, 2) class TestEquals: def test_equals(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } @@ -102,21 +87,21 @@ def test_equals(self): # Perhaps by testing the property that splitting along a dimension then concatenating the pieces along that dimension should recreate the original manifest? class TestCombineManifests: def test_concat(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) axis = 1 - expected = ChunkManifest( - entries={ + expected = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, @@ -128,21 +113,21 @@ def test_concat(self): assert result.dict() == expected.dict() def test_stack(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) axis = 1 - expected = ChunkManifest( - entries={ + expected = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, From 20f2ded45baf020968203441e0d4ae14bea042cd Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 11:04:36 -0400 Subject: [PATCH 03/32] equals method --- virtualizarr/manifests/manifest.py | 11 ++++++----- virtualizarr/tests/test_manifests/test_manifest.py | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index bfe393f2..fb14c297 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -158,23 +158,24 @@ def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: "length": entry[2], } - # print(self.entries.dtype) - coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] - # print(coord_vectors) - # print(self.entries) - # TODO don't include entry if path='' ? return cast( ChunkDict, { join(inds): _entry_to_dict(entry.item()) for *inds, entry in np.nditer([*coord_vectors, self.entries]) + if entry.item()[0] + != "" # don't include entry if path='' (i.e. 
empty chunk) }, ) + def __eq__(self, other: Any) -> bool: + """Two manifests are equal if all of their entries are identical.""" + return (self.entries == other.entries).all() + @staticmethod def from_zarr_json(filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index ba268472..38b16f09 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -79,7 +79,6 @@ def test_equals(self): "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) - assert not manifest1 == manifest2 assert manifest1 != manifest2 From be8af12d285ee31fd69ee76a26a82fbd6d0f1bf4 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 12:30:06 -0400 Subject: [PATCH 04/32] re-implemented concatenation through concatenation of the wrapped structured array --- virtualizarr/manifests/__init__.py | 7 +- virtualizarr/manifests/array_api.py | 12 +-- virtualizarr/manifests/manifest.py | 86 +------------------ .../tests/test_manifests/test_array.py | 28 +++--- .../tests/test_manifests/test_manifest.py | 11 ++- 5 files changed, 34 insertions(+), 110 deletions(-) diff --git a/virtualizarr/manifests/__init__.py b/virtualizarr/manifests/__init__.py index 39f60c59..c317ed6a 100644 --- a/virtualizarr/manifests/__init__.py +++ b/virtualizarr/manifests/__init__.py @@ -2,9 +2,4 @@ # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring. from .array import ManifestArray # type: ignore # noqa -from .manifest import ( # type: ignore # noqa - ChunkEntry, - ChunkManifest, - concat_manifests, - stack_manifests, -) +from .manifest import ChunkEntry, ChunkManifest # type: ignore # noqa diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 9333d88e..4bfea632 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -4,7 +4,7 @@ import numpy as np from ..zarr import Codec, ZArray -from .manifest import concat_manifests, stack_manifests +from .manifest import ChunkManifest if TYPE_CHECKING: from .array import ManifestArray @@ -123,10 +123,11 @@ def concatenate( new_shape = list(first_shape) new_shape[axis] = new_length_along_concat_axis - concatenated_manifest = concat_manifests( - [arr.manifest for arr in arrays], + concatenated_manifest_entries = np.concatenate( + [arr.manifest.entries for arr in arrays], axis=axis, ) + concatenated_manifest = ChunkManifest(entries=concatenated_manifest_entries) new_zarray = ZArray( chunks=first_arr.chunks, @@ -206,10 +207,11 @@ def stack( new_shape = list(first_shape) new_shape.insert(axis, length_along_new_stacked_axis) - stacked_manifest = stack_manifests( - [arr.manifest for arr in arrays], + stacked_manifest_entries = np.stack( + [arr.manifest.entries for arr in arrays], axis=axis, ) + stacked_manifest = ChunkManifest(entries=stacked_manifest_entries) # chunk size has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index fb14c297..456a1333 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,5 +1,5 @@ import re -from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast +from typing import Any, Iterable, Iterator, List, NewType, Tuple, Union, cast import numpy 
as np from pydantic import BaseModel, ConfigDict @@ -13,9 +13,7 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) -ChunkDict = NewType( - "ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]] -) # just the .zattrs (for one array or for the whole store/group) +ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) class ChunkEntry(BaseModel): @@ -190,7 +188,7 @@ def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": chunkentries = { k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest(entries=chunkentries) + return ChunkManifest.from_dict(chunkentries) def split(key: ChunkKey) -> Tuple[int, ...]: @@ -230,81 +228,3 @@ def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: max(indices_along_one_dim) + 1 for indices_along_one_dim in zipped_indices ) return chunk_grid_shape - - -def concat_manifests(manifests: List["ChunkManifest"], axis: int) -> "ChunkManifest": - """ - Concatenate manifests along an existing dimension. - - This only requires adjusting one index of chunk keys along a single dimension. - - Note axis is not expected to be negative. - """ - if len(manifests) == 1: - return manifests[0] - - chunk_grid_shapes = [manifest.shape_chunk_grid for manifest in manifests] - lengths_along_concat_dim = [shape[axis] for shape in chunk_grid_shapes] - - # Note we do not need to change the keys of the first manifest - chunk_index_offsets = np.cumsum(lengths_along_concat_dim)[:-1] - new_entries = [ - adjust_chunk_keys(manifest.entries, axis, offset) - for manifest, offset in zip(manifests[1:], chunk_index_offsets) - ] - all_entries = [manifests[0].entries] + new_entries - merged_entries = dict((k, v) for d in all_entries for k, v in d.items()) - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def adjust_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, offset: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have been offset along one axis.""" - - def offset_key(key: ChunkKey, axis: int, offset: int) -> ChunkKey: - inds = split(key) - inds[axis] += offset - return join(inds) - - return {offset_key(k, axis, offset): v for k, v in entries.items()} - - -def stack_manifests(manifests: List[ChunkManifest], axis: int) -> "ChunkManifest": - """ - Stack manifests along a new dimension. - - This only requires inserting one index into all chunk keys to add a new dimension. - - Note axis is not expected to be negative. 
- """ - - # even if there is only one manifest it still needs a new axis inserted - chunk_indexes_along_new_dim = range(len(manifests)) - new_entries = [ - insert_new_axis_into_chunk_keys(manifest.entries, axis, new_index_value) - for manifest, new_index_value in zip(manifests, chunk_indexes_along_new_dim) - ] - merged_entries = dict((k, v) for d in new_entries for k, v in d.items()) - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def insert_new_axis_into_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, new_index_value: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have a new axis inserted, with a given value.""" - - def insert_axis(key: ChunkKey, new_axis: int, index_value: int) -> ChunkKey: - inds = split(key) - inds.insert(new_axis, index_value) - return join(inds) - - return {insert_axis(k, axis, new_index_value): v for k, v in entries.items()} diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index aa10d91c..b587f88a 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -13,7 +13,7 @@ def test_create_manifestarray(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -38,7 +38,7 @@ def test_create_invalid_manifestarray(self): chunks_dict = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -79,7 +79,7 @@ def test_equals(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -118,14 +118,14 @@ def test_not_equal_chunk_entries(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) assert not (marr1 == marr2).all() @@ -154,14 +154,14 @@ def test_concat(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, 
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.concatenate([marr1, marr2], axis=1) @@ -199,14 +199,14 @@ def test_stack(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.stack([marr1, marr2], axis=1) @@ -242,28 +242,30 @@ def test_refuse_combine(): chunks_dict1 = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } + chunkmanifest1 = ChunkManifest.from_dict(chunks_dict1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, } - marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunks_dict1) + chunkmanifest2 = ChunkManifest.from_dict(chunks_dict2) + marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunkmanifest1) zarray_wrong_compressor = zarray_common.copy() zarray_wrong_compressor["compressor"] = None - marr2 = ManifestArray(zarray=zarray_wrong_compressor, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_compressor, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(NotImplementedError, match="different codecs"): func([marr1, marr2], axis=0) zarray_wrong_dtype = zarray_common.copy() zarray_wrong_dtype["dtype"] = np.dtype("int64") - marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) zarray_wrong_dtype = zarray_common.copy() zarray_wrong_dtype["dtype"] = np.dtype("int64") - marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 38b16f09..062ad7ad 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,6 +1,7 @@ +import numpy as np import pytest -from virtualizarr.manifests import ChunkManifest, concat_manifests, stack_manifests +from virtualizarr.manifests import ChunkManifest class TestCreateManifest: @@ -108,7 +109,10 @@ def test_concat(self): } ) - result = concat_manifests([manifest1, manifest2], axis=axis) + result_manifest = np.concatenate( + [manifest1.entries, manifest2.entries], axis=axis + ) + result = ChunkManifest(entries=result_manifest) assert result.dict() == expected.dict() def test_stack(self): @@ -134,7 +138,8 @@ def test_stack(self): } ) - result = stack_manifests([manifest1, manifest2], axis=axis) + result_manifest = np.stack([manifest1.entries, manifest2.entries], axis=axis) + result = 
ChunkManifest(entries=result_manifest) assert result.dict() == expected.dict() From bd8ad2283b854cff7540d69a1323c00fe520327e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 12:50:09 -0400 Subject: [PATCH 05/32] fixed manifest.from_kerchunk_dict --- virtualizarr/manifests/manifest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 456a1333..1636d8eb 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -43,6 +43,9 @@ def to_kerchunk(self) -> List[Union[str, int]]: """Write out in the format that kerchunk uses for chunk entries.""" return [self.path, self.offset, self.length] + def dict(self) -> dict[str, Union[str, int]]: + return dict(path=self.path, offset=self.offset, length=self.length) + # TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 # See https://numpy.org/neps/nep-0055-string_dtype.html @@ -186,9 +189,10 @@ def to_zarr_json(self, filepath: str) -> None: @classmethod def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": chunkentries = { - k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() + cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() + for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest.from_dict(chunkentries) + return ChunkManifest.from_dict(cast(ChunkDict, chunkentries)) def split(key: ChunkKey) -> Tuple[int, ...]: From 385290dff384ef07acee6a5294a674980dfd4810 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 15:19:36 -0400 Subject: [PATCH 06/32] fixed kerchunk tests --- virtualizarr/kerchunk.py | 6 +++--- virtualizarr/tests/test_kerchunk.py | 14 ++++++++++---- virtualizarr/tests/test_xarray.py | 14 +++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 48e4b5af..7525d7b2 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -167,7 +167,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable) -> KerchunkArrRefs: Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). 
""" - from virtualizarr.manifests import ManifestArray + from virtualizarr.manifests import ChunkEntry, ManifestArray marr = var.data @@ -177,8 +177,8 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable) -> KerchunkArrRefs: ) arr_refs: dict[str, Union[str, List[Union[str, int]]]] = { - str(chunk_key): chunk_entry.to_kerchunk() - for chunk_key, chunk_entry in marr.manifest.entries.items() + str(chunk_key): ChunkEntry(**chunk_entry).to_kerchunk() + for chunk_key, chunk_entry in marr.manifest.dict().items() } zarray_dict = marr.zarray.to_kerchunk_json() diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 6fbee558..d59f6391 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -3,7 +3,7 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -40,8 +40,11 @@ def test_dataset_from_kerchunk_refs(): class TestAccessor: def test_accessor_to_kerchunk_dict(self): + manifest = ChunkManifest.from_dict( + {"0.0": dict(path="test.nc", offset=6144, length=48)} + ) arr = ManifestArray( - chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + chunkmanifest=manifest, zarray=dict( shape=(2, 3), dtype=np.dtype(" Date: Thu, 9 May 2024 19:36:05 -0400 Subject: [PATCH 07/32] change private attributes to 3 numpy arrays --- virtualizarr/manifests/manifest.py | 114 +++++++++++++++++++---------- 1 file changed, 76 insertions(+), 38 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index c239fe7e..b06fb877 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -5,7 +5,7 @@ import numpy as np from pydantic import BaseModel, ConfigDict -from ..types import ChunkKey +from virtualizarr.types import ChunkKey _INTEGER = ( r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero @@ -48,21 +48,13 @@ def dict(self) -> dict[str, Union[str, int]]: return dict(path=self.path, offset=self.offset, length=self.length) -# TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 -# See https://numpy.org/neps/nep-0055-string_dtype.html -MANIFEST_STRUCTURED_ARRAY_DTYPES = np.dtype( - [("path", " "ChunkManifest": - # TODO do some input validation here first? - validate_chunk_keys(chunks.keys()) + def from_arrays( + cls, + paths: np.ndarray[Any, np.dtypes.StringDType], + offsets: np.ndarray[Any, np.dtype("int32")], + lengths: np.ndarray[Any, np.dtype("int32")], + ) -> "ChunkManifest": + """ + Create manifest directly from numpy arrays containing the path and byte range information. - # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(chunks.keys()) + Useful if you want to avoid the memory overhead of creating an intermediate dictionary first, + as these 3 arrays are what will be used internally to store the references. 
- # Initializing to empty implies that entries with path='' are treated as missing chunks - entries = np.empty(shape=shape, dtype=MANIFEST_STRUCTURED_ARRAY_DTYPES) + Parameters + ---------- + paths: np.ndarray + offsets: np.ndarray + lengths: np.ndarray + """ - # populate the array - for key, entry in chunks.items(): - try: - entries[split(key)] = tuple(entry.values()) - except (ValueError, TypeError) as e: - msg = ( - "Each chunk entry must be of the form dict(path=, offset=, length=), " - f"but got {entry}" - ) - raise ValueError(msg) from e + # check types + if not isinstance(paths, np.ndarray): + raise TypeError(f"paths must be a numpy array, but got type {type(paths)}") + if not isinstance(offsets, np.ndarray): + raise TypeError( + f"offsets must be a numpy array, but got type {type(offsets)}" + ) + if not isinstance(lengths, np.ndarray): + raise TypeError( + f"lengths must be a numpy array, but got type {type(lengths)}" + ) - return ChunkManifest(entries=entries) + # check dtypes + if paths.dtype is not np.dtypes.StringDType: + raise ValueError( + f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" + ) + if offsets.dtype is not np.dtype("int32"): + raise ValueError( + f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" + ) + if lengths.dtype is not np.dtype("int32"): + raise ValueError( + f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" + ) + + # check shapes + shape = paths.shape + if offsets.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and offsets array do not match: {paths.shape} vs {offsets.shape}" + ) + if lengths.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and lengths array do not match: {paths.shape} vs {lengths.shape}" + ) + + obj = object.__new__(cls) + obj._paths = paths + obj._offsets = offsets + obj._lengths = lengths + + return obj @property def ndim_chunk_grid(self) -> int: @@ -119,7 +151,7 @@ def ndim_chunk_grid(self) -> int: Not the same as the dimension of an array backed by this chunk manifest. """ - return self.entries.ndim + return self._paths.ndim @property def shape_chunk_grid(self) -> Tuple[int, ...]: @@ -128,20 +160,23 @@ def shape_chunk_grid(self) -> Tuple[int, ...]: Not the same as the shape of an array backed by this chunk manifest. 
""" - return self.entries.shape + return self._paths.shape def __repr__(self) -> str: return f"ChunkManifest" def __getitem__(self, key: ChunkKey) -> ChunkEntry: indices = split(key) - return ChunkEntry(self.entries[indices]) + path = self._paths[indices] + offset = self._offsets[indices] + length = self._lengths[indices] + return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: - return iter(self.entries.keys()) + return iter(self._paths.keys()) def __len__(self) -> int: - return self.entries.size + return self._paths.size def dict(self) -> ChunkDict: """ @@ -178,7 +213,10 @@ def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: def __eq__(self, other: Any) -> bool: """Two manifests are equal if all of their entries are identical.""" - return (self.entries == other.entries).all() + paths_equal = (self.paths == other.paths).all() + offsets_equal = (self.offsets == other.offsets).all() + lengths_equal = (self.lengths == other.lengths).all() + return paths_equal and offsets_equal and lengths_equal @classmethod def from_zarr_json(cls, filepath: str) -> "ChunkManifest": From e93d3b81c3bb4d8850cb7b878d9d7a919ccbc447 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 9 May 2024 19:36:38 -0400 Subject: [PATCH 08/32] add from_arrays method --- virtualizarr/manifests/manifest.py | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index b06fb877..aaff57a0 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -80,6 +80,42 @@ class ChunkManifest(BaseModel): _offsets: np.ndarray[Any, np.dtype("int32")] _lengths: np.ndarray[Any, np.dtype("int32")] + @classmethod + def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + # TODO do some input validation here first? + validate_chunk_keys(chunks.keys()) + + # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? 
+ shape = get_chunk_grid_shape(chunks.keys()) + + # Initializing to empty implies that entries with path='' are treated as missing chunks + paths = np.empty(shape=shape, dtype=np.dtypes.StringDType) + offsets = np.empty(shape=shape, dtype=np.dtype("int32")) + lengths = np.empty(shape=shape, dtype=np.dtype("int32")) + + # populate the array + for key, entry in chunks.items(): + try: + entry = ChunkEntry(entry) + except (ValueError, TypeError) as e: + msg = ( + "Each chunk entry must be of the form dict(path=, offset=, length=), " + f"but got {entry}" + ) + raise ValueError(msg) from e + + split_key = split(key) + paths[split_key] = entry.path + offsets[split_key] = entry.offset + lengths[split_key] = entry.length + + obj = object.__new__(cls) + obj._paths = paths + obj._offsets = offsets + obj._lengths = lengths + + return obj + @classmethod def from_arrays( cls, From 39131431d2707bfc7ad8c7f4e14ee3919fcf24ee Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:10:29 -0400 Subject: [PATCH 09/32] to and from dict working again --- virtualizarr/manifests/manifest.py | 43 ++++++++++++++---------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index aaff57a0..519cfbc9 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -48,7 +48,7 @@ def dict(self) -> dict[str, Union[str, int]]: return dict(path=self.path, offset=self.offset, length=self.length) -class ChunkManifest(BaseModel): +class ChunkManifest: """ In-memory representation of a single Zarr chunk manifest. @@ -71,11 +71,6 @@ class ChunkManifest(BaseModel): so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. """ - model_config = ConfigDict( - frozen=True, - arbitrary_types_allowed=True, # so pydantic doesn't complain about the numpy array field - ) - _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype("int32")] _lengths: np.ndarray[Any, np.dtype("int32")] @@ -96,7 +91,8 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # populate the array for key, entry in chunks.items(): try: - entry = ChunkEntry(entry) + path, offset, length = entry.values() + entry = ChunkEntry(path=path, offset=offset, length=length) except (ValueError, TypeError) as e: msg = ( "Each chunk entry must be of the form dict(path=, offset=, length=), " @@ -224,34 +220,35 @@ def dict(self) -> ChunkDict: "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - """ - def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: - return { - "path": entry[0], - "offset": entry[1], - "length": entry[2], - } + Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary._ + """ coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] + d = { + join(inds): dict( + path=path.item(), offset=offset.item(), length=length.item() + ) + for *inds, path, offset, length in np.nditer( + [*coord_vectors, self._paths, self._offsets, self._lengths], + flags=("refs_ok",), + ) + if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk) + } + return cast( ChunkDict, - { - join(inds): _entry_to_dict(entry.item()) - for *inds, entry in np.nditer([*coord_vectors, self.entries]) - if entry.item()[0] - != "" # don't include entry if path='' (i.e. 
empty chunk) - }, + d, ) def __eq__(self, other: Any) -> bool: """Two manifests are equal if all of their entries are identical.""" - paths_equal = (self.paths == other.paths).all() - offsets_equal = (self.offsets == other.offsets).all() - lengths_equal = (self.lengths == other.lengths).all() + paths_equal = (self._paths == other._paths).all() + offsets_equal = (self._offsets == other._offsets).all() + lengths_equal = (self._lengths == other._lengths).all() return paths_equal and offsets_equal and lengths_equal @classmethod From 6a5d9961a79f96e1be9f98d309984e4c9f4f7a12 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:39:45 -0400 Subject: [PATCH 10/32] fix dtype comparisons --- virtualizarr/manifests/manifest.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 519cfbc9..cdb44098 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -71,9 +71,9 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. """ - _paths: np.ndarray[Any, np.dtypes.StringDType] - _offsets: np.ndarray[Any, np.dtype("int32")] - _lengths: np.ndarray[Any, np.dtype("int32")] + _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] + _offsets: np.ndarray[Any, np.dtype[np.int32]] + _lengths: np.ndarray[Any, np.dtype[np.int32]] @classmethod def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": @@ -115,9 +115,9 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtypes.StringDType], - offsets: np.ndarray[Any, np.dtype("int32")], - lengths: np.ndarray[Any, np.dtype("int32")], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + offsets: np.ndarray[Any, np.dtype[np.int32]], + lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": """ Create manifest directly from numpy arrays containing the path and byte range information. 
@@ -145,15 +145,15 @@ def from_arrays( ) # check dtypes - if paths.dtype is not np.dtypes.StringDType: + if paths.dtype != np.dtypes.StringDType(): raise ValueError( f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" ) - if offsets.dtype is not np.dtype("int32"): + if offsets.dtype != np.dtype("int32"): raise ValueError( f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" ) - if lengths.dtype is not np.dtype("int32"): + if lengths.dtype != np.dtype("int32"): raise ValueError( f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" ) From 8a77a0a291c89e29d98266adb4ee91aa55b1c708 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:40:06 -0400 Subject: [PATCH 11/32] depend on numpy release candidate --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index afdde7a7..d37411bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "h5netcdf", "pydantic", - "numpy", + "numpy>=2.0.0rc1", "ujson", "packaging", ] From a95117f58545006188ffcff8eb7923a85decf0f2 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:51:08 -0400 Subject: [PATCH 12/32] get concatenation and stacking working --- virtualizarr/manifests/array_api.py | 38 ++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 3322ab34..7131a314 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -124,11 +124,24 @@ def concatenate( new_shape = list(first_shape) new_shape[axis] = new_length_along_concat_axis - concatenated_manifest_entries = np.concatenate( - [arr.manifest.entries for arr in arrays], + # do concatenation of entries in manifest + concatenated_paths = np.concatenate( + [arr.manifest._paths for arr in arrays], axis=axis, ) - concatenated_manifest = ChunkManifest(entries=concatenated_manifest_entries) + concatenated_offsets = np.concatenate( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + concatenated_lengths = np.concatenate( + [arr.manifest._lengths for arr in arrays], + axis=axis, + ) + concatenated_manifest = ChunkManifest.from_arrays( + paths=concatenated_paths, + offsets=concatenated_offsets, + lengths=concatenated_lengths, + ) new_zarray = ZArray( chunks=first_arr.chunks, @@ -210,11 +223,24 @@ def stack( new_shape = list(first_shape) new_shape.insert(axis, length_along_new_stacked_axis) - stacked_manifest_entries = np.stack( - [arr.manifest.entries for arr in arrays], + # do stacking of entries in manifest + stacked_paths = np.stack( + [arr.manifest._paths for arr in arrays], axis=axis, ) - stacked_manifest = ChunkManifest(entries=stacked_manifest_entries) + stacked_offsets = np.stack( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + stacked_lengths = np.stack( + [arr.manifest._lengths for arr in arrays], + axis=axis, + ) + stacked_manifest = ChunkManifest.from_arrays( + paths=stacked_paths, + offsets=stacked_offsets, + lengths=stacked_lengths, + ) # chunk size has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks From b45b1604942361f7b01f0389351687029614b8ce Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 16:55:21 +0000 Subject: [PATCH 13/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- virtualizarr/tests/test_kerchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 71c8530d..c5a6ec37 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -4,7 +4,7 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.kerchunk import _automatically_determine_filetype, FileType +from virtualizarr.kerchunk import FileType, _automatically_determine_filetype from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs From 7410a6669344050ef100241587d38596243e7be8 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 15:05:52 -0400 Subject: [PATCH 14/32] remove manifest-level tests of concatenation --- .../tests/test_manifests/test_manifest.py | 61 ------------------- 1 file changed, 61 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 93f3ce45..368e2aa6 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from virtualizarr.manifests import ChunkManifest @@ -83,66 +82,6 @@ def test_equals(self): assert manifest1 != manifest2 -# TODO could we use hypothesis to test this? -# Perhaps by testing the property that splitting along a dimension then concatenating the pieces along that dimension should recreate the original manifest? -class TestCombineManifests: - def test_concat(self): - manifest1 = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - } - ) - manifest2 = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - axis = 1 - expected = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - - result_manifest = np.concatenate( - [manifest1.entries, manifest2.entries], axis=axis - ) - result = ChunkManifest(entries=result_manifest) - assert result.dict() == expected.dict() - - def test_stack(self): - manifest1 = ChunkManifest.from_dict( - { - "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - } - ) - manifest2 = ChunkManifest.from_dict( - { - "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - axis = 1 - expected = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - - result_manifest = np.stack([manifest1.entries, manifest2.entries], axis=axis) - result = ChunkManifest(entries=result_manifest) - assert result.dict() == expected.dict() - - @pytest.mark.skip(reason="Not implemented") class TestSerializeManifest: def test_serialize_manifest_to_zarr(self): ... 
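
A note on the surrounding patches: PATCH 04 and PATCH 14 remove the manifest-level concat_manifests/stack_manifests helpers and their tests because concatenation is now exercised at the ManifestArray level, where np.concatenate and np.stack dispatch to the implementations in virtualizarr/manifests/array_api.py that operate on the wrapped paths/offsets/lengths numpy arrays (PATCH 12). What follows is a minimal usage sketch of that path, not part of the patch series itself: the ZArray parameters are copied from the create_manifestarray fixture in PATCH 15, while the shapes, chunk sizes, paths and byte ranges are illustrative assumptions rather than values taken from the patches.

import numpy as np

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import ZArray

# Build a small 2D ManifestArray by hand (paths, offsets and lengths are illustrative only).
manifest = ChunkManifest.from_dict(
    {
        "0.0": {"path": "foo.nc", "offset": 100, "length": 100},
        "0.1": {"path": "foo.nc", "offset": 200, "length": 100},
    }
)
zarray = ZArray(
    shape=(5, 20),
    chunks=(5, 10),
    dtype=np.dtype("float32"),
    compressor="zlib",
    fill_value=0.0,
    filters=None,
    order="C",
    zarr_format=2,
)
marr = ManifestArray(zarray=zarray, chunkmanifest=manifest)

# np.concatenate dispatches to concatenate() in array_api.py, which concatenates the
# manifest's internal paths/offsets/lengths arrays instead of rewriting chunk-key strings.
doubled = np.concatenate([marr, marr], axis=1)
assert doubled.shape == (5, 40)
assert doubled.manifest.dict()["0.2"] == {"path": "foo.nc", "offset": 100, "length": 100}
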
From e1e8bf7f659f83cdcaaa31ef7bdf71ea9a535b50 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:18:51 -0400 Subject: [PATCH 15/32] generalized create_manifestarray fixture --- virtualizarr/tests/__init__.py | 44 +++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 291188a5..2e8730fb 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -1,7 +1,11 @@ +import itertools +from typing import Union + import numpy as np from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray +from virtualizarr.manifests.manifest import join +from virtualizarr.zarr import ZArray, ceildiv def create_manifestarray( @@ -9,6 +13,8 @@ def create_manifestarray( ) -> ManifestArray: """ Create an example ManifestArray with sensible defaults. + + The manifest is populated with a (somewhat) unique path, offset, and length for each key. """ zarray = ZArray( @@ -22,14 +28,36 @@ def create_manifestarray( zarr_format=2, ) - if shape != (): - raise NotImplementedError( - "Only generation of array representing a single scalar currently supported" - ) + chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in zip(shape, chunks) + ) + + # create every possible combination of keys + all_possible_combos = itertools.product( + *[range(length) for length in chunk_grid_shape] + ) - # TODO generalize this - chunkmanifest = ChunkManifest( - entries={"0": ChunkEntry(path="scalar.nc", offset=6144, length=48)} + chunkmanifest = ChunkManifest.from_dict( + {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} ) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) + + +def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, Union[str, int]]: + """Generate a (somewhat) unique manifest entry from a given chunk key""" + entry = { + "path": f"file.{join(ind)}.nc", + "offset": offset_from_chunk_key(ind), + "length": length_from_chunk_key(ind), + } + return entry + + +def offset_from_chunk_key(ind: tuple[int, ...]) -> int: + return sum(ind) * 10 + + +def length_from_chunk_key(ind: tuple[int, ...]) -> int: + return sum(ind) + 5 From 7e97e74e5260fb312d49d86d914e5553d3d4f742 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:19:06 -0400 Subject: [PATCH 16/32] added tests of broadcasting --- .../tests/test_manifests/test_array.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 7def676b..0e890066 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -124,13 +124,37 @@ def test_partly_equals(self): ... 
class TestBroadcast: + def test_broadcast_existing_axis(self): + marr = create_manifestarray(shape=(1, 2), chunks=(1, 2)) + expanded = np.broadcast_to(marr, shape=(3, 2)) + assert expanded.shape == (3, 2) + assert expanded.chunks == (1, 2) + assert expanded.manifest.dict() == { + "0.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + "1.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + "2.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + } + + def test_broadcast_new_axis(self): + marr = create_manifestarray(shape=(3,), chunks=(1,)) + expanded = np.broadcast_to(marr, shape=(1, 3)) + assert expanded.shape == (1, 3) + assert expanded.chunks == (1, 1) + assert expanded.manifest.dict() == { + "0.0": {"path": "file.0.nc", "offset": 0, "length": 5}, + "0.1": {"path": "file.1.nc", "offset": 10, "length": 6}, + "0.2": {"path": "file.2.nc", "offset": 20, "length": 7}, + } + def test_broadcast_scalar(self): # regression test marr = create_manifestarray(shape=(), chunks=()) expanded = np.broadcast_to(marr, shape=(1,)) assert expanded.shape == (1,) assert expanded.chunks == (1,) - assert expanded.manifest == marr.manifest + assert expanded.manifest == { + "0": {"path": "file.0.nc", "offset": 0, "length": 5}, + } # TODO we really need some kind of fixtures to generate useful example data From 96b28419a260edd6028aa33b191e62524e34baf0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:19:43 -0400 Subject: [PATCH 17/32] made basic broadcasting tests pass by calling np.broadcast_to on underlying numpy arrays --- virtualizarr/manifests/array_api.py | 80 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 7131a314..1756f3f8 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,9 +1,10 @@ import itertools -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Tuple, Union, cast +from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Tuple, Union import numpy as np -from ..zarr import Codec, ZArray +from virtualizarr.zarr import Codec, ZArray, ceildiv + from .manifest import ChunkManifest if TYPE_CHECKING: @@ -281,39 +282,54 @@ def expand_dims(x: "ManifestArray", /, *, axis: int = 0) -> "ManifestArray": @implements(np.broadcast_to) def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArray": """ - Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + Broadcasts a ManifestArray to a specified shape, by either adjusting chunk keys or copying chunk manifest entries. 
""" - if len(x.shape) > len(shape): - raise ValueError("input operand has more dimensions than allowed") - - # numpy broadcasting algorithm requires us to start by comparing the length of the final axes and work backwards - result = x - for axis, d, d_requested in itertools.zip_longest( - reversed(range(len(shape))), reversed(x.shape), reversed(shape), fillvalue=None - ): - # len(shape) check above ensures this can't be type None - d_requested = cast(int, d_requested) - - if d == d_requested: - pass - elif d is None: - if result.shape == (): - # scalars are a special case because their manifests already have a chunk key with one dimension - # see https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - result = _broadcast_scalar(result, new_axis_length=d_requested) - else: - # stack same array upon itself d_requested number of times, which inserts a new axis at axis=0 - result = stack([result] * d_requested, axis=0) - elif d == 1: - # concatenate same array upon itself d_requested number of times along existing axis - result = concatenate([result] * d_requested, axis=axis) - else: - raise ValueError( - f"Array with shape {x.shape} cannot be broadcast to shape {shape}" - ) + from .array import ManifestArray + + new_chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in itertools.zip_longest( + shape, x.chunks, fillvalue=1 + ) + ) + + # do broadcasting of entries in manifest + broadcast_paths = np.broadcast_to(x.manifest._paths, shape=new_chunk_grid_shape) + broadcast_offsets = np.broadcast_to( + x.manifest._offsets, + shape=new_chunk_grid_shape, + ) + broadcast_lengths = np.broadcast_to( + x.manifest._lengths, + shape=new_chunk_grid_shape, + ) + broadcast_manifest = ChunkManifest.from_arrays( + paths=broadcast_paths, + offsets=broadcast_offsets, + lengths=broadcast_lengths, + ) + + new_shape = np.broadcast_shapes(x.shape, shape) + new_chunks = tuple( + ceildiv(axis_length, chunk_grid_length) + for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) + ) + + # refactor to use a new ZArray.copy(x, shape=..., chunks=..) constructor method + new_zarray = ZArray( + chunks=new_chunks, + compressor=x.zarray.compressor, + dtype=x.dtype, + fill_value=x.zarray.fill_value, + filters=x.zarray.filters, + shape=new_shape, + # TODO presumably these things should be checked for consistency across arrays too? 
+ order=x.zarray.order, + zarr_format=x.zarray.zarr_format, + ) - return result + return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray": From 00c1757bb1ee9f3c4f0da549734274e3cc8e1290 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:27:45 -0400 Subject: [PATCH 18/32] generalize fixture for creating scalar ManifestArrays --- virtualizarr/tests/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 2e8730fb..7f11a409 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -33,14 +33,16 @@ def create_manifestarray( for axis_length, chunk_length in zip(shape, chunks) ) - # create every possible combination of keys - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - - chunkmanifest = ChunkManifest.from_dict( - {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} - ) + if chunk_grid_shape == (): + d = {"0": entry_from_chunk_key((0,))} + else: + # create every possible combination of keys + all_possible_combos = itertools.product( + *[range(length) for length in chunk_grid_shape] + ) + d = {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} + + chunkmanifest = ChunkManifest.from_dict(d) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) From 06180b3c7a3c619318f43f5b5ba583a40622d20e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:28:43 -0400 Subject: [PATCH 19/32] improve regression test for expanding scalar ManifestArray --- virtualizarr/tests/test_manifests/test_array.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 0e890066..f3f59b4b 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -149,10 +149,16 @@ def test_broadcast_new_axis(self): def test_broadcast_scalar(self): # regression test marr = create_manifestarray(shape=(), chunks=()) + assert marr.shape == () + assert marr.chunks == () + assert marr.manifest.dict() == { + "0": {"path": "file.0.nc", "offset": 0, "length": 5}, + } + expanded = np.broadcast_to(marr, shape=(1,)) assert expanded.shape == (1,) assert expanded.chunks == (1,) - assert expanded.manifest == { + assert expanded.manifest.dict() == { "0": {"path": "file.0.nc", "offset": 0, "length": 5}, } From dae048b9863542f918bba5e990352e2198afd47d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:29:10 -0400 Subject: [PATCH 20/32] remove now-unneeded scalar broadcasting logic --- virtualizarr/manifests/array_api.py | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 1756f3f8..58c376bb 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -332,41 +332,6 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) -def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray": - """ - Add an axis to a scalar ManifestArray, but without adding a new axis to the keys of the chunk manifest. 
- - This is not the same as concatenation, because there is no existing axis along which to concatenate. - It's also not the same as stacking, because we don't want to insert a new axis into the chunk keys. - - Scalars are a special case because their manifests still have a chunk key with one dimension. - See https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - """ - - from .array import ManifestArray - - new_shape = (new_axis_length,) - new_chunks = (new_axis_length,) - - concatenated_manifest = concat_manifests( - [x.manifest] * new_axis_length, - axis=0, - ) - - new_zarray = ZArray( - chunks=new_chunks, - compressor=x.zarray.compressor, - dtype=x.dtype, - fill_value=x.zarray.fill_value, - filters=x.zarray.filters, - shape=new_shape, - order=x.zarray.order, - zarr_format=x.zarray.zarr_format, - ) - - return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) - - # TODO broadcast_arrays, squeeze, permute_dims From 9a72df22aacf9834f5e21a2007788500173e8680 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 12 May 2024 15:26:10 -0400 Subject: [PATCH 21/32] rewrite __eq__ method on ManifestArray --- virtualizarr/manifests/array.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 423610cb..e5d8d576 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -53,6 +53,9 @@ def __init__( f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}" ) + # TODO check that the zarray shape and chunkmanifest shape are consistent with one another + # TODO also cover the special case of scalar arrays + self._zarray = _zarray self._manifest = _chunkmanifest @@ -140,7 +143,7 @@ def __eq__( # type: ignore[override] Returns a numpy array of booleans. """ - if isinstance(other, (int, float, bool)): + if isinstance(other, (int, float, bool, np.ndarray)): # TODO what should this do when comparing against numpy arrays? 
return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) elif not isinstance(other, ManifestArray): @@ -164,9 +167,22 @@ def __eq__( # type: ignore[override] UserWarning, ) - # TODO do chunk-wise comparison - # TODO expand it into an element-wise result - return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) + # do chunk-wise comparison + equal_chunk_paths = self.manifest._paths == other.manifest._paths + equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets + equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths + + equal_chunks = ( + equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths + ) + + if not equal_chunks.all(): + # TODO expand chunk-wise comparison into an element-wise result instead of just returning all False + return np.full( + shape=self.shape, fill_value=False, dtype=np.dtype(bool) + ) + else: + raise RuntimeWarning("Should not be possible to get here") def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": """Cannot change the dtype, but needed because xarray will call this even when it's a no-op.""" From e09c18fdc6d00266c68e91ce712d36b6e067cd71 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 13 May 2024 11:40:11 -0400 Subject: [PATCH 22/32] remove unused import --- virtualizarr/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 7f11a409..17da446b 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -3,7 +3,7 @@ import numpy as np -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join from virtualizarr.zarr import ZArray, ceildiv From bb23e453b6e0d989b95fffdbd0730595c664f14c Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 9 Jun 2024 09:27:52 -0600 Subject: [PATCH 23/32] reinstate the ChunkManifest.__init__ method to accept dictionaries --- virtualizarr/manifests/array.py | 2 +- virtualizarr/manifests/manifest.py | 71 +++++++++++-------- virtualizarr/tests/__init__.py | 2 +- virtualizarr/tests/test_kerchunk.py | 10 +-- .../tests/test_manifests/test_array.py | 20 +++--- .../tests/test_manifests/test_manifest.py | 22 +++--- virtualizarr/tests/test_xarray.py | 18 ++--- virtualizarr/tests/test_zarr.py | 6 +- 8 files changed, 84 insertions(+), 67 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index e5d8d576..5d132bc2 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -50,7 +50,7 @@ def __init__( _chunkmanifest = ChunkManifest(entries=chunkmanifest) else: raise TypeError( - f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}" + f"chunkmanifest arg must be of type ChunkManifest or dict, but got type {type(chunkmanifest)}" ) # TODO check that the zarray shape and chunkmanifest shape are consistent with one another diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index cdb44098..242c1db0 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -52,9 +52,9 @@ class ChunkManifest: """ In-memory representation of a single Zarr chunk manifest. - Stores the manifest internally as numpy arrays. 
+ Stores the manifest internally as numpy arrays, so the most efficient way to create this object is via the `.from_arrays` constructor classmethod. - The manifest can be converted to or from a dictionary from looking like this + The manifest can be converted to or from a dictionary which looks like this { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -63,33 +63,49 @@ class ChunkManifest: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - using the .from_dict() and .dict() methods, so users of this class can think of the manifest as if it were a dict. + using the .__init__() and .dict() methods, so users of this class can think of the manifest as if it were a dict mapping zarr chunk keys to byte ranges. - See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287 . + (See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287.) Validation is done when this object is instatiated, and this class is immutable, - so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. + so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] - @classmethod - def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + def __init__(self, entries: dict) -> None: + """ + Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. + + Parameters + ---------- + entries: dict + Chunk keys and byte range information, as a dictionary of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + """ + # TODO do some input validation here first? - validate_chunk_keys(chunks.keys()) + validate_chunk_keys(entries.keys()) - # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(chunks.keys()) + # TODO should we actually optionally pass chunk grid shape in, + # in case there are not enough chunks to give correct idea of full shape? 
+ shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks paths = np.empty(shape=shape, dtype=np.dtypes.StringDType) offsets = np.empty(shape=shape, dtype=np.dtype("int32")) lengths = np.empty(shape=shape, dtype=np.dtype("int32")) - # populate the array - for key, entry in chunks.items(): + # populate the arrays + for key, entry in entries.items(): try: path, offset, length = entry.values() entry = ChunkEntry(path=path, offset=offset, length=length) @@ -105,12 +121,9 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": offsets[split_key] = entry.offset lengths[split_key] = entry.length - obj = object.__new__(cls) - obj._paths = paths - obj._offsets = offsets - obj._lengths = lengths - - return obj + self._paths = paths + self._offsets = offsets + self._lengths = lengths @classmethod def from_arrays( @@ -205,14 +218,18 @@ def __getitem__(self, key: ChunkKey) -> ChunkEntry: return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: - return iter(self._paths.keys()) + # TODO make this work for numpy arrays + raise NotImplementedError() + # return iter(self._paths.keys()) def __len__(self) -> int: return self._paths.size def dict(self) -> ChunkDict: """ - Converts the entire manifest to a nested dictionary, of the form + Convert the entire manifest to a nested dictionary. + + The returned dict will be of the form { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -221,7 +238,7 @@ def dict(self) -> ChunkDict: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary._ + Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary. 
""" coord_vectors = np.mgrid[ @@ -255,17 +272,15 @@ def __eq__(self, other: Any) -> bool: def from_zarr_json(cls, filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" with open(filepath, "r") as manifest_file: - entries_dict = json.load(manifest_file) + entries = json.load(manifest_file) - entries = { - cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items() - } return cls(entries=entries) def to_zarr_json(self, filepath: str) -> None: - """Write a ChunkManifest to a Zarr manifest.json file.""" + """Write the manifest to a Zarr manifest.json file.""" + entries = self.dict() with open(filepath, "w") as json_file: - json.dump(self.dict(), json_file, indent=4, separators=(", ", ": ")) + json.dump(entries, json_file, indent=4, separators=(", ", ": ")) @classmethod def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": @@ -273,7 +288,7 @@ def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest.from_dict(cast(ChunkDict, chunkentries)) + return ChunkManifest(entries=cast(ChunkDict, chunkentries)) def split(key: ChunkKey) -> Tuple[int, ...]: diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 17da446b..13ee63ab 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -42,7 +42,7 @@ def create_manifestarray( ) d = {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} - chunkmanifest = ChunkManifest.from_dict(d) + chunkmanifest = ChunkManifest(entries=d) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index c5a6ec37..f3ff312f 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -67,8 +67,8 @@ def test_dataset_from_df_refs_with_filters(): class TestAccessor: def test_accessor_to_kerchunk_dict(self): - manifest = ChunkManifest.from_dict( - {"0.0": dict(path="test.nc", offset=6144, length=48)} + manifest = ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} ) arr = ManifestArray( chunkmanifest=manifest, @@ -98,8 +98,8 @@ def test_accessor_to_kerchunk_dict(self): assert result_ds_refs == expected_ds_refs def test_accessor_to_kerchunk_json(self, tmp_path): - manifest = ChunkManifest.from_dict( - {"0.0": dict(path="test.nc", offset=6144, length=48)} + manifest = ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} ) arr = ManifestArray( chunkmanifest=manifest, @@ -140,7 +140,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) marr = ManifestArray( zarray=dict( shape=(2, 4), diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index f3f59b4b..d847bb9b 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -14,7 +14,7 @@ def test_create_manifestarray(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = 
ChunkManifest(entries=chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -69,7 +69,7 @@ def test_equals(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -108,14 +108,14 @@ def test_not_equal_chunk_entries(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) assert not (marr1 == marr2).all() @@ -183,14 +183,14 @@ def test_concat(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.concatenate([marr1, marr2], axis=1) @@ -228,14 +228,14 @@ def test_stack(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.stack([marr1, marr2], axis=1) @@ -271,11 +271,11 @@ def test_refuse_combine(): chunks_dict1 = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } - chunkmanifest1 = ChunkManifest.from_dict(chunks_dict1) + chunkmanifest1 = ChunkManifest(entries=chunks_dict1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, } - chunkmanifest2 = ChunkManifest.from_dict(chunks_dict2) + chunkmanifest2 = ChunkManifest(entries=chunks_dict2) marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunkmanifest1) zarray_wrong_compressor = zarray_common.copy() diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 368e2aa6..acf8ac42 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -8,7 +8,7 @@ def test_create_manifest(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks chunks = { @@ -17,7 +17,7 @@ def 
test_create_manifest(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks def test_invalid_chunk_entries(self): @@ -25,7 +25,7 @@ def test_invalid_chunk_entries(self): "0.0.0": {"path": "s3://bucket/foo.nc"}, } with pytest.raises(ValueError, match="must be of the form"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) chunks = { "0.0.0": { @@ -35,21 +35,21 @@ def test_invalid_chunk_entries(self): }, } with pytest.raises(ValueError, match="must be of the form"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) def test_invalid_chunk_keys(self): chunks = { "0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="Invalid format for chunk key: '0.0.'"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) chunks = { "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, "0": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, } with pytest.raises(ValueError, match="Inconsistent number of dimensions"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) class TestProperties: @@ -60,21 +60,21 @@ def test_chunk_grid_info(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.ndim_chunk_grid == 3 assert manifest.shape_chunk_grid == (1, 2, 2) class TestEquals: def test_equals(self): - manifest1 = ChunkManifest.from_dict( - { + manifest1 = ChunkManifest( + entries={ "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest.from_dict( - { + manifest2 = ChunkManifest( + entries={ "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index a2eb9d72..a13977ff 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -29,7 +29,7 @@ def test_wrapping(): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) marr = ManifestArray(zarray=zarray, chunkmanifest=manifest) ds = xr.Dataset({"a": (["x", "y"], marr)}) @@ -59,7 +59,7 @@ def test_equals(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -71,7 +71,7 @@ def test_equals(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest3 = ChunkManifest.from_dict(chunks_dict3) + manifest3 = ChunkManifest(entries=chunks_dict3) marr3 = ManifestArray(zarray=zarray, chunkmanifest=manifest3) ds3 = xr.Dataset({"a": (["x", "y"], marr3)}) assert not ds1.equals(ds3) @@ -96,7 +96,7 @@ def test_concat_along_existing_dim(self): "0.0": {"path": 
"foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -104,7 +104,7 @@ def test_concat_along_existing_dim(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) ds2 = xr.Dataset({"a": (["x", "y"], marr2)}) @@ -143,7 +143,7 @@ def test_concat_along_new_dim(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -151,7 +151,7 @@ def test_concat_along_new_dim(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) ds2 = xr.Dataset({"a": (["x", "y"], marr2)}) @@ -193,7 +193,7 @@ def test_concat_dim_coords_along_existing_dim(self): "0": {"path": "foo.nc", "offset": 100, "length": 100}, "1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) coords = xr.Coordinates({"t": (["t"], marr1)}, indexes={}) ds1 = xr.Dataset(coords=coords) @@ -202,7 +202,7 @@ def test_concat_dim_coords_along_existing_dim(self): "0": {"path": "foo.nc", "offset": 300, "length": 100}, "1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) coords = xr.Coordinates({"t": (["t"], marr2)}, indexes={}) ds2 = xr.Dataset(coords=coords) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 71bec981..866fcc92 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -3,12 +3,14 @@ import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset -from virtualizarr.manifests.manifest import ChunkEntry +from virtualizarr.manifests.manifest import ChunkManifest def test_zarr_v3_roundtrip(tmpdir): arr = ManifestArray( - chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + chunkmanifest=ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + ), zarray=dict( shape=(2, 3), dtype=np.dtype(" Date: Sun, 9 Jun 2024 09:57:58 -0600 Subject: [PATCH 24/32] hopefully fix broadcasting bug --- virtualizarr/manifests/array_api.py | 49 ++++++++++------------------- virtualizarr/zarr.py | 25 +++++++++++++++ 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 58c376bb..10561f3c 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -3,7 +3,7 @@ import numpy as np -from virtualizarr.zarr 
import Codec, ZArray, ceildiv +from virtualizarr.zarr import Codec, ceildiv from .manifest import ChunkManifest @@ -144,16 +144,8 @@ def concatenate( lengths=concatenated_lengths, ) - new_zarray = ZArray( - chunks=first_arr.chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + new_zarray = first_arr.zarray.replace( + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) @@ -248,16 +240,9 @@ def stack( new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - new_zarray = ZArray( - chunks=new_chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + new_zarray = first_arr.zarray.replace( + chunks=tuple(new_chunks), + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray) @@ -287,10 +272,16 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra from .array import ManifestArray + # iterate from last axis to first here so that the fillvalue of 1 is inserted at the start of the shape + # i.e. to follow numpy's broadcasting rules new_chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in itertools.zip_longest( - shape, x.chunks, fillvalue=1 + reversed( + [ + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in itertools.zip_longest( + shape[::-1], x.chunks[::-1], fillvalue=1 + ) + ] ) ) @@ -316,17 +307,9 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) ) - # refactor to use a new ZArray.copy(x, shape=..., chunks=..) constructor method - new_zarray = ZArray( + new_zarray = x.zarray.replace( chunks=new_chunks, - compressor=x.zarray.compressor, - dtype=x.dtype, - fill_value=x.zarray.fill_value, - filters=x.zarray.filters, shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=x.zarray.order, - zarr_format=x.zarray.zarr_format, ) return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e20561e6..bc703df9 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -99,6 +99,31 @@ def dict(self) -> dict[str, Any]: def to_kerchunk_json(self) -> str: return ujson.dumps(self.dict()) + def replace( + self, + chunks: Optional[tuple[int, ...]] = None, + compressor: Optional[str] = None, + dtype: Optional[np.dtype] = None, + fill_value: Optional[float] = None, # float or int? + filters: Optional[List[Dict]] = None, + order: Optional[Union[Literal["C"], Literal["F"]]] = None, + shape: Optional[Tuple[int, ...]] = None, + zarr_format: Optional[Union[Literal[2], Literal[3]]] = None, + ) -> "ZArray": + """ + Convenience method to create a new ZArray from an existing one by altering only certain attributes. 
+ """ + return ZArray( + chunks=chunks if chunks is not None else self.chunks, + compressor=compressor if compressor is not None else self.compressor, + dtype=dtype if dtype is not None else self.dtype, + fill_value=fill_value if fill_value is not None else self.fill_value, + filters=filters if filters is not None else self.filters, + shape=shape if shape is not None else self.shape, + order=order if order is not None else self.order, + zarr_format=zarr_format if zarr_format is not None else self.zarr_format, + ) + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' Date: Thu, 13 Jun 2024 10:59:38 -0400 Subject: [PATCH 25/32] add backwards compatibility for pre-numpy2.0 --- virtualizarr/kerchunk.py | 4 ++-- virtualizarr/manifests/manifest.py | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index f0aecda4..ea85a9d0 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -232,13 +232,13 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). """ - from virtualizarr.manifests import ChunkEntry, ManifestArray + from virtualizarr.manifests import ManifestArray if isinstance(var.data, ManifestArray): marr = var.data arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry['path'], entry['offset'], entry['length']] + str(chunk_key): [entry["path"], entry["offset"], entry["length"]] for chunk_key, entry in marr.manifest.dict().items() } diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 2eee6b0e..86cb9f64 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,6 +1,7 @@ import json import re from collections.abc import Iterable, Iterator +from importlib.metadata import version from typing import Any, NewType, Tuple, Union, cast import numpy as np @@ -17,6 +18,17 @@ ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) +# This is a hard-coded length as its the only option for backwards-compatibility +# The actual choice of 32-bit length is arbitrary +# TODO remove once all dependencies support numpy 2.0 +ValidStringDTypes = [np.dtype("= "2.0.0": + from numpy.dtypes import StringDType # type: ignore[attr-defined] + + ValidStringDTypes.append(StringDType()) +# TODO get type hinting to work here +T_ValidStringDTypes = Any + class ChunkEntry(BaseModel): """ @@ -70,7 +82,7 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] + _paths: np.ndarray[Any, T_ValidStringDTypes] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -98,8 +110,14 @@ def __init__(self, entries: dict) -> None: # in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(entries.keys()) + if version("numpy") < "2.0.0": + # TODO remove this backwards-compatibility branch once all dependencies support numpy 2.0 + string_dtype = np.dtype(" None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + paths: np.ndarray[Any, np.dtype[T_ValidStringDTypes]], offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -157,9 +175,9 @@ def from_arrays( ) # check dtypes - if paths.dtype != np.dtypes.StringDType(): + if paths.dtype not in ValidStringDTypes: raise ValueError( - f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" + f"paths array must have a numpy string dtype (i.e. one of {ValidStringDTypes}), but got dtype {paths.dtype}" ) if offsets.dtype != np.dtype("int32"): raise ValueError( From 8d2dcd5e452d192b1e931f037c624df8df9fabb2 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 17 Jun 2024 18:36:30 -0400 Subject: [PATCH 26/32] depend on numpy>=2.0.0 --- pyproject.toml | 4 ++-- virtualizarr/manifests/manifest.py | 30 ++++++------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 79b9d221..3dffef2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,11 +21,11 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.5.0", + "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", "pydantic", - "numpy>=2.0.0rc1", + "numpy>=2.0.0", "ujson", "packaging", "universal-pathlib", diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 86cb9f64..9cb25177 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,7 +1,6 @@ import json import re from collections.abc import Iterable, Iterator -from importlib.metadata import version from typing import Any, NewType, Tuple, Union, cast import numpy as np @@ -18,17 +17,6 @@ ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) -# This is a hard-coded length as its the only option for backwards-compatibility -# The actual choice of 32-bit length is arbitrary -# TODO remove once all dependencies support numpy 2.0 -ValidStringDTypes = [np.dtype("= "2.0.0": - from numpy.dtypes import StringDType # type: ignore[attr-defined] - - ValidStringDTypes.append(StringDType()) -# TODO get type hinting to work here -T_ValidStringDTypes = Any - class ChunkEntry(BaseModel): """ @@ -78,11 +66,11 @@ class ChunkManifest: (See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287.) - Validation is done when this object is instatiated, and this class is immutable, + Validation is done when this object is instantiated, and this class is immutable, so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - _paths: np.ndarray[Any, T_ValidStringDTypes] + _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -110,14 +98,8 @@ def __init__(self, entries: dict) -> None: # in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(entries.keys()) - if version("numpy") < "2.0.0": - # TODO remove this backwards-compatibility branch once all dependencies support numpy 2.0 - string_dtype = np.dtype(" None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[T_ValidStringDTypes]], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -175,9 +157,9 @@ def from_arrays( ) # check dtypes - if paths.dtype not in ValidStringDTypes: + if paths.dtype != np.dtypes.StringDType(): raise ValueError( - f"paths array must have a numpy string dtype (i.e. one of {ValidStringDTypes}), but got dtype {paths.dtype}" + f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" ) if offsets.dtype != np.dtype("int32"): raise ValueError( From 1e62807ff0036ebbde988c496c0f444183c28f90 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 00:38:59 -0400 Subject: [PATCH 27/32] rewrote broadcast_to shape logic --- virtualizarr/manifests/array_api.py | 67 +++++++++++++++++------------ 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 41e8a3e0..0ecdc023 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,4 +1,3 @@ -import itertools from typing import TYPE_CHECKING, Callable, Iterable import numpy as np @@ -144,6 +143,7 @@ def concatenate( lengths=concatenated_lengths, ) + # chunk shape has not changed, there are just now more chunks along the concatenation axis new_zarray = first_arr.zarray.replace( shape=tuple(new_shape), ) @@ -235,7 +235,7 @@ def stack( lengths=stacked_lengths, ) - # chunk size has changed because a length-1 axis has been inserted + # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) new_chunks.insert(axis, 1) @@ -272,47 +272,60 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra from .array import ManifestArray - # iterate from last axis to first here so that the fillvalue of 1 is inserted at the start of the shape - # i.e. 
to follow numpy's broadcasting rules - new_chunk_grid_shape = tuple( - reversed( - [ - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in itertools.zip_longest( - shape[::-1], x.chunks[::-1], fillvalue=1 - ) - ] + new_shape = shape + + # check its actually possible to broadcast to this new shape + mutually_broadcastable_shape = np.broadcast_shapes(x.shape, new_shape) + if mutually_broadcastable_shape != new_shape: + # we're not trying to broadcast both shapes to a third shape + raise ValueError( + f"array of shape {x.shape} cannot be broadcast to shape {new_shape}" ) + + # new chunk_shape is old chunk_shape with singleton dimensions pre-pended + # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) + old_chunk_shape = x.chunks + new_chunk_shape = _prepend_singleton_dimensions( + old_chunk_shape, ndim=len(new_shape) + ) + + # find new chunk grid shape by dividing new array shape by new chunk shape + new_chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in zip(new_shape, new_chunk_shape) ) # do broadcasting of entries in manifest - broadcast_paths = np.broadcast_to(x.manifest._paths, shape=new_chunk_grid_shape) - broadcast_offsets = np.broadcast_to( + broadcasted_paths = np.broadcast_to( + x.manifest._paths, + shape=new_chunk_grid_shape, + ) + broadcasted_offsets = np.broadcast_to( x.manifest._offsets, shape=new_chunk_grid_shape, ) - broadcast_lengths = np.broadcast_to( + broadcasted_lengths = np.broadcast_to( x.manifest._lengths, shape=new_chunk_grid_shape, ) - broadcast_manifest = ChunkManifest.from_arrays( - paths=broadcast_paths, - offsets=broadcast_offsets, - lengths=broadcast_lengths, - ) - - new_shape = np.broadcast_shapes(x.shape, shape) - new_chunks = tuple( - ceildiv(axis_length, chunk_grid_length) - for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) + broadcasted_manifest = ChunkManifest.from_arrays( + paths=broadcasted_paths, + offsets=broadcasted_offsets, + lengths=broadcasted_lengths, ) new_zarray = x.zarray.replace( - chunks=new_chunks, + chunks=new_chunk_shape, shape=new_shape, ) - return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) + return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray) + + +def _prepend_singleton_dimensions(shape: tuple[int, ...], ndim: int) -> tuple[int, ...]: + """Prepend as many new length-1 axes to shape as necessary such that the result has ndim number of axes.""" + n_prepended_dims = ndim - len(shape) + return tuple([1] * n_prepended_dims + list(shape)) # TODO broadcast_arrays, squeeze, permute_dims From dc824bb8bce87244ac400b89f7cdfe91cb3bd0dc Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 00:54:04 -0400 Subject: [PATCH 28/32] remove faulty hypothesis strategies --- .../tests/test_manifests/test_array.py | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index eb3a3df5..459e60be 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -1,8 +1,5 @@ -import hypothesis.extra.numpy as npst -import hypothesis.strategies as st import numpy as np import pytest -from hypothesis import given from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import create_manifestarray @@ -165,31 +162,43 @@ def 
test_broadcast_scalar(self): "0": {"path": "file.0.nc", "offset": 0, "length": 5}, } - @given(st.data()) - def test_broadcast_any_shape(self, data): - # Generate arbitary ManifestArray with arbitrary chunk shape - original_shape = data.draw(npst.array_shapes()) - arr_ndim = len(original_shape) - chunk_shape = data.draw( - npst.array_shapes( - min_dims=arr_ndim, max_dims=arr_ndim, max_side=max(original_shape) - ) - ) - marr = create_manifestarray(shape=original_shape, chunks=chunk_shape) - - # generate arbitrary broadcastable shape to broadcast to - target_broadcast_shape = data.draw(npst.broadcastable_shapes(marr.shape)) + @pytest.mark.parametrize( + "shape, chunks, target_shape", + [ + ((1,), (1,), ()), + ((2,), (1,), (1,)), + ((3,), (2,), (5, 4, 4)), + ((3, 2), (2, 2), (2, 3, 4)), + ], + ) + def test_raise_on_invalid_broadcast_shapes(self, shape, chunks, target_shape): + marr = create_manifestarray(shape=shape, chunks=chunks) + with pytest.raises(ValueError): + np.broadcast_to(marr, shape=target_shape) + + # TODO replace this parametrization with hypothesis strategies + @pytest.mark.parametrize( + "shape, chunks, target_shape", + [ + ((1,), (1,), (3,)), + ((2,), (1,), (2,)), + ((3,), (2,), (5, 4, 3)), + ((3, 1), (2, 1), (2, 3, 4)), + ], + ) + def test_broadcast_any_shape(self, shape, chunks, target_shape): + marr = create_manifestarray(shape=shape, chunks=chunks) # do the broadcasting - broadcasted_marr = np.broadcast_to(marr, shape=target_broadcast_shape) - broadcasted_chunk_shape = broadcasted_marr.chunks + broadcasted_marr = np.broadcast_to(marr, shape=target_shape) # check that the resultant shape is correct - assert broadcasted_marr.shape == target_broadcast_shape + assert broadcasted_marr.shape == target_shape # check that chunk shape has plausible ndims and lengths - assert broadcasted_chunk_shape.ndim == broadcasted_marr.ndim - for len_arr, len_chunk in zip(broadcasted_chunk_shape, broadcasted_marr): + broadcasted_chunk_shape = broadcasted_marr.chunks + assert len(broadcasted_chunk_shape) == broadcasted_marr.ndim + for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape): assert len_chunk <= len_arr From a97e5d864c60d23bb6e8010431e9e746c9ae38b6 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 01:03:58 -0400 Subject: [PATCH 29/32] remove hypothesis from dependencies --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3dffef2a..b91af30c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ test = [ "fsspec", "s3fs", "fastparquet", - "hypothesis", ] From daaa33105167b9545adfa5a0baf985801227557c Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 11:04:39 -0400 Subject: [PATCH 30/32] ignore remaining mypy errors --- virtualizarr/manifests/manifest.py | 8 ++++---- virtualizarr/zarr.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 9cb25177..530ce17c 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -70,7 +70,7 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. 
""" - _paths: np.ndarray[Any, np.dtypes.StringDType] + _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -99,7 +99,7 @@ def __init__(self, entries: dict) -> None: shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks - paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) + paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] offsets = np.empty(shape=shape, dtype=np.dtype("int32")) lengths = np.empty(shape=shape, dtype=np.dtype("int32")) @@ -127,7 +127,7 @@ def __init__(self, entries: dict) -> None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -157,7 +157,7 @@ def from_arrays( ) # check dtypes - if paths.dtype != np.dtypes.StringDType(): + if paths.dtype != np.dtypes.StringDType(): # type: ignore[attr-defined] raise ValueError( f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index dbba794a..22b46ffb 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -109,7 +109,7 @@ def replace( compressor: Optional[str] = None, dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, + filters: Optional[list[dict]] = None, # type: ignore[valid-type] order: Optional[Literal["C"] | Literal["F"]] = None, shape: Optional[tuple[int, ...]] = None, zarr_format: Optional[Literal[2] | Literal[3]] = None, From 5f6c91204334e698148f8382200d61b2cc115511 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 11:07:38 -0400 Subject: [PATCH 31/32] release notes --- docs/releases.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index 2255ac10..7228ca66 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -13,6 +13,8 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Requires numpy 2.0 (for :pull:`107`). + By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -29,6 +31,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactor `ChunkManifest` class to store chunk references internally using numpy arrays. + (:pull:`107`) By `Tom Nicholas `_. - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest. (:pull:`144`) By `Tom Nicholas `_. From 7893c8803147793d88b77fa135f49537c47be8ed Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 13:13:14 -0400 Subject: [PATCH 32/32] update dependencies in CI test env --- ci/environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index d8af6bf5..0385ea5a 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,10 +7,10 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.5.0 + - xarray>=2024.6.0 - kerchunk>=0.2.5 - pydantic - - numpy + - numpy>=2.0.0 - ujson - packaging - universal_pathlib