From 0c445fd8184da71630a6813c5aa507887e02cf75 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 17 Mar 2024 16:13:54 -0400 Subject: [PATCH 01/32] change entries property to a structured array, add from_dict --- virtualizarr/manifests/manifest.py | 104 ++++++++++++++---- .../tests/test_manifests/test_manifest.py | 16 +-- 2 files changed, 90 insertions(+), 30 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 0e54394e..937d7d8f 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,9 +1,9 @@ import itertools import re -from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast +from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast import numpy as np -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict from ..types import ChunkKey @@ -14,6 +14,11 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) +ChunkDict = NewType( + "ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]] +) # just the .zattrs (for one array or for the whole store/group) + + class ChunkEntry(BaseModel): """ Information for a single chunk in the manifest. @@ -42,11 +47,20 @@ def to_kerchunk(self) -> List[Union[str, int]]: return [self.path, self.offset, self.length] +# TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 +# See https://numpy.org/neps/nep-0055-string_dtype.html +MANIFEST_STRUCTURED_ARRAY_DTYPES = np.dtype( + [("path", " Mapping[ChunkKey, ChunkEntry]: - validate_chunk_keys(list(entries.keys())) + def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + # TODO do some input validation here first? - # TODO what if pydantic adjusts anything during validation? - return entries + # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? + shape = get_chunk_grid_shape(chunks.keys()) + + # Initializing to empty implies that entries with path='' are treated as missing chunks + entries = np.empty(shape=shape, dtype=MANIFEST_STRUCTURED_ARRAY_DTYPES) + + # populate the array + for key, entry in chunks.items(): + entries[split(key)] = tuple(entry.values()) + + return ChunkManifest(entries=entries) @property def ndim_chunk_grid(self) -> int: @@ -81,7 +108,7 @@ def ndim_chunk_grid(self) -> int: Not the same as the dimension of an array backed by this chunk manifest. """ - return get_ndim_from_key(list(self.entries.keys())[0]) + return self.entries.ndim @property def shape_chunk_grid(self) -> Tuple[int, ...]: @@ -90,23 +117,56 @@ def shape_chunk_grid(self) -> Tuple[int, ...]: Not the same as the shape of an array backed by this chunk manifest. 
""" - return get_chunk_grid_shape(list(self.entries.keys())) + return self.entries.shape def __repr__(self) -> str: return f"ChunkManifest" def __getitem__(self, key: ChunkKey) -> ChunkEntry: - return self.chunks[key] + indices = split(key) + return ChunkEntry(self.entries[indices]) def __iter__(self) -> Iterator[ChunkKey]: return iter(self.chunks.keys()) def __len__(self) -> int: - return len(self.chunks) + return self.entries.size + + def dict(self) -> ChunkDict: + """ + Converts the entire manifest to a nested dictionary, of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + """ - def dict(self) -> dict[str, dict[str, Union[str, int]]]: - """Converts the entire manifest to a nested dictionary.""" - return {k: dict(entry) for k, entry in self.entries.items()} + def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: + return { + "path": entry[0], + "offset": entry[1], + "length": entry[2], + } + + # print(self.entries.dtype) + + coord_vectors = np.mgrid[ + tuple(slice(None, length) for length in self.shape_chunk_grid) + ] + # print(coord_vectors) + # print(self.entries) + + # TODO don't include entry if path='' ? + return cast( + ChunkDict, + { + join(inds): _entry_to_dict(entry.item()) + for *inds, entry in np.nditer([*coord_vectors, self.entries]) + }, + ) @staticmethod def from_zarr_json(filepath: str) -> "ChunkManifest": @@ -125,12 +185,12 @@ def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": return ChunkManifest(entries=chunkentries) -def split(key: ChunkKey) -> List[int]: - return list(int(i) for i in key.split(".")) +def split(key: ChunkKey) -> Tuple[int, ...]: + return tuple(int(i) for i in key.split(".")) -def join(inds: Iterable[int]) -> ChunkKey: - return cast(ChunkKey, ".".join(str(i) for i in inds)) +def join(inds: Iterable[Any]) -> ChunkKey: + return cast(ChunkKey, ".".join(str(i) for i in list(inds))) def get_ndim_from_key(key: str) -> int: diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 4a977837..50149692 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -9,7 +9,7 @@ def test_create_manifest(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.dict() == chunks chunks = { @@ -18,7 +18,7 @@ def test_create_manifest(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.dict() == chunks def test_invalid_chunk_entries(self): @@ -26,7 +26,7 @@ def test_invalid_chunk_entries(self): "0.0.0": {"path": "s3://bucket/foo.nc"}, } with pytest.raises(ValidationError, match="missing"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0.0": { @@ -36,21 +36,21 @@ def test_invalid_chunk_entries(self): }, } with pytest.raises(ValidationError, match="should be a valid integer"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) def 
test_invalid_chunk_keys(self): chunks = { "0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="Invalid format for chunk key: '0.0.'"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, "0": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, } with pytest.raises(ValueError, match="Inconsistent number of dimensions"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -58,13 +58,13 @@ def test_invalid_chunk_keys(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, } with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) chunks = { "1": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest(entries=chunks) + ChunkManifest.from_dict(chunks) class TestProperties: From 3bc483f04cf91bb04cb131ecb83649bbe839dfc7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 17 Mar 2024 17:05:11 -0400 Subject: [PATCH 02/32] fix validation --- virtualizarr/manifests/manifest.py | 32 ++++------- .../tests/test_manifests/test_manifest.py | 53 +++++++------------ 2 files changed, 28 insertions(+), 57 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 937d7d8f..bfe393f2 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,4 +1,3 @@ -import itertools import re from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast @@ -88,6 +87,7 @@ class ChunkManifest(BaseModel): @classmethod def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # TODO do some input validation here first? + validate_chunk_keys(chunks.keys()) # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(chunks.keys()) @@ -97,7 +97,14 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # populate the array for key, entry in chunks.items(): - entries[split(key)] = tuple(entry.values()) + try: + entries[split(key)] = tuple(entry.values()) + except (ValueError, TypeError) as e: + msg = ( + "Each chunk entry must be of the form dict(path=, offset=, length=), " + f"but got {entry}" + ) + raise ValueError(msg) from e return ChunkManifest(entries=entries) @@ -214,9 +221,6 @@ def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): f"Inconsistent number of dimensions between chunk key {key} and {first_key}: {other_ndim} vs {ndim}" ) - # Check that the keys collectively form a complete grid - check_keys_form_grid(chunk_keys) - def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: # find max chunk index along each dimension @@ -227,24 +231,6 @@ def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: return chunk_grid_shape -def check_keys_form_grid(chunk_keys: Iterable[ChunkKey]): - """Check that the chunk keys collectively form a complete grid""" - - chunk_grid_shape = get_chunk_grid_shape(chunk_keys) - - # create every possible combination - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - all_required_chunk_keys: set[ChunkKey] = set( - join(inds) for inds in all_possible_combos - ) - - # check that every possible combination is represented once in the list of chunk keys - if set(chunk_keys) != all_required_chunk_keys: - raise ValueError("Chunk keys do not form a complete grid") - - def concat_manifests(manifests: List["ChunkManifest"], axis: int) -> "ChunkManifest": """ Concatenate manifests along an existing dimension. diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 50149692..ba268472 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,5 +1,4 @@ import pytest -from pydantic import ValidationError from virtualizarr.manifests import ChunkManifest, concat_manifests, stack_manifests @@ -25,7 +24,7 @@ def test_invalid_chunk_entries(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc"}, } - with pytest.raises(ValidationError, match="missing"): + with pytest.raises(ValueError, match="must be of the form"): ChunkManifest.from_dict(chunks) chunks = { @@ -35,7 +34,7 @@ def test_invalid_chunk_entries(self): "length": 100, }, } - with pytest.raises(ValidationError, match="should be a valid integer"): + with pytest.raises(ValueError, match="must be of the form"): ChunkManifest.from_dict(chunks) def test_invalid_chunk_keys(self): @@ -52,20 +51,6 @@ def test_invalid_chunk_keys(self): with pytest.raises(ValueError, match="Inconsistent number of dimensions"): ChunkManifest.from_dict(chunks) - chunks = { - "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, - } - with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest.from_dict(chunks) - - chunks = { - "1": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - } - with pytest.raises(ValueError, match="do not form a complete grid"): - ChunkManifest.from_dict(chunks) - class TestProperties: def test_chunk_grid_info(self): @@ -75,21 +60,21 @@ def test_chunk_grid_info(self): "0.1.0": 
{"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks) + manifest = ChunkManifest.from_dict(chunks) assert manifest.ndim_chunk_grid == 3 assert manifest.shape_chunk_grid == (1, 2, 2) class TestEquals: def test_equals(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } @@ -102,21 +87,21 @@ def test_equals(self): # Perhaps by testing the property that splitting along a dimension then concatenating the pieces along that dimension should recreate the original manifest? class TestCombineManifests: def test_concat(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) axis = 1 - expected = ChunkManifest( - entries={ + expected = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, @@ -128,21 +113,21 @@ def test_concat(self): assert result.dict() == expected.dict() def test_stack(self): - manifest1 = ChunkManifest( - entries={ + manifest1 = ChunkManifest.from_dict( + { "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest( - entries={ + manifest2 = ChunkManifest.from_dict( + { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) axis = 1 - expected = ChunkManifest( - entries={ + expected = ChunkManifest.from_dict( + { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, From 20f2ded45baf020968203441e0d4ae14bea042cd Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 11:04:36 -0400 Subject: [PATCH 03/32] equals method --- virtualizarr/manifests/manifest.py | 11 ++++++----- virtualizarr/tests/test_manifests/test_manifest.py | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index bfe393f2..fb14c297 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -158,23 +158,24 @@ def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: "length": entry[2], } - # print(self.entries.dtype) - coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] - # print(coord_vectors) - # print(self.entries) - # TODO don't include entry if path='' ? return cast( ChunkDict, { join(inds): _entry_to_dict(entry.item()) for *inds, entry in np.nditer([*coord_vectors, self.entries]) + if entry.item()[0] + != "" # don't include entry if path='' (i.e. 
empty chunk) }, ) + def __eq__(self, other: Any) -> bool: + """Two manifests are equal if all of their entries are identical.""" + return (self.entries == other.entries).all() + @staticmethod def from_zarr_json(filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index ba268472..38b16f09 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -79,7 +79,6 @@ def test_equals(self): "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } ) - assert not manifest1 == manifest2 assert manifest1 != manifest2 From be8af12d285ee31fd69ee76a26a82fbd6d0f1bf4 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 12:30:06 -0400 Subject: [PATCH 04/32] re-implemented concatenation through concatenation of the wrapped structured array --- virtualizarr/manifests/__init__.py | 7 +- virtualizarr/manifests/array_api.py | 12 +-- virtualizarr/manifests/manifest.py | 86 +------------------ .../tests/test_manifests/test_array.py | 28 +++--- .../tests/test_manifests/test_manifest.py | 11 ++- 5 files changed, 34 insertions(+), 110 deletions(-) diff --git a/virtualizarr/manifests/__init__.py b/virtualizarr/manifests/__init__.py index 39f60c59..c317ed6a 100644 --- a/virtualizarr/manifests/__init__.py +++ b/virtualizarr/manifests/__init__.py @@ -2,9 +2,4 @@ # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring. from .array import ManifestArray # type: ignore # noqa -from .manifest import ( # type: ignore # noqa - ChunkEntry, - ChunkManifest, - concat_manifests, - stack_manifests, -) +from .manifest import ChunkEntry, ChunkManifest # type: ignore # noqa diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 9333d88e..4bfea632 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -4,7 +4,7 @@ import numpy as np from ..zarr import Codec, ZArray -from .manifest import concat_manifests, stack_manifests +from .manifest import ChunkManifest if TYPE_CHECKING: from .array import ManifestArray @@ -123,10 +123,11 @@ def concatenate( new_shape = list(first_shape) new_shape[axis] = new_length_along_concat_axis - concatenated_manifest = concat_manifests( - [arr.manifest for arr in arrays], + concatenated_manifest_entries = np.concatenate( + [arr.manifest.entries for arr in arrays], axis=axis, ) + concatenated_manifest = ChunkManifest(entries=concatenated_manifest_entries) new_zarray = ZArray( chunks=first_arr.chunks, @@ -206,10 +207,11 @@ def stack( new_shape = list(first_shape) new_shape.insert(axis, length_along_new_stacked_axis) - stacked_manifest = stack_manifests( - [arr.manifest for arr in arrays], + stacked_manifest_entries = np.stack( + [arr.manifest.entries for arr in arrays], axis=axis, ) + stacked_manifest = ChunkManifest(entries=stacked_manifest_entries) # chunk size has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index fb14c297..456a1333 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,5 +1,5 @@ import re -from typing import Any, Iterable, Iterator, List, Mapping, NewType, Tuple, Union, cast +from typing import Any, Iterable, Iterator, List, NewType, Tuple, Union, cast import numpy 
as np from pydantic import BaseModel, ConfigDict @@ -13,9 +13,7 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) -ChunkDict = NewType( - "ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]] -) # just the .zattrs (for one array or for the whole store/group) +ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) class ChunkEntry(BaseModel): @@ -190,7 +188,7 @@ def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": chunkentries = { k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest(entries=chunkentries) + return ChunkManifest.from_dict(chunkentries) def split(key: ChunkKey) -> Tuple[int, ...]: @@ -230,81 +228,3 @@ def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> Tuple[int, ...]: max(indices_along_one_dim) + 1 for indices_along_one_dim in zipped_indices ) return chunk_grid_shape - - -def concat_manifests(manifests: List["ChunkManifest"], axis: int) -> "ChunkManifest": - """ - Concatenate manifests along an existing dimension. - - This only requires adjusting one index of chunk keys along a single dimension. - - Note axis is not expected to be negative. - """ - if len(manifests) == 1: - return manifests[0] - - chunk_grid_shapes = [manifest.shape_chunk_grid for manifest in manifests] - lengths_along_concat_dim = [shape[axis] for shape in chunk_grid_shapes] - - # Note we do not need to change the keys of the first manifest - chunk_index_offsets = np.cumsum(lengths_along_concat_dim)[:-1] - new_entries = [ - adjust_chunk_keys(manifest.entries, axis, offset) - for manifest, offset in zip(manifests[1:], chunk_index_offsets) - ] - all_entries = [manifests[0].entries] + new_entries - merged_entries = dict((k, v) for d in all_entries for k, v in d.items()) - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def adjust_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, offset: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have been offset along one axis.""" - - def offset_key(key: ChunkKey, axis: int, offset: int) -> ChunkKey: - inds = split(key) - inds[axis] += offset - return join(inds) - - return {offset_key(k, axis, offset): v for k, v in entries.items()} - - -def stack_manifests(manifests: List[ChunkManifest], axis: int) -> "ChunkManifest": - """ - Stack manifests along a new dimension. - - This only requires inserting one index into all chunk keys to add a new dimension. - - Note axis is not expected to be negative. 
- """ - - # even if there is only one manifest it still needs a new axis inserted - chunk_indexes_along_new_dim = range(len(manifests)) - new_entries = [ - insert_new_axis_into_chunk_keys(manifest.entries, axis, new_index_value) - for manifest, new_index_value in zip(manifests, chunk_indexes_along_new_dim) - ] - merged_entries = dict((k, v) for d in new_entries for k, v in d.items()) - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def insert_new_axis_into_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, new_index_value: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have a new axis inserted, with a given value.""" - - def insert_axis(key: ChunkKey, new_axis: int, index_value: int) -> ChunkKey: - inds = split(key) - inds.insert(new_axis, index_value) - return join(inds) - - return {insert_axis(k, axis, new_index_value): v for k, v in entries.items()} diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index aa10d91c..b587f88a 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -13,7 +13,7 @@ def test_create_manifestarray(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -38,7 +38,7 @@ def test_create_invalid_manifestarray(self): chunks_dict = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -79,7 +79,7 @@ def test_equals(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest(entries=chunks_dict) + manifest = ChunkManifest.from_dict(chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -118,14 +118,14 @@ def test_not_equal_chunk_entries(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) assert not (marr1 == marr2).all() @@ -154,14 +154,14 @@ def test_concat(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, 
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.concatenate([marr1, marr2], axis=1) @@ -199,14 +199,14 @@ def test_stack(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest(entries=chunks_dict1) + manifest1 = ChunkManifest.from_dict(chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest(entries=chunks_dict2) + manifest2 = ChunkManifest.from_dict(chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.stack([marr1, marr2], axis=1) @@ -242,28 +242,30 @@ def test_refuse_combine(): chunks_dict1 = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } + chunkmanifest1 = ChunkManifest.from_dict(chunks_dict1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, } - marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunks_dict1) + chunkmanifest2 = ChunkManifest.from_dict(chunks_dict2) + marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunkmanifest1) zarray_wrong_compressor = zarray_common.copy() zarray_wrong_compressor["compressor"] = None - marr2 = ManifestArray(zarray=zarray_wrong_compressor, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_compressor, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(NotImplementedError, match="different codecs"): func([marr1, marr2], axis=0) zarray_wrong_dtype = zarray_common.copy() zarray_wrong_dtype["dtype"] = np.dtype("int64") - marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) zarray_wrong_dtype = zarray_common.copy() zarray_wrong_dtype["dtype"] = np.dtype("int64") - marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunks_dict2) + marr2 = ManifestArray(zarray=zarray_wrong_dtype, chunkmanifest=chunkmanifest2) for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 38b16f09..062ad7ad 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,6 +1,7 @@ +import numpy as np import pytest -from virtualizarr.manifests import ChunkManifest, concat_manifests, stack_manifests +from virtualizarr.manifests import ChunkManifest class TestCreateManifest: @@ -108,7 +109,10 @@ def test_concat(self): } ) - result = concat_manifests([manifest1, manifest2], axis=axis) + result_manifest = np.concatenate( + [manifest1.entries, manifest2.entries], axis=axis + ) + result = ChunkManifest(entries=result_manifest) assert result.dict() == expected.dict() def test_stack(self): @@ -134,7 +138,8 @@ def test_stack(self): } ) - result = stack_manifests([manifest1, manifest2], axis=axis) + result_manifest = np.stack([manifest1.entries, manifest2.entries], axis=axis) + result = 
ChunkManifest(entries=result_manifest) assert result.dict() == expected.dict() From bd8ad2283b854cff7540d69a1323c00fe520327e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 12:50:09 -0400 Subject: [PATCH 05/32] fixed manifest.from_kerchunk_dict --- virtualizarr/manifests/manifest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 456a1333..1636d8eb 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -43,6 +43,9 @@ def to_kerchunk(self) -> List[Union[str, int]]: """Write out in the format that kerchunk uses for chunk entries.""" return [self.path, self.offset, self.length] + def dict(self) -> dict[str, Union[str, int]]: + return dict(path=self.path, offset=self.offset, length=self.length) + # TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 # See https://numpy.org/neps/nep-0055-string_dtype.html @@ -186,9 +189,10 @@ def to_zarr_json(self, filepath: str) -> None: @classmethod def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": chunkentries = { - k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() + cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() + for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest.from_dict(chunkentries) + return ChunkManifest.from_dict(cast(ChunkDict, chunkentries)) def split(key: ChunkKey) -> Tuple[int, ...]: From 385290dff384ef07acee6a5294a674980dfd4810 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 Mar 2024 15:19:36 -0400 Subject: [PATCH 06/32] fixed kerchunk tests --- virtualizarr/kerchunk.py | 6 +++--- virtualizarr/tests/test_kerchunk.py | 14 ++++++++++---- virtualizarr/tests/test_xarray.py | 14 +++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 48e4b5af..7525d7b2 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -167,7 +167,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable) -> KerchunkArrRefs: Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). 
""" - from virtualizarr.manifests import ManifestArray + from virtualizarr.manifests import ChunkEntry, ManifestArray marr = var.data @@ -177,8 +177,8 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable) -> KerchunkArrRefs: ) arr_refs: dict[str, Union[str, List[Union[str, int]]]] = { - str(chunk_key): chunk_entry.to_kerchunk() - for chunk_key, chunk_entry in marr.manifest.entries.items() + str(chunk_key): ChunkEntry(**chunk_entry).to_kerchunk() + for chunk_key, chunk_entry in marr.manifest.dict().items() } zarray_dict = marr.zarray.to_kerchunk_json() diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 6fbee558..d59f6391 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -3,7 +3,7 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -40,8 +40,11 @@ def test_dataset_from_kerchunk_refs(): class TestAccessor: def test_accessor_to_kerchunk_dict(self): + manifest = ChunkManifest.from_dict( + {"0.0": dict(path="test.nc", offset=6144, length=48)} + ) arr = ManifestArray( - chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + chunkmanifest=manifest, zarray=dict( shape=(2, 3), dtype=np.dtype(" Date: Thu, 9 May 2024 19:36:05 -0400 Subject: [PATCH 07/32] change private attributes to 3 numpy arrays --- virtualizarr/manifests/manifest.py | 114 +++++++++++++++++++---------- 1 file changed, 76 insertions(+), 38 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index c239fe7e..b06fb877 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -5,7 +5,7 @@ import numpy as np from pydantic import BaseModel, ConfigDict -from ..types import ChunkKey +from virtualizarr.types import ChunkKey _INTEGER = ( r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero @@ -48,21 +48,13 @@ def dict(self) -> dict[str, Union[str, int]]: return dict(path=self.path, offset=self.offset, length=self.length) -# TODO we want the path field to contain a variable-length string, but that's not available until numpy 2.0 -# See https://numpy.org/neps/nep-0055-string_dtype.html -MANIFEST_STRUCTURED_ARRAY_DTYPES = np.dtype( - [("path", " "ChunkManifest": - # TODO do some input validation here first? - validate_chunk_keys(chunks.keys()) + def from_arrays( + cls, + paths: np.ndarray[Any, np.dtypes.StringDType], + offsets: np.ndarray[Any, np.dtype("int32")], + lengths: np.ndarray[Any, np.dtype("int32")], + ) -> "ChunkManifest": + """ + Create manifest directly from numpy arrays containing the path and byte range information. - # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(chunks.keys()) + Useful if you want to avoid the memory overhead of creating an intermediate dictionary first, + as these 3 arrays are what will be used internally to store the references. 
- # Initializing to empty implies that entries with path='' are treated as missing chunks - entries = np.empty(shape=shape, dtype=MANIFEST_STRUCTURED_ARRAY_DTYPES) + Parameters + ---------- + paths: np.ndarray + offsets: np.ndarray + lengths: np.ndarray + """ - # populate the array - for key, entry in chunks.items(): - try: - entries[split(key)] = tuple(entry.values()) - except (ValueError, TypeError) as e: - msg = ( - "Each chunk entry must be of the form dict(path=, offset=, length=), " - f"but got {entry}" - ) - raise ValueError(msg) from e + # check types + if not isinstance(paths, np.ndarray): + raise TypeError(f"paths must be a numpy array, but got type {type(paths)}") + if not isinstance(offsets, np.ndarray): + raise TypeError( + f"offsets must be a numpy array, but got type {type(offsets)}" + ) + if not isinstance(lengths, np.ndarray): + raise TypeError( + f"lengths must be a numpy array, but got type {type(lengths)}" + ) - return ChunkManifest(entries=entries) + # check dtypes + if paths.dtype is not np.dtypes.StringDType: + raise ValueError( + f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" + ) + if offsets.dtype is not np.dtype("int32"): + raise ValueError( + f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" + ) + if lengths.dtype is not np.dtype("int32"): + raise ValueError( + f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" + ) + + # check shapes + shape = paths.shape + if offsets.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and offsets array do not match: {paths.shape} vs {offsets.shape}" + ) + if lengths.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and lengths array do not match: {paths.shape} vs {lengths.shape}" + ) + + obj = object.__new__(cls) + obj._paths = paths + obj._offsets = offsets + obj._lengths = lengths + + return obj @property def ndim_chunk_grid(self) -> int: @@ -119,7 +151,7 @@ def ndim_chunk_grid(self) -> int: Not the same as the dimension of an array backed by this chunk manifest. """ - return self.entries.ndim + return self._paths.ndim @property def shape_chunk_grid(self) -> Tuple[int, ...]: @@ -128,20 +160,23 @@ def shape_chunk_grid(self) -> Tuple[int, ...]: Not the same as the shape of an array backed by this chunk manifest. 
""" - return self.entries.shape + return self._paths.shape def __repr__(self) -> str: return f"ChunkManifest" def __getitem__(self, key: ChunkKey) -> ChunkEntry: indices = split(key) - return ChunkEntry(self.entries[indices]) + path = self._paths[indices] + offset = self._offsets[indices] + length = self._lengths[indices] + return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: - return iter(self.entries.keys()) + return iter(self._paths.keys()) def __len__(self) -> int: - return self.entries.size + return self._paths.size def dict(self) -> ChunkDict: """ @@ -178,7 +213,10 @@ def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: def __eq__(self, other: Any) -> bool: """Two manifests are equal if all of their entries are identical.""" - return (self.entries == other.entries).all() + paths_equal = (self.paths == other.paths).all() + offsets_equal = (self.offsets == other.offsets).all() + lengths_equal = (self.lengths == other.lengths).all() + return paths_equal and offsets_equal and lengths_equal @classmethod def from_zarr_json(cls, filepath: str) -> "ChunkManifest": From e93d3b81c3bb4d8850cb7b878d9d7a919ccbc447 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 9 May 2024 19:36:38 -0400 Subject: [PATCH 08/32] add from_arrays method --- virtualizarr/manifests/manifest.py | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index b06fb877..aaff57a0 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -80,6 +80,42 @@ class ChunkManifest(BaseModel): _offsets: np.ndarray[Any, np.dtype("int32")] _lengths: np.ndarray[Any, np.dtype("int32")] + @classmethod + def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + # TODO do some input validation here first? + validate_chunk_keys(chunks.keys()) + + # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? 
+ shape = get_chunk_grid_shape(chunks.keys()) + + # Initializing to empty implies that entries with path='' are treated as missing chunks + paths = np.empty(shape=shape, dtype=np.dtypes.StringDType) + offsets = np.empty(shape=shape, dtype=np.dtype("int32")) + lengths = np.empty(shape=shape, dtype=np.dtype("int32")) + + # populate the array + for key, entry in chunks.items(): + try: + entry = ChunkEntry(entry) + except (ValueError, TypeError) as e: + msg = ( + "Each chunk entry must be of the form dict(path=, offset=, length=), " + f"but got {entry}" + ) + raise ValueError(msg) from e + + split_key = split(key) + paths[split_key] = entry.path + offsets[split_key] = entry.offset + lengths[split_key] = entry.length + + obj = object.__new__(cls) + obj._paths = paths + obj._offsets = offsets + obj._lengths = lengths + + return obj + @classmethod def from_arrays( cls, From 39131431d2707bfc7ad8c7f4e14ee3919fcf24ee Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:10:29 -0400 Subject: [PATCH 09/32] to and from dict working again --- virtualizarr/manifests/manifest.py | 43 ++++++++++++++---------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index aaff57a0..519cfbc9 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -48,7 +48,7 @@ def dict(self) -> dict[str, Union[str, int]]: return dict(path=self.path, offset=self.offset, length=self.length) -class ChunkManifest(BaseModel): +class ChunkManifest: """ In-memory representation of a single Zarr chunk manifest. @@ -71,11 +71,6 @@ class ChunkManifest(BaseModel): so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. """ - model_config = ConfigDict( - frozen=True, - arbitrary_types_allowed=True, # so pydantic doesn't complain about the numpy array field - ) - _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype("int32")] _lengths: np.ndarray[Any, np.dtype("int32")] @@ -96,7 +91,8 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": # populate the array for key, entry in chunks.items(): try: - entry = ChunkEntry(entry) + path, offset, length = entry.values() + entry = ChunkEntry(path=path, offset=offset, length=length) except (ValueError, TypeError) as e: msg = ( "Each chunk entry must be of the form dict(path=, offset=, length=), " @@ -224,34 +220,35 @@ def dict(self) -> ChunkDict: "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - """ - def _entry_to_dict(entry: Tuple[str, int, int]) -> dict[str, Union[str, int]]: - return { - "path": entry[0], - "offset": entry[1], - "length": entry[2], - } + Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary._ + """ coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] + d = { + join(inds): dict( + path=path.item(), offset=offset.item(), length=length.item() + ) + for *inds, path, offset, length in np.nditer( + [*coord_vectors, self._paths, self._offsets, self._lengths], + flags=("refs_ok",), + ) + if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk) + } + return cast( ChunkDict, - { - join(inds): _entry_to_dict(entry.item()) - for *inds, entry in np.nditer([*coord_vectors, self.entries]) - if entry.item()[0] - != "" # don't include entry if path='' (i.e. 
empty chunk) - }, + d, ) def __eq__(self, other: Any) -> bool: """Two manifests are equal if all of their entries are identical.""" - paths_equal = (self.paths == other.paths).all() - offsets_equal = (self.offsets == other.offsets).all() - lengths_equal = (self.lengths == other.lengths).all() + paths_equal = (self._paths == other._paths).all() + offsets_equal = (self._offsets == other._offsets).all() + lengths_equal = (self._lengths == other._lengths).all() return paths_equal and offsets_equal and lengths_equal @classmethod From 6a5d9961a79f96e1be9f98d309984e4c9f4f7a12 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:39:45 -0400 Subject: [PATCH 10/32] fix dtype comparisons --- virtualizarr/manifests/manifest.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 519cfbc9..cdb44098 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -71,9 +71,9 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. """ - _paths: np.ndarray[Any, np.dtypes.StringDType] - _offsets: np.ndarray[Any, np.dtype("int32")] - _lengths: np.ndarray[Any, np.dtype("int32")] + _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] + _offsets: np.ndarray[Any, np.dtype[np.int32]] + _lengths: np.ndarray[Any, np.dtype[np.int32]] @classmethod def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": @@ -115,9 +115,9 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtypes.StringDType], - offsets: np.ndarray[Any, np.dtype("int32")], - lengths: np.ndarray[Any, np.dtype("int32")], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + offsets: np.ndarray[Any, np.dtype[np.int32]], + lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": """ Create manifest directly from numpy arrays containing the path and byte range information. 
@@ -145,15 +145,15 @@ def from_arrays( ) # check dtypes - if paths.dtype is not np.dtypes.StringDType: + if paths.dtype != np.dtypes.StringDType(): raise ValueError( f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" ) - if offsets.dtype is not np.dtype("int32"): + if offsets.dtype != np.dtype("int32"): raise ValueError( f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" ) - if lengths.dtype is not np.dtype("int32"): + if lengths.dtype != np.dtype("int32"): raise ValueError( f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" ) From 8a77a0a291c89e29d98266adb4ee91aa55b1c708 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:40:06 -0400 Subject: [PATCH 11/32] depend on numpy release candidate --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index afdde7a7..d37411bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "h5netcdf", "pydantic", - "numpy", + "numpy>=2.0.0rc1", "ujson", "packaging", ] From a95117f58545006188ffcff8eb7923a85decf0f2 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 12:51:08 -0400 Subject: [PATCH 12/32] get concatenation and stacking working --- virtualizarr/manifests/array_api.py | 38 ++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 3322ab34..7131a314 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -124,11 +124,24 @@ def concatenate( new_shape = list(first_shape) new_shape[axis] = new_length_along_concat_axis - concatenated_manifest_entries = np.concatenate( - [arr.manifest.entries for arr in arrays], + # do concatenation of entries in manifest + concatenated_paths = np.concatenate( + [arr.manifest._paths for arr in arrays], axis=axis, ) - concatenated_manifest = ChunkManifest(entries=concatenated_manifest_entries) + concatenated_offsets = np.concatenate( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + concatenated_lengths = np.concatenate( + [arr.manifest._lengths for arr in arrays], + axis=axis, + ) + concatenated_manifest = ChunkManifest.from_arrays( + paths=concatenated_paths, + offsets=concatenated_offsets, + lengths=concatenated_lengths, + ) new_zarray = ZArray( chunks=first_arr.chunks, @@ -210,11 +223,24 @@ def stack( new_shape = list(first_shape) new_shape.insert(axis, length_along_new_stacked_axis) - stacked_manifest_entries = np.stack( - [arr.manifest.entries for arr in arrays], + # do stacking of entries in manifest + stacked_paths = np.stack( + [arr.manifest._paths for arr in arrays], axis=axis, ) - stacked_manifest = ChunkManifest(entries=stacked_manifest_entries) + stacked_offsets = np.stack( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + stacked_lengths = np.stack( + [arr.manifest._lengths for arr in arrays], + axis=axis, + ) + stacked_manifest = ChunkManifest.from_arrays( + paths=stacked_paths, + offsets=stacked_offsets, + lengths=stacked_lengths, + ) # chunk size has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks From b45b1604942361f7b01f0389351687029614b8ce Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 16:55:21 +0000 Subject: [PATCH 13/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- virtualizarr/tests/test_kerchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 71c8530d..c5a6ec37 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -4,7 +4,7 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.kerchunk import _automatically_determine_filetype, FileType +from virtualizarr.kerchunk import FileType, _automatically_determine_filetype from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs From 7410a6669344050ef100241587d38596243e7be8 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 10 May 2024 15:05:52 -0400 Subject: [PATCH 14/32] remove manifest-level tests of concatenation --- .../tests/test_manifests/test_manifest.py | 61 ------------------- 1 file changed, 61 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 93f3ce45..368e2aa6 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from virtualizarr.manifests import ChunkManifest @@ -83,66 +82,6 @@ def test_equals(self): assert manifest1 != manifest2 -# TODO could we use hypothesis to test this? -# Perhaps by testing the property that splitting along a dimension then concatenating the pieces along that dimension should recreate the original manifest? -class TestCombineManifests: - def test_concat(self): - manifest1 = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - } - ) - manifest2 = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - axis = 1 - expected = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - - result_manifest = np.concatenate( - [manifest1.entries, manifest2.entries], axis=axis - ) - result = ChunkManifest(entries=result_manifest) - assert result.dict() == expected.dict() - - def test_stack(self): - manifest1 = ChunkManifest.from_dict( - { - "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - } - ) - manifest2 = ChunkManifest.from_dict( - { - "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - axis = 1 - expected = ChunkManifest.from_dict( - { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, - } - ) - - result_manifest = np.stack([manifest1.entries, manifest2.entries], axis=axis) - result = ChunkManifest(entries=result_manifest) - assert result.dict() == expected.dict() - - @pytest.mark.skip(reason="Not implemented") class TestSerializeManifest: def test_serialize_manifest_to_zarr(self): ... 
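
A note on the surrounding patches: PATCH 04 and PATCH 14 remove the manifest-level concat_manifests/stack_manifests helpers and their tests because concatenation is now exercised at the ManifestArray level, where np.concatenate and np.stack dispatch to the implementations in virtualizarr/manifests/array_api.py that operate on the wrapped paths/offsets/lengths numpy arrays (PATCH 12). What follows is a minimal usage sketch of that path, not part of the patch series itself: the ZArray parameters are copied from the create_manifestarray fixture in PATCH 15, while the shapes, chunk sizes, paths and byte ranges are illustrative assumptions rather than values taken from the patches.

import numpy as np

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import ZArray

# Build a small 2D ManifestArray by hand (paths, offsets and lengths are illustrative only).
manifest = ChunkManifest.from_dict(
    {
        "0.0": {"path": "foo.nc", "offset": 100, "length": 100},
        "0.1": {"path": "foo.nc", "offset": 200, "length": 100},
    }
)
zarray = ZArray(
    shape=(5, 20),
    chunks=(5, 10),
    dtype=np.dtype("float32"),
    compressor="zlib",
    fill_value=0.0,
    filters=None,
    order="C",
    zarr_format=2,
)
marr = ManifestArray(zarray=zarray, chunkmanifest=manifest)

# np.concatenate dispatches to concatenate() in array_api.py, which concatenates the
# manifest's internal paths/offsets/lengths arrays instead of rewriting chunk-key strings.
doubled = np.concatenate([marr, marr], axis=1)
assert doubled.shape == (5, 40)
assert doubled.manifest.dict()["0.2"] == {"path": "foo.nc", "offset": 100, "length": 100}
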
From e1e8bf7f659f83cdcaaa31ef7bdf71ea9a535b50 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:18:51 -0400 Subject: [PATCH 15/32] generalized create_manifestarray fixture --- virtualizarr/tests/__init__.py | 44 +++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 291188a5..2e8730fb 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -1,7 +1,11 @@ +import itertools +from typing import Union + import numpy as np from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray +from virtualizarr.manifests.manifest import join +from virtualizarr.zarr import ZArray, ceildiv def create_manifestarray( @@ -9,6 +13,8 @@ def create_manifestarray( ) -> ManifestArray: """ Create an example ManifestArray with sensible defaults. + + The manifest is populated with a (somewhat) unique path, offset, and length for each key. """ zarray = ZArray( @@ -22,14 +28,36 @@ def create_manifestarray( zarr_format=2, ) - if shape != (): - raise NotImplementedError( - "Only generation of array representing a single scalar currently supported" - ) + chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in zip(shape, chunks) + ) + + # create every possible combination of keys + all_possible_combos = itertools.product( + *[range(length) for length in chunk_grid_shape] + ) - # TODO generalize this - chunkmanifest = ChunkManifest( - entries={"0": ChunkEntry(path="scalar.nc", offset=6144, length=48)} + chunkmanifest = ChunkManifest.from_dict( + {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} ) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) + + +def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, Union[str, int]]: + """Generate a (somewhat) unique manifest entry from a given chunk key""" + entry = { + "path": f"file.{join(ind)}.nc", + "offset": offset_from_chunk_key(ind), + "length": length_from_chunk_key(ind), + } + return entry + + +def offset_from_chunk_key(ind: tuple[int, ...]) -> int: + return sum(ind) * 10 + + +def length_from_chunk_key(ind: tuple[int, ...]) -> int: + return sum(ind) + 5 From 7e97e74e5260fb312d49d86d914e5553d3d4f742 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:19:06 -0400 Subject: [PATCH 16/32] added tests of broadcasting --- .../tests/test_manifests/test_array.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 7def676b..0e890066 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -124,13 +124,37 @@ def test_partly_equals(self): ... 
class TestBroadcast: + def test_broadcast_existing_axis(self): + marr = create_manifestarray(shape=(1, 2), chunks=(1, 2)) + expanded = np.broadcast_to(marr, shape=(3, 2)) + assert expanded.shape == (3, 2) + assert expanded.chunks == (1, 2) + assert expanded.manifest.dict() == { + "0.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + "1.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + "2.0": {"path": "file.0.0.nc", "offset": 0, "length": 5}, + } + + def test_broadcast_new_axis(self): + marr = create_manifestarray(shape=(3,), chunks=(1,)) + expanded = np.broadcast_to(marr, shape=(1, 3)) + assert expanded.shape == (1, 3) + assert expanded.chunks == (1, 1) + assert expanded.manifest.dict() == { + "0.0": {"path": "file.0.nc", "offset": 0, "length": 5}, + "0.1": {"path": "file.1.nc", "offset": 10, "length": 6}, + "0.2": {"path": "file.2.nc", "offset": 20, "length": 7}, + } + def test_broadcast_scalar(self): # regression test marr = create_manifestarray(shape=(), chunks=()) expanded = np.broadcast_to(marr, shape=(1,)) assert expanded.shape == (1,) assert expanded.chunks == (1,) - assert expanded.manifest == marr.manifest + assert expanded.manifest == { + "0": {"path": "file.0.nc", "offset": 0, "length": 5}, + } # TODO we really need some kind of fixtures to generate useful example data From 96b28419a260edd6028aa33b191e62524e34baf0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 13:19:43 -0400 Subject: [PATCH 17/32] made basic broadcasting tests pass by calling np.broadcast_to on underlying numpy arrays --- virtualizarr/manifests/array_api.py | 80 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 7131a314..1756f3f8 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,9 +1,10 @@ import itertools -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Tuple, Union, cast +from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Tuple, Union import numpy as np -from ..zarr import Codec, ZArray +from virtualizarr.zarr import Codec, ZArray, ceildiv + from .manifest import ChunkManifest if TYPE_CHECKING: @@ -281,39 +282,54 @@ def expand_dims(x: "ManifestArray", /, *, axis: int = 0) -> "ManifestArray": @implements(np.broadcast_to) def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArray": """ - Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + Broadcasts a ManifestArray to a specified shape, by either adjusting chunk keys or copying chunk manifest entries. 
""" - if len(x.shape) > len(shape): - raise ValueError("input operand has more dimensions than allowed") - - # numpy broadcasting algorithm requires us to start by comparing the length of the final axes and work backwards - result = x - for axis, d, d_requested in itertools.zip_longest( - reversed(range(len(shape))), reversed(x.shape), reversed(shape), fillvalue=None - ): - # len(shape) check above ensures this can't be type None - d_requested = cast(int, d_requested) - - if d == d_requested: - pass - elif d is None: - if result.shape == (): - # scalars are a special case because their manifests already have a chunk key with one dimension - # see https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - result = _broadcast_scalar(result, new_axis_length=d_requested) - else: - # stack same array upon itself d_requested number of times, which inserts a new axis at axis=0 - result = stack([result] * d_requested, axis=0) - elif d == 1: - # concatenate same array upon itself d_requested number of times along existing axis - result = concatenate([result] * d_requested, axis=axis) - else: - raise ValueError( - f"Array with shape {x.shape} cannot be broadcast to shape {shape}" - ) + from .array import ManifestArray + + new_chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in itertools.zip_longest( + shape, x.chunks, fillvalue=1 + ) + ) + + # do broadcasting of entries in manifest + broadcast_paths = np.broadcast_to(x.manifest._paths, shape=new_chunk_grid_shape) + broadcast_offsets = np.broadcast_to( + x.manifest._offsets, + shape=new_chunk_grid_shape, + ) + broadcast_lengths = np.broadcast_to( + x.manifest._lengths, + shape=new_chunk_grid_shape, + ) + broadcast_manifest = ChunkManifest.from_arrays( + paths=broadcast_paths, + offsets=broadcast_offsets, + lengths=broadcast_lengths, + ) + + new_shape = np.broadcast_shapes(x.shape, shape) + new_chunks = tuple( + ceildiv(axis_length, chunk_grid_length) + for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) + ) + + # refactor to use a new ZArray.copy(x, shape=..., chunks=..) constructor method + new_zarray = ZArray( + chunks=new_chunks, + compressor=x.zarray.compressor, + dtype=x.dtype, + fill_value=x.zarray.fill_value, + filters=x.zarray.filters, + shape=new_shape, + # TODO presumably these things should be checked for consistency across arrays too? 
+ order=x.zarray.order, + zarr_format=x.zarray.zarr_format, + ) - return result + return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray": From 00c1757bb1ee9f3c4f0da549734274e3cc8e1290 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:27:45 -0400 Subject: [PATCH 18/32] generalize fixture for creating scalar ManifestArrays --- virtualizarr/tests/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 2e8730fb..7f11a409 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -33,14 +33,16 @@ def create_manifestarray( for axis_length, chunk_length in zip(shape, chunks) ) - # create every possible combination of keys - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - - chunkmanifest = ChunkManifest.from_dict( - {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} - ) + if chunk_grid_shape == (): + d = {"0": entry_from_chunk_key((0,))} + else: + # create every possible combination of keys + all_possible_combos = itertools.product( + *[range(length) for length in chunk_grid_shape] + ) + d = {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} + + chunkmanifest = ChunkManifest.from_dict(d) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) From 06180b3c7a3c619318f43f5b5ba583a40622d20e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:28:43 -0400 Subject: [PATCH 19/32] improve regression test for expanding scalar ManifestArray --- virtualizarr/tests/test_manifests/test_array.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 0e890066..f3f59b4b 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -149,10 +149,16 @@ def test_broadcast_new_axis(self): def test_broadcast_scalar(self): # regression test marr = create_manifestarray(shape=(), chunks=()) + assert marr.shape == () + assert marr.chunks == () + assert marr.manifest.dict() == { + "0": {"path": "file.0.nc", "offset": 0, "length": 5}, + } + expanded = np.broadcast_to(marr, shape=(1,)) assert expanded.shape == (1,) assert expanded.chunks == (1,) - assert expanded.manifest == { + assert expanded.manifest.dict() == { "0": {"path": "file.0.nc", "offset": 0, "length": 5}, } From dae048b9863542f918bba5e990352e2198afd47d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 11 May 2024 14:29:10 -0400 Subject: [PATCH 20/32] remove now-unneeded scalar broadcasting logic --- virtualizarr/manifests/array_api.py | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 1756f3f8..58c376bb 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -332,41 +332,6 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) -def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray": - """ - Add an axis to a scalar ManifestArray, but without adding a new axis to the keys of the chunk manifest. 
- - This is not the same as concatenation, because there is no existing axis along which to concatenate. - It's also not the same as stacking, because we don't want to insert a new axis into the chunk keys. - - Scalars are a special case because their manifests still have a chunk key with one dimension. - See https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - """ - - from .array import ManifestArray - - new_shape = (new_axis_length,) - new_chunks = (new_axis_length,) - - concatenated_manifest = concat_manifests( - [x.manifest] * new_axis_length, - axis=0, - ) - - new_zarray = ZArray( - chunks=new_chunks, - compressor=x.zarray.compressor, - dtype=x.dtype, - fill_value=x.zarray.fill_value, - filters=x.zarray.filters, - shape=new_shape, - order=x.zarray.order, - zarr_format=x.zarray.zarr_format, - ) - - return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) - - # TODO broadcast_arrays, squeeze, permute_dims From 9a72df22aacf9834f5e21a2007788500173e8680 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 12 May 2024 15:26:10 -0400 Subject: [PATCH 21/32] rewrite __eq__ method on ManifestArray --- virtualizarr/manifests/array.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 423610cb..e5d8d576 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -53,6 +53,9 @@ def __init__( f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}" ) + # TODO check that the zarray shape and chunkmanifest shape are consistent with one another + # TODO also cover the special case of scalar arrays + self._zarray = _zarray self._manifest = _chunkmanifest @@ -140,7 +143,7 @@ def __eq__( # type: ignore[override] Returns a numpy array of booleans. """ - if isinstance(other, (int, float, bool)): + if isinstance(other, (int, float, bool, np.ndarray)): # TODO what should this do when comparing against numpy arrays? 
return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) elif not isinstance(other, ManifestArray): @@ -164,9 +167,22 @@ def __eq__( # type: ignore[override] UserWarning, ) - # TODO do chunk-wise comparison - # TODO expand it into an element-wise result - return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) + # do chunk-wise comparison + equal_chunk_paths = self.manifest._paths == other.manifest._paths + equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets + equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths + + equal_chunks = ( + equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths + ) + + if not equal_chunks.all(): + # TODO expand chunk-wise comparison into an element-wise result instead of just returning all False + return np.full( + shape=self.shape, fill_value=False, dtype=np.dtype(bool) + ) + else: + raise RuntimeWarning("Should not be possible to get here") def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": """Cannot change the dtype, but needed because xarray will call this even when it's a no-op.""" From e09c18fdc6d00266c68e91ce712d36b6e067cd71 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 13 May 2024 11:40:11 -0400 Subject: [PATCH 22/32] remove unused import --- virtualizarr/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 7f11a409..17da446b 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -3,7 +3,7 @@ import numpy as np -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join from virtualizarr.zarr import ZArray, ceildiv From bb23e453b6e0d989b95fffdbd0730595c664f14c Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 9 Jun 2024 09:27:52 -0600 Subject: [PATCH 23/32] reinstate the ChunkManifest.__init__ method to accept dictionaries --- virtualizarr/manifests/array.py | 2 +- virtualizarr/manifests/manifest.py | 71 +++++++++++-------- virtualizarr/tests/__init__.py | 2 +- virtualizarr/tests/test_kerchunk.py | 10 +-- .../tests/test_manifests/test_array.py | 20 +++--- .../tests/test_manifests/test_manifest.py | 22 +++--- virtualizarr/tests/test_xarray.py | 18 ++--- virtualizarr/tests/test_zarr.py | 6 +- 8 files changed, 84 insertions(+), 67 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index e5d8d576..5d132bc2 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -50,7 +50,7 @@ def __init__( _chunkmanifest = ChunkManifest(entries=chunkmanifest) else: raise TypeError( - f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}" + f"chunkmanifest arg must be of type ChunkManifest or dict, but got type {type(chunkmanifest)}" ) # TODO check that the zarray shape and chunkmanifest shape are consistent with one another diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index cdb44098..242c1db0 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -52,9 +52,9 @@ class ChunkManifest: """ In-memory representation of a single Zarr chunk manifest. - Stores the manifest internally as numpy arrays. 
+ Stores the manifest internally as numpy arrays, so the most efficient way to create this object is via the `.from_arrays` constructor classmethod. - The manifest can be converted to or from a dictionary from looking like this + The manifest can be converted to or from a dictionary which looks like this { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -63,33 +63,49 @@ class ChunkManifest: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - using the .from_dict() and .dict() methods, so users of this class can think of the manifest as if it were a dict. + using the .__init__() and .dict() methods, so users of this class can think of the manifest as if it were a dict mapping zarr chunk keys to byte ranges. - See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287 . + (See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287.) Validation is done when this object is instatiated, and this class is immutable, - so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. + so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] - @classmethod - def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": + def __init__(self, entries: dict) -> None: + """ + Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. + + Parameters + ---------- + entries: dict + Chunk keys and byte range information, as a dictionary of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + """ + # TODO do some input validation here first? - validate_chunk_keys(chunks.keys()) + validate_chunk_keys(entries.keys()) - # TODO should we actually pass shape in, in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(chunks.keys()) + # TODO should we actually optionally pass chunk grid shape in, + # in case there are not enough chunks to give correct idea of full shape? 
+ shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks paths = np.empty(shape=shape, dtype=np.dtypes.StringDType) offsets = np.empty(shape=shape, dtype=np.dtype("int32")) lengths = np.empty(shape=shape, dtype=np.dtype("int32")) - # populate the array - for key, entry in chunks.items(): + # populate the arrays + for key, entry in entries.items(): try: path, offset, length = entry.values() entry = ChunkEntry(path=path, offset=offset, length=length) @@ -105,12 +121,9 @@ def from_dict(cls, chunks: ChunkDict) -> "ChunkManifest": offsets[split_key] = entry.offset lengths[split_key] = entry.length - obj = object.__new__(cls) - obj._paths = paths - obj._offsets = offsets - obj._lengths = lengths - - return obj + self._paths = paths + self._offsets = offsets + self._lengths = lengths @classmethod def from_arrays( @@ -205,14 +218,18 @@ def __getitem__(self, key: ChunkKey) -> ChunkEntry: return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: - return iter(self._paths.keys()) + # TODO make this work for numpy arrays + raise NotImplementedError() + # return iter(self._paths.keys()) def __len__(self) -> int: return self._paths.size def dict(self) -> ChunkDict: """ - Converts the entire manifest to a nested dictionary, of the form + Convert the entire manifest to a nested dictionary. + + The returned dict will be of the form { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -221,7 +238,7 @@ def dict(self) -> ChunkDict: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary._ + Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary. 
""" coord_vectors = np.mgrid[ @@ -255,17 +272,15 @@ def __eq__(self, other: Any) -> bool: def from_zarr_json(cls, filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" with open(filepath, "r") as manifest_file: - entries_dict = json.load(manifest_file) + entries = json.load(manifest_file) - entries = { - cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items() - } return cls(entries=entries) def to_zarr_json(self, filepath: str) -> None: - """Write a ChunkManifest to a Zarr manifest.json file.""" + """Write the manifest to a Zarr manifest.json file.""" + entries = self.dict() with open(filepath, "w") as json_file: - json.dump(self.dict(), json_file, indent=4, separators=(", ", ": ")) + json.dump(entries, json_file, indent=4, separators=(", ", ": ")) @classmethod def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": @@ -273,7 +288,7 @@ def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest.from_dict(cast(ChunkDict, chunkentries)) + return ChunkManifest(entries=cast(ChunkDict, chunkentries)) def split(key: ChunkKey) -> Tuple[int, ...]: diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 17da446b..13ee63ab 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -42,7 +42,7 @@ def create_manifestarray( ) d = {join(ind): entry_from_chunk_key(ind) for ind in all_possible_combos} - chunkmanifest = ChunkManifest.from_dict(d) + chunkmanifest = ChunkManifest(entries=d) return ManifestArray(chunkmanifest=chunkmanifest, zarray=zarray) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index c5a6ec37..f3ff312f 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -67,8 +67,8 @@ def test_dataset_from_df_refs_with_filters(): class TestAccessor: def test_accessor_to_kerchunk_dict(self): - manifest = ChunkManifest.from_dict( - {"0.0": dict(path="test.nc", offset=6144, length=48)} + manifest = ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} ) arr = ManifestArray( chunkmanifest=manifest, @@ -98,8 +98,8 @@ def test_accessor_to_kerchunk_dict(self): assert result_ds_refs == expected_ds_refs def test_accessor_to_kerchunk_json(self, tmp_path): - manifest = ChunkManifest.from_dict( - {"0.0": dict(path="test.nc", offset=6144, length=48)} + manifest = ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} ) arr = ManifestArray( chunkmanifest=manifest, @@ -140,7 +140,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) marr = ManifestArray( zarray=dict( shape=(2, 4), diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index f3f59b4b..d847bb9b 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -14,7 +14,7 @@ def test_create_manifestarray(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = 
ChunkManifest(entries=chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -69,7 +69,7 @@ def test_equals(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) chunks = (5, 1, 10) shape = (5, 2, 20) zarray = ZArray( @@ -108,14 +108,14 @@ def test_not_equal_chunk_entries(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) assert not (marr1 == marr2).all() @@ -183,14 +183,14 @@ def test_concat(self): "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.concatenate([marr1, marr2], axis=1) @@ -228,14 +228,14 @@ def test_stack(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) chunks_dict2 = { "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) result = np.stack([marr1, marr2], axis=1) @@ -271,11 +271,11 @@ def test_refuse_combine(): chunks_dict1 = { "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, } - chunkmanifest1 = ChunkManifest.from_dict(chunks_dict1) + chunkmanifest1 = ChunkManifest(entries=chunks_dict1) chunks_dict2 = { "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, } - chunkmanifest2 = ChunkManifest.from_dict(chunks_dict2) + chunkmanifest2 = ChunkManifest(entries=chunks_dict2) marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunkmanifest1) zarray_wrong_compressor = zarray_common.copy() diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 368e2aa6..acf8ac42 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -8,7 +8,7 @@ def test_create_manifest(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks chunks = { @@ -17,7 +17,7 @@ def 
test_create_manifest(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks def test_invalid_chunk_entries(self): @@ -25,7 +25,7 @@ def test_invalid_chunk_entries(self): "0.0.0": {"path": "s3://bucket/foo.nc"}, } with pytest.raises(ValueError, match="must be of the form"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) chunks = { "0.0.0": { @@ -35,21 +35,21 @@ def test_invalid_chunk_entries(self): }, } with pytest.raises(ValueError, match="must be of the form"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) def test_invalid_chunk_keys(self): chunks = { "0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, } with pytest.raises(ValueError, match="Invalid format for chunk key: '0.0.'"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) chunks = { "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, "0": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, } with pytest.raises(ValueError, match="Inconsistent number of dimensions"): - ChunkManifest.from_dict(chunks) + ChunkManifest(entries=chunks) class TestProperties: @@ -60,21 +60,21 @@ def test_chunk_grid_info(self): "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks) + manifest = ChunkManifest(entries=chunks) assert manifest.ndim_chunk_grid == 3 assert manifest.shape_chunk_grid == (1, 2, 2) class TestEquals: def test_equals(self): - manifest1 = ChunkManifest.from_dict( - { + manifest1 = ChunkManifest( + entries={ "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } ) - manifest2 = ChunkManifest.from_dict( - { + manifest2 = ChunkManifest( + entries={ "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index a2eb9d72..a13977ff 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -29,7 +29,7 @@ def test_wrapping(): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest = ChunkManifest.from_dict(chunks_dict) + manifest = ChunkManifest(entries=chunks_dict) marr = ManifestArray(zarray=zarray, chunkmanifest=manifest) ds = xr.Dataset({"a": (["x", "y"], marr)}) @@ -59,7 +59,7 @@ def test_equals(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -71,7 +71,7 @@ def test_equals(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest3 = ChunkManifest.from_dict(chunks_dict3) + manifest3 = ChunkManifest(entries=chunks_dict3) marr3 = ManifestArray(zarray=zarray, chunkmanifest=manifest3) ds3 = xr.Dataset({"a": (["x", "y"], marr3)}) assert not ds1.equals(ds3) @@ -96,7 +96,7 @@ def test_concat_along_existing_dim(self): "0.0": {"path": 
"foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -104,7 +104,7 @@ def test_concat_along_existing_dim(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) ds2 = xr.Dataset({"a": (["x", "y"], marr2)}) @@ -143,7 +143,7 @@ def test_concat_along_new_dim(self): "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) ds1 = xr.Dataset({"a": (["x", "y"], marr1)}) @@ -151,7 +151,7 @@ def test_concat_along_new_dim(self): "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) ds2 = xr.Dataset({"a": (["x", "y"], marr2)}) @@ -193,7 +193,7 @@ def test_concat_dim_coords_along_existing_dim(self): "0": {"path": "foo.nc", "offset": 100, "length": 100}, "1": {"path": "foo.nc", "offset": 200, "length": 100}, } - manifest1 = ChunkManifest.from_dict(chunks_dict1) + manifest1 = ChunkManifest(entries=chunks_dict1) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) coords = xr.Coordinates({"t": (["t"], marr1)}, indexes={}) ds1 = xr.Dataset(coords=coords) @@ -202,7 +202,7 @@ def test_concat_dim_coords_along_existing_dim(self): "0": {"path": "foo.nc", "offset": 300, "length": 100}, "1": {"path": "foo.nc", "offset": 400, "length": 100}, } - manifest2 = ChunkManifest.from_dict(chunks_dict2) + manifest2 = ChunkManifest(entries=chunks_dict2) marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) coords = xr.Coordinates({"t": (["t"], marr2)}, indexes={}) ds2 = xr.Dataset(coords=coords) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 71bec981..866fcc92 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -3,12 +3,14 @@ import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset -from virtualizarr.manifests.manifest import ChunkEntry +from virtualizarr.manifests.manifest import ChunkManifest def test_zarr_v3_roundtrip(tmpdir): arr = ManifestArray( - chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + chunkmanifest=ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + ), zarray=dict( shape=(2, 3), dtype=np.dtype(" Date: Sun, 9 Jun 2024 09:57:58 -0600 Subject: [PATCH 24/32] hopefully fix broadcasting bug --- virtualizarr/manifests/array_api.py | 49 ++++++++++------------------- virtualizarr/zarr.py | 25 +++++++++++++++ 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 58c376bb..10561f3c 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -3,7 +3,7 @@ import numpy as np -from virtualizarr.zarr 
import Codec, ZArray, ceildiv +from virtualizarr.zarr import Codec, ceildiv from .manifest import ChunkManifest @@ -144,16 +144,8 @@ def concatenate( lengths=concatenated_lengths, ) - new_zarray = ZArray( - chunks=first_arr.chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + new_zarray = first_arr.zarray.replace( + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) @@ -248,16 +240,9 @@ def stack( new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - new_zarray = ZArray( - chunks=new_chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + new_zarray = first_arr.zarray.replace( + chunks=tuple(new_chunks), + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray) @@ -287,10 +272,16 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra from .array import ManifestArray + # iterate from last axis to first here so that the fillvalue of 1 is inserted at the start of the shape + # i.e. to follow numpy's broadcasting rules new_chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in itertools.zip_longest( - shape, x.chunks, fillvalue=1 + reversed( + [ + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in itertools.zip_longest( + shape[::-1], x.chunks[::-1], fillvalue=1 + ) + ] ) ) @@ -316,17 +307,9 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) ) - # refactor to use a new ZArray.copy(x, shape=..., chunks=..) constructor method - new_zarray = ZArray( + new_zarray = x.zarray.replace( chunks=new_chunks, - compressor=x.zarray.compressor, - dtype=x.dtype, - fill_value=x.zarray.fill_value, - filters=x.zarray.filters, shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=x.zarray.order, - zarr_format=x.zarray.zarr_format, ) return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e20561e6..bc703df9 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -99,6 +99,31 @@ def dict(self) -> dict[str, Any]: def to_kerchunk_json(self) -> str: return ujson.dumps(self.dict()) + def replace( + self, + chunks: Optional[tuple[int, ...]] = None, + compressor: Optional[str] = None, + dtype: Optional[np.dtype] = None, + fill_value: Optional[float] = None, # float or int? + filters: Optional[List[Dict]] = None, + order: Optional[Union[Literal["C"], Literal["F"]]] = None, + shape: Optional[Tuple[int, ...]] = None, + zarr_format: Optional[Union[Literal[2], Literal[3]]] = None, + ) -> "ZArray": + """ + Convenience method to create a new ZArray from an existing one by altering only certain attributes. 
+ """ + return ZArray( + chunks=chunks if chunks is not None else self.chunks, + compressor=compressor if compressor is not None else self.compressor, + dtype=dtype if dtype is not None else self.dtype, + fill_value=fill_value if fill_value is not None else self.fill_value, + filters=filters if filters is not None else self.filters, + shape=shape if shape is not None else self.shape, + order=order if order is not None else self.order, + zarr_format=zarr_format if zarr_format is not None else self.zarr_format, + ) + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' Date: Thu, 13 Jun 2024 10:59:38 -0400 Subject: [PATCH 25/32] add backwards compatibility for pre-numpy2.0 --- virtualizarr/kerchunk.py | 4 ++-- virtualizarr/manifests/manifest.py | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index f0aecda4..ea85a9d0 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -232,13 +232,13 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). """ - from virtualizarr.manifests import ChunkEntry, ManifestArray + from virtualizarr.manifests import ManifestArray if isinstance(var.data, ManifestArray): marr = var.data arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry['path'], entry['offset'], entry['length']] + str(chunk_key): [entry["path"], entry["offset"], entry["length"]] for chunk_key, entry in marr.manifest.dict().items() } diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 2eee6b0e..86cb9f64 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,6 +1,7 @@ import json import re from collections.abc import Iterable, Iterator +from importlib.metadata import version from typing import Any, NewType, Tuple, Union, cast import numpy as np @@ -17,6 +18,17 @@ ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) +# This is a hard-coded length as its the only option for backwards-compatibility +# The actual choice of 32-bit length is arbitrary +# TODO remove once all dependencies support numpy 2.0 +ValidStringDTypes = [np.dtype("= "2.0.0": + from numpy.dtypes import StringDType # type: ignore[attr-defined] + + ValidStringDTypes.append(StringDType()) +# TODO get type hinting to work here +T_ValidStringDTypes = Any + class ChunkEntry(BaseModel): """ @@ -70,7 +82,7 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - _paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]] + _paths: np.ndarray[Any, T_ValidStringDTypes] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -98,8 +110,14 @@ def __init__(self, entries: dict) -> None: # in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(entries.keys()) + if version("numpy") < "2.0.0": + # TODO remove this backwards-compatibility branch once all dependencies support numpy 2.0 + string_dtype = np.dtype(" None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + paths: np.ndarray[Any, np.dtype[T_ValidStringDTypes]], offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -157,9 +175,9 @@ def from_arrays( ) # check dtypes - if paths.dtype != np.dtypes.StringDType(): + if paths.dtype not in ValidStringDTypes: raise ValueError( - f"paths array must have numpy dtype StringDType, but got dtype {paths.dtype}" + f"paths array must have a numpy string dtype (i.e. one of {ValidStringDTypes}), but got dtype {paths.dtype}" ) if offsets.dtype != np.dtype("int32"): raise ValueError( From 8d2dcd5e452d192b1e931f037c624df8df9fabb2 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 17 Jun 2024 18:36:30 -0400 Subject: [PATCH 26/32] depend on numpy>=2.0.0 --- pyproject.toml | 4 ++-- virtualizarr/manifests/manifest.py | 30 ++++++------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 79b9d221..3dffef2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,11 +21,11 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.5.0", + "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", "pydantic", - "numpy>=2.0.0rc1", + "numpy>=2.0.0", "ujson", "packaging", "universal-pathlib", diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 86cb9f64..9cb25177 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,7 +1,6 @@ import json import re from collections.abc import Iterable, Iterator -from importlib.metadata import version from typing import Any, NewType, Tuple, Union, cast import numpy as np @@ -18,17 +17,6 @@ ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) -# This is a hard-coded length as its the only option for backwards-compatibility -# The actual choice of 32-bit length is arbitrary -# TODO remove once all dependencies support numpy 2.0 -ValidStringDTypes = [np.dtype("= "2.0.0": - from numpy.dtypes import StringDType # type: ignore[attr-defined] - - ValidStringDTypes.append(StringDType()) -# TODO get type hinting to work here -T_ValidStringDTypes = Any - class ChunkEntry(BaseModel): """ @@ -78,11 +66,11 @@ class ChunkManifest: (See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287.) - Validation is done when this object is instatiated, and this class is immutable, + Validation is done when this object is instantiated, and this class is immutable, so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - _paths: np.ndarray[Any, T_ValidStringDTypes] + _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -110,14 +98,8 @@ def __init__(self, entries: dict) -> None: # in case there are not enough chunks to give correct idea of full shape? 
shape = get_chunk_grid_shape(entries.keys()) - if version("numpy") < "2.0.0": - # TODO remove this backwards-compatibility branch once all dependencies support numpy 2.0 - string_dtype = np.dtype(" None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[T_ValidStringDTypes]], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -175,9 +157,9 @@ def from_arrays( ) # check dtypes - if paths.dtype not in ValidStringDTypes: + if paths.dtype != np.dtypes.StringDType(): raise ValueError( - f"paths array must have a numpy string dtype (i.e. one of {ValidStringDTypes}), but got dtype {paths.dtype}" + f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" ) if offsets.dtype != np.dtype("int32"): raise ValueError( From 1e62807ff0036ebbde988c496c0f444183c28f90 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 00:38:59 -0400 Subject: [PATCH 27/32] rewrote broadcast_to shape logic --- virtualizarr/manifests/array_api.py | 67 +++++++++++++++++------------ 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 41e8a3e0..0ecdc023 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,4 +1,3 @@ -import itertools from typing import TYPE_CHECKING, Callable, Iterable import numpy as np @@ -144,6 +143,7 @@ def concatenate( lengths=concatenated_lengths, ) + # chunk shape has not changed, there are just now more chunks along the concatenation axis new_zarray = first_arr.zarray.replace( shape=tuple(new_shape), ) @@ -235,7 +235,7 @@ def stack( lengths=stacked_lengths, ) - # chunk size has changed because a length-1 axis has been inserted + # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) new_chunks.insert(axis, 1) @@ -272,47 +272,60 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra from .array import ManifestArray - # iterate from last axis to first here so that the fillvalue of 1 is inserted at the start of the shape - # i.e. 
to follow numpy's broadcasting rules - new_chunk_grid_shape = tuple( - reversed( - [ - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in itertools.zip_longest( - shape[::-1], x.chunks[::-1], fillvalue=1 - ) - ] + new_shape = shape + + # check its actually possible to broadcast to this new shape + mutually_broadcastable_shape = np.broadcast_shapes(x.shape, new_shape) + if mutually_broadcastable_shape != new_shape: + # we're not trying to broadcast both shapes to a third shape + raise ValueError( + f"array of shape {x.shape} cannot be broadcast to shape {new_shape}" ) + + # new chunk_shape is old chunk_shape with singleton dimensions pre-pended + # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) + old_chunk_shape = x.chunks + new_chunk_shape = _prepend_singleton_dimensions( + old_chunk_shape, ndim=len(new_shape) + ) + + # find new chunk grid shape by dividing new array shape by new chunk shape + new_chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in zip(new_shape, new_chunk_shape) ) # do broadcasting of entries in manifest - broadcast_paths = np.broadcast_to(x.manifest._paths, shape=new_chunk_grid_shape) - broadcast_offsets = np.broadcast_to( + broadcasted_paths = np.broadcast_to( + x.manifest._paths, + shape=new_chunk_grid_shape, + ) + broadcasted_offsets = np.broadcast_to( x.manifest._offsets, shape=new_chunk_grid_shape, ) - broadcast_lengths = np.broadcast_to( + broadcasted_lengths = np.broadcast_to( x.manifest._lengths, shape=new_chunk_grid_shape, ) - broadcast_manifest = ChunkManifest.from_arrays( - paths=broadcast_paths, - offsets=broadcast_offsets, - lengths=broadcast_lengths, - ) - - new_shape = np.broadcast_shapes(x.shape, shape) - new_chunks = tuple( - ceildiv(axis_length, chunk_grid_length) - for axis_length, chunk_grid_length in zip(new_shape, new_chunk_grid_shape) + broadcasted_manifest = ChunkManifest.from_arrays( + paths=broadcasted_paths, + offsets=broadcasted_offsets, + lengths=broadcasted_lengths, ) new_zarray = x.zarray.replace( - chunks=new_chunks, + chunks=new_chunk_shape, shape=new_shape, ) - return ManifestArray(chunkmanifest=broadcast_manifest, zarray=new_zarray) + return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray) + + +def _prepend_singleton_dimensions(shape: tuple[int, ...], ndim: int) -> tuple[int, ...]: + """Prepend as many new length-1 axes to shape as necessary such that the result has ndim number of axes.""" + n_prepended_dims = ndim - len(shape) + return tuple([1] * n_prepended_dims + list(shape)) # TODO broadcast_arrays, squeeze, permute_dims From dc824bb8bce87244ac400b89f7cdfe91cb3bd0dc Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 00:54:04 -0400 Subject: [PATCH 28/32] remove faulty hypothesis strategies --- .../tests/test_manifests/test_array.py | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index eb3a3df5..459e60be 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -1,8 +1,5 @@ -import hypothesis.extra.numpy as npst -import hypothesis.strategies as st import numpy as np import pytest -from hypothesis import given from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import create_manifestarray @@ -165,31 +162,43 @@ def 
test_broadcast_scalar(self): "0": {"path": "file.0.nc", "offset": 0, "length": 5}, } - @given(st.data()) - def test_broadcast_any_shape(self, data): - # Generate arbitary ManifestArray with arbitrary chunk shape - original_shape = data.draw(npst.array_shapes()) - arr_ndim = len(original_shape) - chunk_shape = data.draw( - npst.array_shapes( - min_dims=arr_ndim, max_dims=arr_ndim, max_side=max(original_shape) - ) - ) - marr = create_manifestarray(shape=original_shape, chunks=chunk_shape) - - # generate arbitrary broadcastable shape to broadcast to - target_broadcast_shape = data.draw(npst.broadcastable_shapes(marr.shape)) + @pytest.mark.parametrize( + "shape, chunks, target_shape", + [ + ((1,), (1,), ()), + ((2,), (1,), (1,)), + ((3,), (2,), (5, 4, 4)), + ((3, 2), (2, 2), (2, 3, 4)), + ], + ) + def test_raise_on_invalid_broadcast_shapes(self, shape, chunks, target_shape): + marr = create_manifestarray(shape=shape, chunks=chunks) + with pytest.raises(ValueError): + np.broadcast_to(marr, shape=target_shape) + + # TODO replace this parametrization with hypothesis strategies + @pytest.mark.parametrize( + "shape, chunks, target_shape", + [ + ((1,), (1,), (3,)), + ((2,), (1,), (2,)), + ((3,), (2,), (5, 4, 3)), + ((3, 1), (2, 1), (2, 3, 4)), + ], + ) + def test_broadcast_any_shape(self, shape, chunks, target_shape): + marr = create_manifestarray(shape=shape, chunks=chunks) # do the broadcasting - broadcasted_marr = np.broadcast_to(marr, shape=target_broadcast_shape) - broadcasted_chunk_shape = broadcasted_marr.chunks + broadcasted_marr = np.broadcast_to(marr, shape=target_shape) # check that the resultant shape is correct - assert broadcasted_marr.shape == target_broadcast_shape + assert broadcasted_marr.shape == target_shape # check that chunk shape has plausible ndims and lengths - assert broadcasted_chunk_shape.ndim == broadcasted_marr.ndim - for len_arr, len_chunk in zip(broadcasted_chunk_shape, broadcasted_marr): + broadcasted_chunk_shape = broadcasted_marr.chunks + assert len(broadcasted_chunk_shape) == broadcasted_marr.ndim + for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape): assert len_chunk <= len_arr From a97e5d864c60d23bb6e8010431e9e746c9ae38b6 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 01:03:58 -0400 Subject: [PATCH 29/32] remove hypothesis from dependencies --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3dffef2a..b91af30c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ test = [ "fsspec", "s3fs", "fastparquet", - "hypothesis", ] From daaa33105167b9545adfa5a0baf985801227557c Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 11:04:39 -0400 Subject: [PATCH 30/32] ignore remaining mypy errors --- virtualizarr/manifests/manifest.py | 8 ++++---- virtualizarr/zarr.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 9cb25177..530ce17c 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -70,7 +70,7 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. 
""" - _paths: np.ndarray[Any, np.dtypes.StringDType] + _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] _offsets: np.ndarray[Any, np.dtype[np.int32]] _lengths: np.ndarray[Any, np.dtype[np.int32]] @@ -99,7 +99,7 @@ def __init__(self, entries: dict) -> None: shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks - paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) + paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] offsets = np.empty(shape=shape, dtype=np.dtype("int32")) lengths = np.empty(shape=shape, dtype=np.dtype("int32")) @@ -127,7 +127,7 @@ def __init__(self, entries: dict) -> None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] offsets: np.ndarray[Any, np.dtype[np.int32]], lengths: np.ndarray[Any, np.dtype[np.int32]], ) -> "ChunkManifest": @@ -157,7 +157,7 @@ def from_arrays( ) # check dtypes - if paths.dtype != np.dtypes.StringDType(): + if paths.dtype != np.dtypes.StringDType(): # type: ignore[attr-defined] raise ValueError( f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index dbba794a..22b46ffb 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -109,7 +109,7 @@ def replace( compressor: Optional[str] = None, dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, + filters: Optional[list[dict]] = None, # type: ignore[valid-type] order: Optional[Literal["C"] | Literal["F"]] = None, shape: Optional[tuple[int, ...]] = None, zarr_format: Optional[Literal[2] | Literal[3]] = None, From 5f6c91204334e698148f8382200d61b2cc115511 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 11:07:38 -0400 Subject: [PATCH 31/32] release notes --- docs/releases.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index 2255ac10..7228ca66 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -13,6 +13,8 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Requires numpy 2.0 (for :pull:`107`). + By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -29,6 +31,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactor `ChunkManifest` class to store chunk references internally using numpy arrays. + (:pull:`107`) By `Tom Nicholas `_. - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest. (:pull:`144`) By `Tom Nicholas `_. From 7893c8803147793d88b77fa135f49537c47be8ed Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 18 Jun 2024 13:13:14 -0400 Subject: [PATCH 32/32] update dependencies in CI test env --- ci/environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index d8af6bf5..0385ea5a 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,10 +7,10 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.5.0 + - xarray>=2024.6.0 - kerchunk>=0.2.5 - pydantic - - numpy + - numpy>=2.0.0 - ujson - packaging - universal_pathlib