From e8be25e8f69ddfa6990aad2dbdd7bc255ec48c22 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 1 Aug 2023 13:29:48 -0400 Subject: [PATCH 01/12] Very early POC --- zarr/core.py | 33 +++++++++------- zarr/indexing.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 115 insertions(+), 17 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 43ccdbaf7d..afe0e3a9be 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2067,13 +2067,16 @@ def _process_chunk( fields, out_selection, partial_read_decode=False, + coords=None ): """Take binary data from storage and fill output array""" + shape = [c if isinstance(c, int) else c[coords[i]] + for i, c in enumerate(self._chunks)] if ( out_is_ndarray and not fields and is_contiguous_selection(out_selection) - and is_total_slice(chunk_selection, self._chunks) + and is_total_slice(chunk_selection, shape) and not self._filters and self._dtype != object ): @@ -2100,7 +2103,7 @@ def _process_chunk( if isinstance(cdata, UncompressedPartialReadBufferV3): cdata = cdata.read_full() chunk = ensure_ndarray_like(cdata).view(self._dtype) - chunk = chunk.reshape(self._chunks, order=self._order) + chunk = chunk.reshape(shape, order=self._order) np.copyto(dest, chunk) return @@ -2109,7 +2112,7 @@ def _process_chunk( if partial_read_decode: cdata.prepare_chunk() # size of chunk - tmp = np.empty_like(self._meta_array, shape=self._chunks, dtype=self.dtype) + tmp = np.empty_like(self._meta_array, shape=shape, dtype=self.dtype) index_selection = PartialChunkIterator(chunk_selection, self.chunks) for start, nitems, partial_out_selection in index_selection: expected_shape = [ @@ -2138,7 +2141,7 @@ def _process_chunk( return except ArrayIndexError: cdata = cdata.read_full() - chunk = self._decode_chunk(cdata) + chunk = self._decode_chunk(cdata, expected_shape=shape) # select data from chunk if fields: @@ -2223,7 +2226,7 @@ def _chunk_getitems( contexts = ConstantMap(ckeys, constant=Context(meta_array=self._meta_array)) cdatas = self.chunk_store.getitems(ckeys, contexts=contexts) - for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection): + for ckey, coords, chunk_select, out_select in zip(ckeys, lchunk_coords, lchunk_selection, lout_selection): if ckey in cdatas: self._process_chunk( out, @@ -2234,6 +2237,7 @@ def _chunk_getitems( fields, out_select, partial_read_decode=partial_read_decode, + coords=coords ) else: # check exception type @@ -2305,7 +2309,8 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): ckey = self._chunk_key(chunk_coords) - cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) + cdata = self._process_for_setitem(ckey, chunk_selection, value, coords=chunk_coords, + fields=fields) # attempt to delete chunk if it only contains the fill value if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): @@ -2313,8 +2318,10 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non else: self.chunk_store[ckey] = self._encode_chunk(cdata) - def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): - if is_total_slice(chunk_selection, self._chunks) and not fields: + def _process_for_setitem(self, ckey, chunk_selection, value, coords=None, fields=None): + shape = [c if isinstance(c, int) else c[coords[i]] + for i, c in enumerate(self._chunks)] + if is_total_slice(chunk_selection, shape) and not fields: # totally replace chunk 
# optimization: we are completely replacing the chunk, so no need @@ -2324,7 +2331,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): # setup array filled with value chunk = np.empty_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + self._meta_array, shape=shape, dtype=self._dtype, order=self._order ) chunk.fill(value) @@ -2346,22 +2353,22 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): # chunk not initialized if self._fill_value is not None: chunk = np.empty_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + self._meta_array, shape=shape, dtype=self._dtype, order=self._order ) chunk.fill(self._fill_value) elif self._dtype == object: - chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) + chunk = np.empty(self.shape, dtype=self._dtype, order=self._order) else: # N.B., use zeros here so any region beyond the array has consistent # and compressible data chunk = np.zeros_like( - self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + self._meta_array, shape=self.shape, dtype=self._dtype, order=self._order ) else: # decode chunk - chunk = self._decode_chunk(cdata) + chunk = self._decode_chunk(cdata, expected_shape=shape) if not chunk.flags.writeable: chunk = chunk.copy(order="K") diff --git a/zarr/indexing.py b/zarr/indexing.py index 487cc8b9d9..c8e0842bd5 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -163,6 +163,85 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) +class VarSliceDimIndexer: + def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): + # normalize + start = dim_sel.start + if start is None: + start = 0 + elif start < 0: + start = dim_len - start + + stop = dim_sel.stop + if stop is None: + stop = dim_len + elif stop < 0: + stop = dim_len - stop + step = dim_sel.step or 1 + if step < 0: + raise NotImplementedError + + # store attributes + self.offsets = np.cumsum([0] + chunk_lengths) + self.dim_len = dim_len + self.dim_sel = dim_sel + self.projections = [] + + remainder = 0 + nfilled = 0 + for i in range(len(chunk_lengths)): + if start > self.offsets[i + 1]: + # not yet at first chunk + continue + if stop < self.offsets[i]: + # past last valid chunk + break + slice_start = max(start - self.offsets[i], 0 + remainder) + slice_end = min( + stop - self.offsets[i], self.offsets[i + 1] - self.offsets[i] + ) + if slice_start == slice_end: + continue + remainder = ( + (self.offsets[i] + slice_start) - self.offsets[i + 1] + ) % step + nelem = (slice_end - slice_start) // step + self.projections.append( + ChunkDimProjection( + i, + slice(slice_start, slice_end, step), + slice(nfilled, nfilled + nelem) + ) + ) + nfilled += nelem + + self.nitems = nfilled + + def __iter__(self): + yield from self.projections + + +class VarIntDimIndexer: + def __init__(self, dim_sel: int, dim_len: int, chunk_lengths: list[int]): + + self.offsets = np.cumsum([0] + chunk_lengths) + self.dim_len = dim_len + + # normalize + dim_sel = normalize_integer_selection(dim_sel, self.dim_len) + + # store attributes + self.dim_sel = dim_sel + self.nitems = 1 + + def __iter__(self): + for ix, off in enumerate(self.offsets): + if off > self.dim_sel: + break + ix -= 1 + yield ChunkDimProjection(ix, self.dim_sel - self.offsets[ix], None) + + def ceildiv(a, b): return math.ceil(a / b) @@ -339,10 +418,16 @@ def __init__(self, selection, array): for dim_sel, dim_len, dim_chunk_len in zip(selection, 
array._shape, array._chunks): if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarIntDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_slice(dim_sel): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarSliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: raise IndexError( @@ -614,10 +699,16 @@ def __init__(self, selection, array): for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarIntDimIndexer(dim_sel, dim_len, dim_chunk_len) elif isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarSliceDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_integer_array(dim_sel): dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) From c50ee29b0b70e6d21559a20d915da5c767c91964 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:34:26 +0000 Subject: [PATCH 02/12] style: pre-commit fixes --- zarr/core.py | 19 ++++++++++--------- zarr/indexing.py | 12 +++--------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index afe0e3a9be..5d39ef2d18 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2067,11 +2067,10 @@ def _process_chunk( fields, out_selection, partial_read_decode=False, - coords=None + coords=None, ): """Take binary data from storage and fill output array""" - shape = [c if isinstance(c, int) else c[coords[i]] - for i, c in enumerate(self._chunks)] + shape = [c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)] if ( out_is_ndarray and not fields @@ -2226,7 +2225,9 @@ def _chunk_getitems( contexts = ConstantMap(ckeys, constant=Context(meta_array=self._meta_array)) cdatas = self.chunk_store.getitems(ckeys, contexts=contexts) - for ckey, coords, chunk_select, out_select in zip(ckeys, lchunk_coords, lchunk_selection, lout_selection): + for ckey, coords, chunk_select, out_select in zip( + ckeys, lchunk_coords, lchunk_selection, lout_selection + ): if ckey in cdatas: self._process_chunk( out, @@ -2237,7 +2238,7 @@ def _chunk_getitems( fields, out_select, partial_read_decode=partial_read_decode, - coords=coords + coords=coords, ) else: # check exception type @@ -2309,8 +2310,9 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): ckey = self._chunk_key(chunk_coords) - cdata = self._process_for_setitem(ckey, chunk_selection, value, coords=chunk_coords, - fields=fields) + cdata = self._process_for_setitem( + ckey, chunk_selection, value, coords=chunk_coords, fields=fields + ) # attempt to delete chunk if it only contains the fill value if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): @@ -2319,8 +2321,7 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non self.chunk_store[ckey] = 
self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, coords=None, fields=None): - shape = [c if isinstance(c, int) else c[coords[i]] - for i, c in enumerate(self._chunks)] + shape = [c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)] if is_total_slice(chunk_selection, shape) and not fields: # totally replace chunk diff --git a/zarr/indexing.py b/zarr/indexing.py index c8e0842bd5..a1e8fb1b01 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -197,20 +197,14 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): # past last valid chunk break slice_start = max(start - self.offsets[i], 0 + remainder) - slice_end = min( - stop - self.offsets[i], self.offsets[i + 1] - self.offsets[i] - ) + slice_end = min(stop - self.offsets[i], self.offsets[i + 1] - self.offsets[i]) if slice_start == slice_end: continue - remainder = ( - (self.offsets[i] + slice_start) - self.offsets[i + 1] - ) % step + remainder = ((self.offsets[i] + slice_start) - self.offsets[i + 1]) % step nelem = (slice_end - slice_start) // step self.projections.append( ChunkDimProjection( - i, - slice(slice_start, slice_end, step), - slice(nfilled, nfilled + nelem) + i, slice(slice_start, slice_end, step), slice(nfilled, nfilled + nelem) ) ) nfilled += nelem From 4f890ecac27874632e82e202ad7b371b6b5e6336 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 7 Aug 2023 14:12:35 +0200 Subject: [PATCH 03/12] Fix slices with steps, add tests --- zarr/indexing.py | 73 +++++++++++++++++++++++++++++-------- zarr/tests/test_indexing.py | 21 +++++++++++ 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index a1e8fb1b01..9674990103 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -177,6 +177,9 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): stop = dim_len elif stop < 0: stop = dim_len - stop + else: + stop = min(stop, dim_len) + step = dim_sel.step or 1 if step < 0: raise NotImplementedError @@ -187,27 +190,65 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): self.dim_sel = dim_sel self.projections = [] - remainder = 0 - nfilled = 0 - for i in range(len(chunk_lengths)): - if start > self.offsets[i + 1]: - # not yet at first chunk - continue - if stop < self.offsets[i]: - # past last valid chunk - break - slice_start = max(start - self.offsets[i], 0 + remainder) - slice_end = min(stop - self.offsets[i], self.offsets[i + 1] - self.offsets[i]) - if slice_start == slice_end: + first_chunk, last_chunk = np.searchsorted(self.offsets[1:], [start, stop]) + + # Special case: single chunk + if first_chunk == last_chunk: + slice_start = start - self.offsets[first_chunk] + slice_end = stop - self.offsets[first_chunk] + + n_filled = ceildiv((slice_end - slice_start), step) + + self.projections.append( + ChunkDimProjection( + first_chunk, + slice( + start - self.offsets[first_chunk], + stop - self.offsets[first_chunk], + step + ), + slice(0, n_filled) + ) + ) + self.nitems = n_filled + return + + # First chunk, don't need to worry about step for starting pos + slice_start = start - self.offsets[first_chunk] + slice_end = chunk_lengths[first_chunk] + nfilled = ceildiv((slice_end - slice_start), step) + self.projections.append( + ChunkDimProjection( + first_chunk, + slice( + slice_start, + slice_end, + step, + ), + slice(0, nfilled) + ) + ) + for i in range(first_chunk + 1, last_chunk + 1): + rem = (self.offsets[i] - start) % step + + if rem != 0: + 
slice_start = step - rem + else: + slice_start = 0 + + slice_end = min(chunk_lengths[i], stop-self.offsets[i]) + slice_len = ceildiv((slice_end - slice_start), step) + if slice_end <= slice_start: continue - remainder = ((self.offsets[i] + slice_start) - self.offsets[i + 1]) % step - nelem = (slice_end - slice_start) // step + cur_nfilled = nfilled + slice_len self.projections.append( ChunkDimProjection( - i, slice(slice_start, slice_end, step), slice(nfilled, nfilled + nelem) + i, + slice(slice_start, slice_end, step), + slice(nfilled, cur_nfilled), ) ) - nfilled += nelem + nfilled = cur_nfilled self.nitems = nfilled diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 8a34c1e715..726b70d9c6 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1790,3 +1790,24 @@ def test_accessed_chunks(shape, chunks, ops): ) == 1 # Check that no other chunks were accessed assert len(delta_counts) == 0 + +@pytest.mark.parametrize( + "chunking, index", + [ + ([[10]], (slice(5, 7),)), + ([[1, 2, 2, 5]], (slice(None),)), + ([[1, 2, 2, 5]], (slice(2, 8),)), + ([[1, 2, 2, 5]], (slice(None, None, 2),)), + ([[1, 2, 2, 5]], (slice(1, None, 2),)), + ([[1, 2, 2, 5], 5], (slice(1, None, 2), slice(None))), + ] +) +def test_varchunk_indexing(chunking, index): + from functools import reduce + from operator import mul + shape = tuple(x * 2 if isinstance(x, int) else sum(x) for x in chunking) + np_array = np.arange(reduce(mul, shape)).reshape(shape) + var_chunked = zarr.array(np_array, chunks=chunking) + regular_chunked = zarr.array(np_array) + + np.testing.assert_equal(var_chunked[index], regular_chunked[index]) \ No newline at end of file From b9ba867e14286732acd1ab669617c7d5e63eee52 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 7 Aug 2023 14:37:04 +0200 Subject: [PATCH 04/12] More indexing test cases --- zarr/tests/test_indexing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 726b70d9c6..bbb6ddd12c 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1795,6 +1795,8 @@ def test_accessed_chunks(shape, chunks, ops): "chunking, index", [ ([[10]], (slice(5, 7),)), + ([[1, 9]], (slice(5, 7),)), + ([[1, 5, 4]], (slice(5, 7),)), ([[1, 2, 2, 5]], (slice(None),)), ([[1, 2, 2, 5]], (slice(2, 8),)), ([[1, 2, 2, 5]], (slice(None, None, 2),)), From e12a84920780e8c409ef9da4f3476f349e630831 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 7 Aug 2023 15:04:13 +0200 Subject: [PATCH 05/12] Fix indexing with offsets and steps --- zarr/indexing.py | 57 +++++++++++-------------------------- zarr/tests/test_indexing.py | 12 +++++++- 2 files changed, 27 insertions(+), 42 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 9674990103..81a3f2d3ed 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -192,54 +192,29 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): first_chunk, last_chunk = np.searchsorted(self.offsets[1:], [start, stop]) - # Special case: single chunk - if first_chunk == last_chunk: - slice_start = start - self.offsets[first_chunk] - slice_end = stop - self.offsets[first_chunk] + rem = 0 + nfilled = 0 + next_slice_start = start - self.offsets[first_chunk] - n_filled = ceildiv((slice_end - slice_start), step) - - self.projections.append( - ChunkDimProjection( - first_chunk, - slice( - start - self.offsets[first_chunk], - stop - self.offsets[first_chunk], - step - ), - slice(0, n_filled) - ) - ) - self.nitems = n_filled - 
return - - # First chunk, don't need to worry about step for starting pos - slice_start = start - self.offsets[first_chunk] - slice_end = chunk_lengths[first_chunk] - nfilled = ceildiv((slice_end - slice_start), step) - self.projections.append( - ChunkDimProjection( - first_chunk, - slice( - slice_start, - slice_end, - step, - ), - slice(0, nfilled) - ) - ) - for i in range(first_chunk + 1, last_chunk + 1): - rem = (self.offsets[i] - start) % step + for i in range(first_chunk, last_chunk + 1): + # Setup slice for this chunk + slice_start = next_slice_start + slice_end = min(chunk_lengths[i], stop-self.offsets[i]) + slice_len = ceildiv((slice_end - slice_start), step) + # Prepare for next iteration + # We do this ahead of the next iteration so that we don't need to special case the first chunk + rem = (self.offsets[i + 1] - start) % step if rem != 0: - slice_start = step - rem + next_slice_start = step - rem else: - slice_start = 0 + next_slice_start = 0 - slice_end = min(chunk_lengths[i], stop-self.offsets[i]) - slice_len = ceildiv((slice_end - slice_start), step) + # Skip iteration if slice is empty if slice_end <= slice_start: continue + + # Add projection for this chunk + update nfilled cur_nfilled = nfilled + slice_len self.projections.append( ChunkDimProjection( diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index bbb6ddd12c..98b06b245f 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1794,13 +1794,23 @@ def test_accessed_chunks(shape, chunks, ops): @pytest.mark.parametrize( "chunking, index", [ + # Single chunk ([[10]], (slice(5, 7),)), + # Sample from single chunk ([[1, 9]], (slice(5, 7),)), + # Slice across multiple chunks ([[1, 5, 4]], (slice(5, 7),)), + # Full slice across multiple chunks ([[1, 2, 2, 5]], (slice(None),)), + # slice across multiple chunks ([[1, 2, 2, 5]], (slice(2, 8),)), + # slice across multiple chunks with step ([[1, 2, 2, 5]], (slice(None, None, 2),)), + # slice across multiple chunks with start and step ([[1, 2, 2, 5]], (slice(1, None, 2),)), + # Skip many chunks + ([[3, 3, 1, 1, 1, 1, 3, 1, 3]], (slice(1, None, 4),)), + # Slice 2d case ([[1, 2, 2, 5], 5], (slice(1, None, 2), slice(None))), ] ) @@ -1812,4 +1822,4 @@ def test_varchunk_indexing(chunking, index): var_chunked = zarr.array(np_array, chunks=chunking) regular_chunked = zarr.array(np_array) - np.testing.assert_equal(var_chunked[index], regular_chunked[index]) \ No newline at end of file + np.testing.assert_equal(var_chunked[index], regular_chunked[index]) From 467293703e5ae2c624e8e380adb262d0517b85a7 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 7 Aug 2023 17:09:48 +0200 Subject: [PATCH 06/12] Get boolean + integer indexing working --- zarr/indexing.py | 148 +++++++++++++++++++++++++++++++++++- zarr/tests/test_indexing.py | 10 ++- 2 files changed, 155 insertions(+), 3 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 81a3f2d3ed..3eb41ef1b5 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -522,6 +522,55 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) +class VarBoolArrayDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_lens: list[int]): + + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError( + "Boolean arrays in an orthogonal selection must " "be 1-dimensional only" + ) + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + "Boolean array has the wrong length for dimension; " + "expected {}, got 
{}".format(dim_len, dim_sel.shape[0]) + ) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_lens = dim_chunk_lens + self.nchunks = len(dim_chunk_lens) + self.offsets = np.cumsum([0] + dim_chunk_lens) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype="i8") + np.add.reduceat(dim_sel, self.offsets[:-1], out=self.chunk_nitems) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + def __iter__(self): + + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + + # find region in chunk + dim_offset = self.offsets[dim_chunk_ix] + dim_chunk_sel = self.dim_sel[dim_offset : self.offsets[dim_chunk_ix + 1]] + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + class Order: UNKNOWN = 0 INCREASING = 1 @@ -555,6 +604,95 @@ def boundscheck_indices(x, dim_len): raise BoundsCheckError(dim_len) +class VarIntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + def __init__( + self, + dim_sel, + dim_len, + dim_chunk_lens: list[int], + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError( + "integer arrays in an orthogonal selection must be " "1-dimensional only" + ) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # store attributes + self.dim_len = dim_len + self.dim_chunk_lens = dim_chunk_lens + self.nchunks = len(dim_chunk_lens) + self.nitems = len(dim_sel) + self.offsets = np.cumsum([0] + dim_chunk_lens) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = np.digitize(dim_sel, self.offsets[1:]) + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = order + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + + # find chunks that we need to visit + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def __iter__(self): + + for dim_chunk_ix in self.dim_chunk_ixs: + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = 
self.offsets[dim_chunk_ix] + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + + class IntArrayDimIndexer: """Integer array selection against a single dimension.""" @@ -721,10 +859,16 @@ def __init__(self, selection, array): dim_indexer = VarSliceDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarIntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + if isinstance(dim_chunk_len, int): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = VarBoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: raise IndexError( diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 98b06b245f..894570d1b3 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1812,6 +1812,12 @@ def test_accessed_chunks(shape, chunks, ops): ([[3, 3, 1, 1, 1, 1, 3, 1, 3]], (slice(1, None, 4),)), # Slice 2d case ([[1, 2, 2, 5], 5], (slice(1, None, 2), slice(None))), + # Fancy indexing + ([[1, 2, 2, 5], 1], (np.array([0, 3, 7]),)), + # out of order fancy indexing + ([[1, 3, 1, 1, 5], 1], (np.array([6, 6, 6, 0, 10]),)), + # boolean indexing + ([[2, 1, 2], 2], ([True, False, False, True, True], slice(None))), ] ) def test_varchunk_indexing(chunking, index): @@ -1822,4 +1828,6 @@ def test_varchunk_indexing(chunking, index): var_chunked = zarr.array(np_array, chunks=chunking) regular_chunked = zarr.array(np_array) - np.testing.assert_equal(var_chunked[index], regular_chunked[index]) + expected = regular_chunked[index] + actual = var_chunked[index] + np.testing.assert_equal(actual, expected) From 8033a0a24b3cf330cf9173dca618fbb991cafb94 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Aug 2023 14:17:21 +0000 Subject: [PATCH 07/12] style: pre-commit fixes --- zarr/indexing.py | 6 +++--- zarr/tests/test_indexing.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 3eb41ef1b5..31185d3fe2 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -199,7 +199,7 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): for i in range(first_chunk, last_chunk + 1): # Setup slice for this chunk slice_start = next_slice_start - slice_end = min(chunk_lengths[i], stop-self.offsets[i]) + slice_end = min(chunk_lengths[i], stop - self.offsets[i]) slice_len = ceildiv((slice_end - slice_start), step) # Prepare for next iteration @@ -218,7 +218,7 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): cur_nfilled = nfilled + slice_len self.projections.append( ChunkDimProjection( - i, + i, slice(slice_start, slice_end, step), slice(nfilled, cur_nfilled), ) @@ -571,6 +571,7 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + class Order: UNKNOWN = 0 INCREASING = 1 @@ -692,7 +693,6 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - class IntArrayDimIndexer: """Integer array selection against a single dimension.""" diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 894570d1b3..eb9e5a60cb 100644 
--- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1791,6 +1791,7 @@ def test_accessed_chunks(shape, chunks, ops): # Check that no other chunks were accessed assert len(delta_counts) == 0 + @pytest.mark.parametrize( "chunking, index", [ @@ -1818,11 +1819,12 @@ def test_accessed_chunks(shape, chunks, ops): ([[1, 3, 1, 1, 5], 1], (np.array([6, 6, 6, 0, 10]),)), # boolean indexing ([[2, 1, 2], 2], ([True, False, False, True, True], slice(None))), - ] + ], ) def test_varchunk_indexing(chunking, index): from functools import reduce from operator import mul + shape = tuple(x * 2 if isinstance(x, int) else sum(x) for x in chunking) np_array = np.arange(reduce(mul, shape)).reshape(shape) var_chunked = zarr.array(np_array, chunks=chunking) From 11245007a560522b6ff2b9bf6b45a7861ae670c1 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 9 Aug 2023 10:54:36 -0400 Subject: [PATCH 08/12] types --- zarr/indexing.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 3eb41ef1b5..22d3b18f8b 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -1,7 +1,10 @@ +from __future__ import annotations + import collections import itertools import math import numbers +from typing import List import numpy as np @@ -164,7 +167,7 @@ def __iter__(self): class VarSliceDimIndexer: - def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: list[int]): + def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: List[int]): # normalize start = dim_sel.start if start is None: @@ -232,7 +235,7 @@ def __iter__(self): class VarIntDimIndexer: - def __init__(self, dim_sel: int, dim_len: int, chunk_lengths: list[int]): + def __init__(self, dim_sel: int, dim_len: int, chunk_lengths: List[int]): self.offsets = np.cumsum([0] + chunk_lengths) self.dim_len = dim_len @@ -523,7 +526,7 @@ def __iter__(self): class VarBoolArrayDimIndexer: - def __init__(self, dim_sel, dim_len, dim_chunk_lens: list[int]): + def __init__(self, dim_sel, dim_len, dim_chunk_lens: List[int]): # check number of dimensions if not is_bool_array(dim_sel, 1): @@ -611,7 +614,7 @@ def __init__( self, dim_sel, dim_len, - dim_chunk_lens: list[int], + dim_chunk_lens: List[int], wraparound=True, boundscheck=True, order=Order.UNKNOWN, From 28315217ff3a1445a6b6a1ae888d550e24989831 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 14 Aug 2023 11:10:47 -0400 Subject: [PATCH 09/12] some fix --- zarr/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 5d39ef2d18..06d848558b 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2070,7 +2070,7 @@ def _process_chunk( coords=None, ): """Take binary data from storage and fill output array""" - shape = [c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)] + shape = tuple(c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)) if ( out_is_ndarray and not fields @@ -2321,7 +2321,7 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, coords=None, fields=None): - shape = [c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)] + shape = tuple(c if isinstance(c, int) else c[coords[i]] for i, c in enumerate(self._chunks)) if is_total_slice(chunk_selection, shape) and not fields: # totally replace chunk @@ -2358,12 +2358,12 @@ def _process_for_setitem(self, 
ckey, chunk_selection, value, coords=None, fields ) chunk.fill(self._fill_value) elif self._dtype == object: - chunk = np.empty(self.shape, dtype=self._dtype, order=self._order) + chunk = np.empty(shape, dtype=self._dtype, order=self._order) else: # N.B., use zeros here so any region beyond the array has consistent # and compressible data chunk = np.zeros_like( - self._meta_array, shape=self.shape, dtype=self._dtype, order=self._order + self._meta_array, shape=shape, dtype=self._dtype, order=self._order ) else: From 0d10ccb2fdf6db1bd49a1c495a252957a7247a9c Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 14 Aug 2023 11:21:57 -0400 Subject: [PATCH 10/12] fix more --- zarr/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index ea0dd9fcec..ad7f29a009 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -175,7 +175,7 @@ def normalize_chunks(chunks: Any, shape: Tuple[int, ...], typesize: int) -> Tupl if -1 in chunks or None in chunks: chunks = tuple(s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks)) - chunks = tuple(int(c) for c in chunks) + chunks = tuple(list(int(_) for _ in c) if isinstance(c, list) else int(c) for c in chunks) return chunks From 882f301ec0813d9b983111771ee6c94fa57db927 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 14 Aug 2023 15:12:06 -0400 Subject: [PATCH 11/12] some coverage --- zarr/indexing.py | 8 ++++---- zarr/tests/test_indexing.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index e06d624e44..26ef6d900b 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -173,13 +173,13 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: List[int]): if start is None: start = 0 elif start < 0: - start = dim_len - start + start = dim_len + start stop = dim_sel.stop if stop is None: stop = dim_len elif stop < 0: - stop = dim_len - stop + stop = dim_len + stop else: stop = min(stop, dim_len) @@ -205,8 +205,8 @@ def __init__(self, dim_sel: slice, dim_len: int, chunk_lengths: List[int]): slice_end = min(chunk_lengths[i], stop - self.offsets[i]) slice_len = ceildiv((slice_end - slice_start), step) - # Prepare for next iteration - # We do this ahead of the next iteration so that we don't need to special case the first chunk + # Prepare for next iteration. 
We do this ahead of the next iteration + # so that we don't need to special case the first chunk rem = (self.offsets[i + 1] - start) % step if rem != 0: next_slice_start = step - rem diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index eb9e5a60cb..66c194396f 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1801,6 +1801,8 @@ def test_accessed_chunks(shape, chunks, ops): ([[1, 9]], (slice(5, 7),)), # Slice across multiple chunks ([[1, 5, 4]], (slice(5, 7),)), + # Slice across multiple chunks, negative + ([[1, 5, 4]], (slice(-6, -2),)), # Full slice across multiple chunks ([[1, 2, 2, 5]], (slice(None),)), # slice across multiple chunks @@ -1819,6 +1821,8 @@ def test_accessed_chunks(shape, chunks, ops): ([[1, 3, 1, 1, 5], 1], (np.array([6, 6, 6, 0, 10]),)), # boolean indexing ([[2, 1, 2], 2], ([True, False, False, True, True], slice(None))), + # single int indexing + ([[2, 1, 2]], (1,)), ], ) def test_varchunk_indexing(chunking, index): @@ -1833,3 +1837,26 @@ def test_varchunk_indexing(chunking, index): expected = regular_chunked[index] actual = var_chunked[index] np.testing.assert_equal(actual, expected) + + +# single int indexing +@pytest.mark.parametrize( + "chunking, index, err", + [ + # negative step + ([[2, 1, 2]], (slice(None, None, -1)), NotImplementedError), + # boolean, wrong length + ([[2, 1, 2], 2], ([True, False, False, True], slice(None)), IndexError), + # boolean orthogonal + ([[2, 1, 2], 2], ([True, False, False, True, True], [True, False]), IndexError), + ] +) +def test_varchunk_errors(chunking, index, err): + from functools import reduce + from operator import mul + + shape = tuple(x * 2 if isinstance(x, int) else sum(x) for x in chunking) + np_array = np.arange(reduce(mul, shape)).reshape(shape) + var_chunked = zarr.array(np_array, chunks=chunking) + with pytest.raises(err): + var_chunked[index] From f8d9bf2672a80846a866aa9967ce5331fd1f1e87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Aug 2023 19:12:47 +0000 Subject: [PATCH 12/12] style: pre-commit fixes --- zarr/tests/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 66c194396f..3d43780610 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1849,7 +1849,7 @@ def test_varchunk_indexing(chunking, index): ([[2, 1, 2], 2], ([True, False, False, True], slice(None)), IndexError), # boolean orthogonal ([[2, 1, 2], 2], ([True, False, False, True, True], [True, False]), IndexError), - ] + ], ) def test_varchunk_errors(chunking, index, err): from functools import reduce
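
A minimal usage sketch of the variable-chunking behaviour this patch series exercises, modelled on the new test_varchunk_indexing cases in zarr/tests/test_indexing.py (the list-of-lists chunks= form and the exact chunk lengths below are taken from those tests and are assumptions, not a settled public API):

    import numpy as np
    import zarr

    # Per-dimension chunk specification: a list of chunk lengths for a
    # variably chunked axis (the lengths must sum to the array extent
    # along that axis), a plain int for regularly chunked axes.
    data = np.arange(50).reshape(10, 5)
    z = zarr.array(data, chunks=[[1, 2, 2, 5], 5])

    # Slices with a start and step along the variable axis go through the
    # VarSliceDimIndexer added in zarr/indexing.py; results should match
    # indexing the plain NumPy array.
    np.testing.assert_equal(z[1::2, :], data[1::2, :])

    # Boolean masks on the variable axis are handled by VarBoolArrayDimIndexer,
    # mirroring the ([True, False, ...], slice(None)) test case.
    mask = np.zeros(10, dtype=bool)
    mask[[0, 3, 7]] = True
    np.testing.assert_equal(z[mask, :], data[mask, :])

Integer-array (fancy) selections along a variable axis are routed to VarIntArrayDimIndexer in the same way; negative slice steps are not implemented by VarSliceDimIndexer and raise NotImplementedError, as covered by test_varchunk_errors.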