diff --git a/design_notes/named_array_design_doc.md b/design_notes/named_array_design_doc.md index fc8129af6b0..074f8cf17e7 100644 --- a/design_notes/named_array_design_doc.md +++ b/design_notes/named_array_design_doc.md @@ -87,6 +87,39 @@ The named-array package is designed to be interoperable with other scientific Py Further implementation details are in Appendix: [Implementation Details](#appendix-implementation-details). +## Plan for decoupling lazy indexing functionality from NamedArray + +Today's implementation Xarray's lazy indexing functionality uses three private objects: `*Indexer`, `*IndexingAdapter`, `*Array`. +These objects are needed for two reason: +1. We need to translate from Xarray (NamedArray) indexing rules to bare array indexing rules. + - `*Indexer` objects track the type of indexing - basic, orthogonal, vectorized +2. Not all arrays support the same indexing rules, so we need `*Indexing` adapters + 1. Indexing Adapters today implement `__getitem__` and use type of `*Indexer` object to do appropriate conversions. +3. We also want to support lazy indexing of on-disk arrays. + 1. These again support different types of indexing, so we have `explicit_indexing_adapter` that understands `*Indexer` objects. + +### Goals +1. We would like to keep the lazy indexing array objects, and backend array objects within Xarray. Thus NamedArray cannot treat these objects specially. +2. A key source of confusion (and coupling) is that both lazy indexing arrays and indexing adapters, both handle Indexer objects, and both subclass `ExplicitlyIndexedNDArrayMixin`. These are however conceptually different. + +### Proposal + +1. The `NumpyIndexingAdapter`, `DaskIndexingAdapter`, and `ArrayApiIndexingAdapter` classes will need to migrate to Named Array project since we will want to support indexing of numpy, dask, and array-API arrays appropriately. +2. The `as_indexable` function which wraps an array with the appropriate adapter will also migrate over to named array. +3. Lazy indexing arrays will implement `__getitem__` for basic indexing, `.oindex` for orthogonal indexing, and `.vindex` for vectorized indexing. +4. IndexingAdapter classes will similarly implement `__getitem__`, `oindex`, and `vindex`. +5. `NamedArray.__getitem__` (and `__setitem__`) will still use `*Indexer` objects internally (for e.g. in `NamedArray._broadcast_indexes`), but use `.oindex`, `.vindex` on the underlying indexing adapters. +6. We will move the `*Indexer` and `*IndexingAdapter` classes to Named Array. These will be considered private in the long-term. +7. `as_indexable` will no longer special case `ExplicitlyIndexed` objects (we can special case a new `IndexingAdapter` mixin class that will be private to NamedArray). To handle Xarray's lazy indexing arrays, we will introduce a new `ExplicitIndexingAdapter` which will wrap any array with any of `.oindex` of `.vindex` implemented. + 1. This will be the last case in the if-chain that is, we will try to wrap with all other `IndexingAdapter` objects before using `ExplicitIndexingAdapter` as a fallback. This Adapter will be used for the lazy indexing arrays, and backend arrays. + 2. As with other indexing adapters (point 4 above), this `ExplicitIndexingAdapter` will only implement `__getitem__` and will understand `*Indexer` objects. +8. For backwards compatibility with external backends, we will have to gracefully deprecate `indexing.explicit_indexing_adapter` which translates from Xarray's indexing rules to the indexing supported by the backend. + 1. We could split `explicit_indexing_adapter` in to 3: + - `basic_indexing_adapter`, `outer_indexing_adapter` and `vectorized_indexing_adapter` for public use. + 2. Implement fall back `.oindex`, `.vindex` properties on `BackendArray` base class. These will simply rewrap the `key` tuple with the appropriate `*Indexer` object, and pass it on to `__getitem__` or `__setitem__`. These methods will also raise DeprecationWarning so that external backends will know to migrate to `.oindex`, and `.vindex` over the next year. + +THe most uncertain piece here is maintaining backward compatibility with external backends. We should first migrate a single internal backend, and test out the proposed approach. + ## Project Timeline and Milestones We have identified the following milestones for the completion of this project: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 01c63e889b3..cf494cbd13b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,11 +15,44 @@ What's New np.random.seed(123456) -.. _whats-new.2024.02.0: +.. _whats-new.2024.03.0: -v2024.02.0 (unreleased) +v2024.03.0 (unreleased) ----------------------- +New Features +~~~~~~~~~~~~ + +- Add the ``.oindex`` property to Explicitly Indexed Arrays for orthogonal indexing functionality. (:issue:`8238`, :pull:`8750`) + By `Anderson Banihirwe `_. + + +Breaking changes +~~~~~~~~~~~~~~~~ + + +Deprecations +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + + +.. _whats-new.2024.02.0: + +v2024.02.0 (Feb 19, 2024) +------------------------- + This release brings size information to the text ``repr``, changes to the accepted frequency strings, and various bug fixes. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7331ab1a056..43867bc552b 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -325,6 +325,21 @@ def as_integer_slice(value): return slice(start, stop, step) +class IndexCallable: + """Provide getitem syntax for a callable object.""" + + __slots__ = ("func",) + + def __init__(self, func): + self.func = func + + def __getitem__(self, key): + return self.func(key) + + def __setitem__(self, key, value): + raise NotImplementedError + + class BasicIndexer(ExplicitIndexer): """Tuple for basic indexing. @@ -470,6 +485,13 @@ def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: # Note this is the base class for all lazy indexing classes return np.asarray(self.get_duck_array(), dtype=dtype) + def _oindex_get(self, key): + raise NotImplementedError("This method should be overridden") + + @property + def oindex(self): + return IndexCallable(self._oindex_get) + class ImplicitToExplicitIndexingAdapter(NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" @@ -560,6 +582,9 @@ def get_duck_array(self): def transpose(self, order): return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order) + def _oindex_get(self, indexer): + return type(self)(self.array, self._updated_key(indexer)) + def __getitem__(self, indexer): if isinstance(indexer, VectorizedIndexer): array = LazilyVectorizedIndexedArray(self.array, self.key) @@ -663,6 +688,9 @@ def _ensure_copied(self): def get_duck_array(self): return self.array.get_duck_array() + def _oindex_get(self, key): + return type(self)(_wrap_numpy_scalars(self.array[key])) + def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -696,6 +724,9 @@ def get_duck_array(self): self._ensure_cached() return self.array.get_duck_array() + def _oindex_get(self, key): + return type(self)(_wrap_numpy_scalars(self.array[key])) + def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -1332,6 +1363,10 @@ def _indexing_array_and_key(self, key): def transpose(self, order): return self.array.transpose(order) + def _oindex_get(self, key): + array, key = self._indexing_array_and_key(key) + return array[key] + def __getitem__(self, key): array, key = self._indexing_array_and_key(key) return array[key] @@ -1376,16 +1411,19 @@ def __init__(self, array): ) self.array = array + def _oindex_get(self, key): + # manual orthogonal indexing (implemented like DaskIndexingAdapter) + key = key.tuple + value = self.array + for axis, subkey in reversed(list(enumerate(key))): + value = value[(slice(None),) * axis + (subkey, Ellipsis)] + return value + def __getitem__(self, key): if isinstance(key, BasicIndexer): return self.array[key.tuple] elif isinstance(key, OuterIndexer): - # manual orthogonal indexing (implemented like DaskIndexingAdapter) - key = key.tuple - value = self.array - for axis, subkey in reversed(list(enumerate(key))): - value = value[(slice(None),) * axis + (subkey, Ellipsis)] - return value + return self.oindex[key] else: if isinstance(key, VectorizedIndexer): raise TypeError("Vectorized indexing is not supported") @@ -1395,11 +1433,10 @@ def __getitem__(self, key): def __setitem__(self, key, value): if isinstance(key, (BasicIndexer, OuterIndexer)): self.array[key.tuple] = value + elif isinstance(key, VectorizedIndexer): + raise TypeError("Vectorized indexing is not supported") else: - if isinstance(key, VectorizedIndexer): - raise TypeError("Vectorized indexing is not supported") - else: - raise TypeError(f"Unrecognized indexer: {key}") + raise TypeError(f"Unrecognized indexer: {key}") def transpose(self, order): xp = self.array.__array_namespace__() @@ -1417,24 +1454,25 @@ def __init__(self, array): """ self.array = array - def __getitem__(self, key): + def _oindex_get(self, key): + key = key.tuple + try: + return self.array[key] + except NotImplementedError: + # manual orthogonal indexing + value = self.array + for axis, subkey in reversed(list(enumerate(key))): + value = value[(slice(None),) * axis + (subkey,)] + return value + def __getitem__(self, key): if isinstance(key, BasicIndexer): return self.array[key.tuple] elif isinstance(key, VectorizedIndexer): return self.array.vindex[key.tuple] else: assert isinstance(key, OuterIndexer) - key = key.tuple - try: - return self.array[key] - except NotImplementedError: - # manual orthogonal indexing. - # TODO: port this upstream into dask in a saner way. - value = self.array - for axis, subkey in reversed(list(enumerate(key))): - value = value[(slice(None),) * axis + (subkey,)] - return value + return self.oindex[key] def __setitem__(self, key, value): if isinstance(key, BasicIndexer): @@ -1510,6 +1548,9 @@ def _convert_scalar(self, item): # a NumPy array. return to_0d_array(item) + def _oindex_get(self, key): + return self.__getitem__(key) + def __getitem__( self, indexer ) -> ( diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 8d76cfbe004..6834931fa11 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -41,11 +41,7 @@ maybe_coerce_to_str, ) from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions -from xarray.namedarray.pycompat import ( - integer_types, - is_0d_dask_array, - to_duck_array, -) +from xarray.namedarray.pycompat import integer_types, is_0d_dask_array, to_duck_array NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( indexing.ExplicitlyIndexed, @@ -761,7 +757,14 @@ def __getitem__(self, key) -> Self: array `x.values` directly. """ dims, indexer, new_order = self._broadcast_indexes(key) - data = as_indexable(self._data)[indexer] + indexable = as_indexable(self._data) + + if isinstance(indexer, BasicIndexer): + data = indexable[indexer] + elif isinstance(indexer, OuterIndexer): + data = indexable.oindex[indexer] + else: + data = indexable[indexer] if new_order: data = np.moveaxis(data, range(len(new_order)), new_order) return self._finalize_indexing_result(dims, data) @@ -794,7 +797,12 @@ def _getitem_with_mask(self, key, fill_value=dtypes.NA): else: actual_indexer = indexer - data = as_indexable(self._data)[actual_indexer] + indexable = as_indexable(self._data) + + if isinstance(indexer, OuterIndexer): + data = indexable.oindex[indexer] + else: + data = indexable[actual_indexer] mask = indexing.create_mask(indexer, self.shape, data) # we need to invert the mask in order to pass data first. This helps # pint to choose the correct unit