Merge branch 'main' into eschalk/issue-8231-align

pydata · Feb 24, 2024 · 5cc61b0 · 5cc61b0
2 parents 651289f + d9760f3
commit 5cc61b0
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 30 deletions.
diff --git a/design_notes/named_array_design_doc.md b/design_notes/named_array_design_doc.md
@@ -87,6 +87,39 @@ The named-array package is designed to be interoperable with other scientific Py
 
 Further implementation details are in Appendix: [Implementation Details](#appendix-implementation-details).
 
+## Plan for decoupling lazy indexing functionality from NamedArray
+
+Today's implementation Xarray's lazy indexing functionality uses three private objects: `*Indexer`, `*IndexingAdapter`, `*Array`.
+These objects are needed for two reason:
+1. We need to translate from Xarray (NamedArray) indexing rules to bare array indexing rules.
+   - `*Indexer` objects track the type of indexing - basic, orthogonal, vectorized
+2. Not all arrays support the same indexing rules, so we need `*Indexing` adapters
+   1. Indexing Adapters today implement `__getitem__` and use type of `*Indexer` object to do appropriate conversions.
+3. We also want to support lazy indexing of on-disk arrays.
+   1. These again support different types of indexing, so we have `explicit_indexing_adapter` that understands `*Indexer` objects.
+
+### Goals
+1. We would like to keep the lazy indexing array objects, and backend array objects within Xarray. Thus NamedArray cannot treat these objects specially.
+2. A key source of confusion (and coupling) is that both lazy indexing arrays and indexing adapters, both handle Indexer objects, and both subclass `ExplicitlyIndexedNDArrayMixin`. These are however conceptually different.
+
+### Proposal
+
+1. The `NumpyIndexingAdapter`, `DaskIndexingAdapter`, and `ArrayApiIndexingAdapter` classes will need to migrate to Named Array project since we will want to support indexing of numpy, dask, and array-API arrays appropriately.
+2. The `as_indexable` function which wraps an array with the appropriate adapter will also migrate over to named array.
+3. Lazy indexing arrays will implement `__getitem__` for basic indexing, `.oindex` for orthogonal indexing, and `.vindex` for vectorized indexing.
+4. IndexingAdapter classes will similarly implement `__getitem__`, `oindex`, and `vindex`.
+5. `NamedArray.__getitem__` (and `__setitem__`) will still use `*Indexer` objects internally (for e.g. in `NamedArray._broadcast_indexes`), but use `.oindex`, `.vindex` on the underlying indexing adapters.
+6. We will move the `*Indexer` and `*IndexingAdapter` classes to Named Array. These will be considered private in the long-term.
+7. `as_indexable` will no longer special case `ExplicitlyIndexed` objects (we can special case a new `IndexingAdapter` mixin class that will be private to NamedArray). To handle Xarray's lazy indexing arrays, we will introduce a new `ExplicitIndexingAdapter` which will wrap any array with any of `.oindex` of `.vindex` implemented.
+   1. This will be the last case in the if-chain that is, we will try to wrap with all other `IndexingAdapter` objects before using `ExplicitIndexingAdapter` as a fallback. This Adapter will be used for the lazy indexing arrays, and backend arrays.
+   2. As with other indexing adapters (point 4 above), this `ExplicitIndexingAdapter` will only implement `__getitem__` and will understand `*Indexer` objects.
+8. For backwards compatibility with external backends, we will have to gracefully deprecate `indexing.explicit_indexing_adapter` which translates from Xarray's indexing rules to the indexing supported by the backend.
+   1. We could split `explicit_indexing_adapter` in to 3:
+      - `basic_indexing_adapter`, `outer_indexing_adapter` and `vectorized_indexing_adapter` for public use.
+   2. Implement fall back `.oindex`, `.vindex` properties on `BackendArray` base class. These will simply rewrap the `key` tuple with the appropriate `*Indexer` object, and pass it on to `__getitem__` or `__setitem__`. These methods will also raise DeprecationWarning so that external backends will know to migrate to `.oindex`, and `.vindex` over the next year.
+
+THe most uncertain piece here is maintaining backward compatibility with external backends. We should first migrate a single internal backend, and test out the proposed approach.
+
 ## Project Timeline and Milestones
 
 We have identified the following milestones for the completion of this project:

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -15,11 +15,44 @@ What's New
     np.random.seed(123456)
 
 
-.. _whats-new.2024.02.0:
+.. _whats-new.2024.03.0:
 
-v2024.02.0 (unreleased)
+v2024.03.0 (unreleased)
 -----------------------
 
+New Features
+~~~~~~~~~~~~
+
+- Add the ``.oindex`` property to Explicitly Indexed Arrays for orthogonal indexing functionality. (:issue:`8238`, :pull:`8750`)
+  By `Anderson Banihirwe <https://github.com/andersy005>`_.
+
+
+Breaking changes
+~~~~~~~~~~~~~~~~
+
+
+Deprecations
+~~~~~~~~~~~~
+
+
+Bug fixes
+~~~~~~~~~
+
+
+Documentation
+~~~~~~~~~~~~~
+
+
+Internal Changes
+~~~~~~~~~~~~~~~~
+
+
+
+.. _whats-new.2024.02.0:
+
+v2024.02.0 (Feb 19, 2024)
+-------------------------
+
 This release brings size information to the text ``repr``, changes to the accepted frequency
 strings, and various bug fixes.
 

diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -325,6 +325,21 @@ def as_integer_slice(value):
     return slice(start, stop, step)
 
 
+class IndexCallable:
+    """Provide getitem syntax for a callable object."""
+
+    __slots__ = ("func",)
+
+    def __init__(self, func):
+        self.func = func
+
+    def __getitem__(self, key):
+        return self.func(key)
+
+    def __setitem__(self, key, value):
+        raise NotImplementedError
+
+
 class BasicIndexer(ExplicitIndexer):
     """Tuple for basic indexing.
 
@@ -470,6 +485,13 @@ def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray:
         # Note this is the base class for all lazy indexing classes
         return np.asarray(self.get_duck_array(), dtype=dtype)
 
+    def _oindex_get(self, key):
+        raise NotImplementedError("This method should be overridden")
+
+    @property
+    def oindex(self):
+        return IndexCallable(self._oindex_get)
+
 
 class ImplicitToExplicitIndexingAdapter(NDArrayMixin):
     """Wrap an array, converting tuples into the indicated explicit indexer."""
@@ -560,6 +582,9 @@ def get_duck_array(self):
     def transpose(self, order):
         return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order)
 
+    def _oindex_get(self, indexer):
+        return type(self)(self.array, self._updated_key(indexer))
+
     def __getitem__(self, indexer):
         if isinstance(indexer, VectorizedIndexer):
             array = LazilyVectorizedIndexedArray(self.array, self.key)
@@ -663,6 +688,9 @@ def _ensure_copied(self):
     def get_duck_array(self):
         return self.array.get_duck_array()
 
+    def _oindex_get(self, key):
+        return type(self)(_wrap_numpy_scalars(self.array[key]))
+
     def __getitem__(self, key):
         return type(self)(_wrap_numpy_scalars(self.array[key]))
 
@@ -696,6 +724,9 @@ def get_duck_array(self):
         self._ensure_cached()
         return self.array.get_duck_array()
 
+    def _oindex_get(self, key):
+        return type(self)(_wrap_numpy_scalars(self.array[key]))
+
     def __getitem__(self, key):
         return type(self)(_wrap_numpy_scalars(self.array[key]))
 
@@ -1332,6 +1363,10 @@ def _indexing_array_and_key(self, key):
     def transpose(self, order):
         return self.array.transpose(order)
 
+    def _oindex_get(self, key):
+        array, key = self._indexing_array_and_key(key)
+        return array[key]
+
     def __getitem__(self, key):
         array, key = self._indexing_array_and_key(key)
         return array[key]
@@ -1376,16 +1411,19 @@ def __init__(self, array):
             )
         self.array = array
 
+    def _oindex_get(self, key):
+        # manual orthogonal indexing (implemented like DaskIndexingAdapter)
+        key = key.tuple
+        value = self.array
+        for axis, subkey in reversed(list(enumerate(key))):
+            value = value[(slice(None),) * axis + (subkey, Ellipsis)]
+        return value
+
     def __getitem__(self, key):
         if isinstance(key, BasicIndexer):
             return self.array[key.tuple]
         elif isinstance(key, OuterIndexer):
-            # manual orthogonal indexing (implemented like DaskIndexingAdapter)
-            key = key.tuple
-            value = self.array
-            for axis, subkey in reversed(list(enumerate(key))):
-                value = value[(slice(None),) * axis + (subkey, Ellipsis)]
-            return value
+            return self.oindex[key]
         else:
             if isinstance(key, VectorizedIndexer):
                 raise TypeError("Vectorized indexing is not supported")
@@ -1395,11 +1433,10 @@ def __getitem__(self, key):
     def __setitem__(self, key, value):
         if isinstance(key, (BasicIndexer, OuterIndexer)):
             self.array[key.tuple] = value
+        elif isinstance(key, VectorizedIndexer):
+            raise TypeError("Vectorized indexing is not supported")
         else:
-            if isinstance(key, VectorizedIndexer):
-                raise TypeError("Vectorized indexing is not supported")
-            else:
-                raise TypeError(f"Unrecognized indexer: {key}")
+            raise TypeError(f"Unrecognized indexer: {key}")
 
     def transpose(self, order):
         xp = self.array.__array_namespace__()
@@ -1417,24 +1454,25 @@ def __init__(self, array):
         """
         self.array = array
 
-    def __getitem__(self, key):
+    def _oindex_get(self, key):
+        key = key.tuple
+        try:
+            return self.array[key]
+        except NotImplementedError:
+            # manual orthogonal indexing
+            value = self.array
+            for axis, subkey in reversed(list(enumerate(key))):
+                value = value[(slice(None),) * axis + (subkey,)]
+            return value
 
+    def __getitem__(self, key):
         if isinstance(key, BasicIndexer):
             return self.array[key.tuple]
         elif isinstance(key, VectorizedIndexer):
             return self.array.vindex[key.tuple]
         else:
             assert isinstance(key, OuterIndexer)
-            key = key.tuple
-            try:
-                return self.array[key]
-            except NotImplementedError:
-                # manual orthogonal indexing.
-                # TODO: port this upstream into dask in a saner way.
-                value = self.array
-                for axis, subkey in reversed(list(enumerate(key))):
-                    value = value[(slice(None),) * axis + (subkey,)]
-                return value
+            return self.oindex[key]
 
     def __setitem__(self, key, value):
         if isinstance(key, BasicIndexer):
@@ -1510,6 +1548,9 @@ def _convert_scalar(self, item):
         # a NumPy array.
         return to_0d_array(item)
 
+    def _oindex_get(self, key):
+        return self.__getitem__(key)
+
     def __getitem__(
         self, indexer
     ) -> (

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -41,11 +41,7 @@
     maybe_coerce_to_str,
 )
 from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions
-from xarray.namedarray.pycompat import (
-    integer_types,
-    is_0d_dask_array,
-    to_duck_array,
-)
+from xarray.namedarray.pycompat import integer_types, is_0d_dask_array, to_duck_array
 
 NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
     indexing.ExplicitlyIndexed,
@@ -761,7 +757,14 @@ def __getitem__(self, key) -> Self:
         array `x.values` directly.
         """
         dims, indexer, new_order = self._broadcast_indexes(key)
-        data = as_indexable(self._data)[indexer]
+        indexable = as_indexable(self._data)
+
+        if isinstance(indexer, BasicIndexer):
+            data = indexable[indexer]
+        elif isinstance(indexer, OuterIndexer):
+            data = indexable.oindex[indexer]
+        else:
+            data = indexable[indexer]
         if new_order:
             data = np.moveaxis(data, range(len(new_order)), new_order)
         return self._finalize_indexing_result(dims, data)
@@ -794,7 +797,12 @@ def _getitem_with_mask(self, key, fill_value=dtypes.NA):
             else:
                 actual_indexer = indexer
 
-            data = as_indexable(self._data)[actual_indexer]
+            indexable = as_indexable(self._data)
+
+            if isinstance(indexer, OuterIndexer):
+                data = indexable.oindex[indexer]
+            else:
+                data = indexable[actual_indexer]
             mask = indexing.create_mask(indexer, self.shape, data)
             # we need to invert the mask in order to pass data first. This helps
             # pint to choose the correct unit