Enable numbagg in calculation of quantiles (#8684)

* Use `numbagg.nanquantile` by default when `method=linear` and `skipna=True`
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* add `"None"` option to `compute_backend`
* skip tests when `compute_backend == "numbagg"`
* adjust regex pattern to include numbagg error message
* skip test if `compute_backend == "numbagg"` and `q == -0.1`
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* test quantile method w/o numbagg backend
* change `compute_backend` param `"None"` to `None`
* add numbagg `minversion` requirement in `quantile` method
* align `test_quantile_out_of_bounds` with numbagg>=0.7.2
* avoid using numbagg on pint arrays; remove exclusion from tests
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* move numbagg nanquantiles logic to `nputils`-module
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* fix logic related to numbagg `nanquantiles`
* fix logic related to numbagg `nanquantiles`
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* add `whats-new` entry

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pydata · Feb 7, 2024 · 0eb6658 · 0eb6658
1 parent 0f7a034
commit 0eb6658
Show file tree

Hide file tree

Showing 8 changed files with 41 additions and 14 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -28,10 +28,13 @@ New Features
   By `Mathias Hauser <https://github.com/mathause>`_.
 - Add :py:meth:`NamedArray.expand_dims`, :py:meth:`NamedArray.permute_dims` and :py:meth:`NamedArray.broadcast_to`
   (:pull:`8380`) By `Anderson Banihirwe <https://github.com/andersy005>`_.
-
 - Xarray now defers to flox's `heuristics <https://flox.readthedocs.io/en/latest/implementation.html#heuristics>`_
   to set default `method` for groupby problems. This only applies to ``flox>=0.9``.
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- All ``quantile`` methods (e.g. :py:meth:`DataArray.quantile`) now use ``numbagg``
+  for the calculation of nanquantiles (i.e., ``skipna=True``) if it is installed.
+  This is currently limited to the linear interpolation method (``method='linear'``).
+  (:issue:`7377`, :pull:`8684`) By `Marco Wolsza <https://github.com/maawoo>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~

diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py
@@ -195,6 +195,14 @@ def f(values, axis=None, **kwargs):
             and values.dtype.kind in "uifc"
             # and values.dtype.isnative
             and (dtype is None or np.dtype(dtype) == values.dtype)
+            # numbagg.nanquantile requires numbagg>=0.8.0 and the linear method
+            and (
+                name != "nanquantile"
+                or (
+                    pycompat.mod_version("numbagg") >= Version("0.8.0")
+                    and kwargs.get("method", "linear") == "linear"
+                )
+            )
         ):
             import numbagg
 
@@ -206,6 +214,9 @@ def f(values, axis=None, **kwargs):
                 # to ddof=1 above.
                 if pycompat.mod_version("numbagg") < Version("0.7.0"):
                     kwargs.pop("ddof", None)
+                if name == "nanquantile":
+                    kwargs["quantiles"] = kwargs.pop("q")
+                    kwargs.pop("method", None)
                 return nba_func(values, axis=axis, **kwargs)
         if (
             _BOTTLENECK_AVAILABLE
@@ -285,3 +296,4 @@ def least_squares(lhs, rhs, rcond=None, skipna=False):
 nancumprod = _create_method("nancumprod")
 nanargmin = _create_method("nanargmin")
 nanargmax = _create_method("nanargmax")
+nanquantile = _create_method("nanquantile")
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -1992,7 +1992,7 @@ def quantile(
             method = interpolation
 
         if skipna or (skipna is None and self.dtype.kind in "cfO"):
-            _quantile_func = np.nanquantile
+            _quantile_func = nputils.nanquantile
         else:
             _quantile_func = np.quantile
 

diff --git a/xarray/tests/conftest.py b/xarray/tests/conftest.py
@@ -14,9 +14,11 @@ def backend(request):
     return request.param
 
 
-@pytest.fixture(params=["numbagg", "bottleneck"])
+@pytest.fixture(params=["numbagg", "bottleneck", None])
 def compute_backend(request):
-    if request.param == "bottleneck":
+    if request.param is None:
+        options = dict(use_bottleneck=False, use_numbagg=False)
+    elif request.param == "bottleneck":
         options = dict(use_bottleneck=True, use_numbagg=False)
     elif request.param == "numbagg":
         options = dict(use_bottleneck=False, use_numbagg=True)

diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -2888,12 +2888,13 @@ def test_reduce_out(self) -> None:
         with pytest.raises(TypeError):
             orig.mean(out=np.ones(orig.shape))
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize("skipna", [True, False, None])
     @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]])
     @pytest.mark.parametrize(
         "axis, dim", zip([None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]])
     )
-    def test_quantile(self, q, axis, dim, skipna) -> None:
+    def test_quantile(self, q, axis, dim, skipna, compute_backend) -> None:
         va = self.va.copy(deep=True)
         va[0, 0] = np.nan
 

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -5612,9 +5612,10 @@ def test_reduce_keepdims(self) -> None:
         )
         assert_identical(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize("skipna", [True, False, None])
     @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]])
-    def test_quantile(self, q, skipna) -> None:
+    def test_quantile(self, q, skipna, compute_backend) -> None:
         ds = create_test_data(seed=123)
         ds.var1.data[0, 0] = np.nan
 
@@ -5635,8 +5636,9 @@ def test_quantile(self, q, skipna) -> None:
         assert "dim3" in ds_quantile.dims
         assert all(d not in ds_quantile.dims for d in dim)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize("skipna", [True, False])
-    def test_quantile_skipna(self, skipna) -> None:
+    def test_quantile_skipna(self, skipna, compute_backend) -> None:
         q = 0.1
         dim = "time"
         ds = Dataset({"a": ([dim], np.arange(0, 11))})

diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py
@@ -2014,6 +2014,7 @@ def test_squeeze(self, dim, dtype):
         assert_units_equal(expected, actual)
         assert_identical(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize(
         "func",
         (
@@ -2035,7 +2036,7 @@ def test_squeeze(self, dim, dtype):
         ),
         ids=repr,
     )
-    def test_computation(self, func, dtype):
+    def test_computation(self, func, dtype, compute_backend):
         base_unit = unit_registry.m
         array = np.linspace(0, 5, 5 * 10).reshape(5, 10).astype(dtype) * base_unit
         variable = xr.Variable(("x", "y"), array)
@@ -3767,6 +3768,7 @@ def test_differentiate_integrate(self, func, variant, dtype):
         assert_units_equal(expected, actual)
         assert_identical(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize(
         "variant",
         (
@@ -3787,7 +3789,7 @@ def test_differentiate_integrate(self, func, variant, dtype):
         ),
         ids=repr,
     )
-    def test_computation(self, func, variant, dtype):
+    def test_computation(self, func, variant, dtype, compute_backend):
         unit = unit_registry.m
 
         variants = {
@@ -3893,6 +3895,7 @@ def test_resample(self, dtype):
         assert_units_equal(expected, actual)
         assert_identical(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize(
         "variant",
         (
@@ -3913,7 +3916,7 @@ def test_resample(self, dtype):
         ),
         ids=repr,
     )
-    def test_grouped_operations(self, func, variant, dtype):
+    def test_grouped_operations(self, func, variant, dtype, compute_backend):
         unit = unit_registry.m
 
         variants = {
@@ -5250,6 +5253,7 @@ def test_interp_reindex_like_indexing(self, func, unit, error, dtype):
         assert_units_equal(expected, actual)
         assert_equal(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize(
         "func",
         (
@@ -5272,7 +5276,7 @@ def test_interp_reindex_like_indexing(self, func, unit, error, dtype):
             "coords",
         ),
     )
-    def test_computation(self, func, variant, dtype):
+    def test_computation(self, func, variant, dtype, compute_backend):
         variants = {
             "data": ((unit_registry.degK, unit_registry.Pa), 1, 1),
             "dims": ((1, 1), unit_registry.m, 1),
@@ -5404,6 +5408,7 @@ def test_resample(self, variant, dtype):
         assert_units_equal(expected, actual)
         assert_equal(expected, actual)
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize(
         "func",
         (
@@ -5425,7 +5430,7 @@ def test_resample(self, variant, dtype):
             "coords",
         ),
     )
-    def test_grouped_operations(self, func, variant, dtype):
+    def test_grouped_operations(self, func, variant, dtype, compute_backend):
         variants = {
             "data": ((unit_registry.degK, unit_registry.Pa), 1, 1),
             "dims": ((1, 1), unit_registry.m, 1),

diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
@@ -1842,13 +1842,15 @@ def test_quantile_chunked_dim_error(self):
         with pytest.raises(ValueError, match=r"consists of multiple chunks"):
             v.quantile(0.5, dim="x")
 
+    @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True)
     @pytest.mark.parametrize("q", [-0.1, 1.1, [2], [0.25, 2]])
-    def test_quantile_out_of_bounds(self, q):
+    def test_quantile_out_of_bounds(self, q, compute_backend):
         v = Variable(["x", "y"], self.d)
 
         # escape special characters
         with pytest.raises(
-            ValueError, match=r"Quantiles must be in the range \[0, 1\]"
+            ValueError,
+            match=r"(Q|q)uantiles must be in the range \[0, 1\]",
         ):
             v.quantile(q, dim="x")