From e65bb4d97645dd172f3332dffa9029866be3e2c9 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:29:25 +0200 Subject: [PATCH 01/46] Introduce new arguments limit_direction, limit_area, limit_use coordinate --- xarray/core/dataarray.py | 140 ++++++++--- xarray/core/dataset.py | 142 +++++++---- xarray/core/missing.py | 316 +++++++++++++++++------- xarray/core/types.py | 2 + xarray/tests/test_missing.py | 460 +++++++++++++++++++++++++++++++++-- 5 files changed, 868 insertions(+), 192 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d287564cfe5..1d862e27e57 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -109,6 +109,8 @@ GroupIndices, GroupInput, InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QuantileMethods, @@ -3520,10 +3522,21 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", - limit: int | None = None, - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool | Hashable = False, max_gap: ( None | int @@ -3540,7 +3553,7 @@ def interpolate_na( Parameters ---------- - dim : Hashable or None, optional + dim : Hashable Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -3555,17 +3568,54 @@ def interpolate_na( - 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their respective :py:class:`scipy.interpolate` classes. 
- use_coordinate : bool or str, default: True + use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. If False, values are treated as if - equally-spaced along ``dim``. If True, the IndexVariable `dim` is - used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variable to use as the index. - limit : int or None, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``limit_use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``limit_use_coordinate=True``, for ``limit_direction="forward"`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the nearest valid value + to the left (right for ``limit_direction="backward"``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction="forward"``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only interpolate over gaps less than a given length, see ``max_gap``. 
+ limit_direction : {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area : {"inside", "outside"} or None, default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values (interpolate). + - "outside": Only fill NaNs outside valid values (extrapolate). + + limit_use_coordinate : bool or Hashable, default: False + Specifies which index to use for the ``limit`` distance. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3576,8 +3626,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled - dimensions has not been implemented yet. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -3601,33 +3651,62 @@ def interpolate_na( interpolated: DataArray Filled in DataArray. + Warnings + -------- + When passing fill_value as a keyword argument with method="linear", it does not use + ``numpy.interp`` but it uses ``scipy.interpolate.interp1d``, which provides the fill_value parameter. 
+ See Also -------- numpy.interp scipy.interpolate + pandas.DataFrame.interpolate + + Notes + ----- + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. Examples -------- >>> da = xr.DataArray( - ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} + ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], + ... dims="x", + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) >>> da - Size: 40B - array([nan, 2., 3., nan, 0.]) + + array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 - + * x (x) int64 0 1 2 3 4 5 6 >>> da.interpolate_na(dim="x", method="linear") - Size: 40B - array([nan, 2. , 3. , 1.5, 0. ]) + + array([nan, 2. , 3. , 4. , 5. , 2.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 - - >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") - Size: 40B - array([1. , 2. , 3. , 1.5, 0. ]) + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", + ... method="linear", + ... limit_direction="both", + ... fill_value="extrapolate", + ... ) + + array([1. , 2. , 3. , 4. , 5. , 2.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", method="linear", limit=1, limit_direction="forward" + ... ) + + array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", method="linear", max_gap=2, limit_direction="forward" + ... ) + + array([nan, 2. , nan, nan, 5. , 2.5, 0. 
]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 """ from xarray.core.missing import interp_na @@ -3635,8 +3714,11 @@ def interpolate_na( self, dim=dim, method=method, - limit=limit, use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + limit_use_coordinate=limit_use_coordinate, max_gap=max_gap, keep_attrs=keep_attrs, **kwargs, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a943d9bfc57..eafcb005992 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -160,6 +160,8 @@ GroupInput, InterpOptions, JoinOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QueryEngineOptions, @@ -6707,10 +6709,21 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", - limit: int | None = None, use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool | Hashable = False, max_gap: ( int | float @@ -6720,13 +6733,14 @@ def interpolate_na( | datetime.timedelta | None ) = None, + keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: """Fill in NaNs by interpolating according to different methods. Parameters ---------- - dim : Hashable or None, optional + dim : Hashable Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -6743,15 +6757,52 @@ def interpolate_na( use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. If False, values are treated as if - equally-spaced along ``dim``. 
If True, the IndexVariable `dim` is - used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variable to use as the index. - limit : int, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``limit_use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``limit_use_coordinate=True``, for ``limit_direction="forward"`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the nearest valid value + to the left (right for ``limit_direction="backward"``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction="forward"``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only interpolate over gaps less than a given length, see ``max_gap``. + limit_direction : {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area : {"inside", "outside"} or None, default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. 
+ - "inside": Only fill NaNs surrounded by valid values (interpolate). + - "outside": Only fill NaNs outside valid values (extrapolate). + + limit_use_coordinate : bool or Hashable, default: False + Specifies which index to use for the ``limit`` distance. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ or None, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. @@ -6763,8 +6814,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled - dimensions has not been implemented yet. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -6776,6 +6827,10 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataset's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. 
**kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6794,49 +6849,50 @@ def interpolate_na( numpy.interp scipy.interpolate + Notes + ----- + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + Examples -------- >>> ds = xr.Dataset( ... { - ... "A": ("x", [np.nan, 2, 3, np.nan, 0]), - ... "B": ("x", [3, 4, np.nan, 1, 7]), - ... "C": ("x", [np.nan, np.nan, np.nan, 5, 0]), - ... "D": ("x", [np.nan, 3, np.nan, -1, 4]), + ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), + ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), ... }, - ... coords={"x": [0, 1, 2, 3, 4]}, + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) >>> ds - Size: 200B - Dimensions: (x: 5) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B nan 2.0 3.0 nan 0.0 - B (x) float64 40B 3.0 4.0 nan 1.0 7.0 - C (x) float64 40B nan nan nan 5.0 0.0 - D (x) float64 40B nan 3.0 nan -1.0 4.0 - - >>> ds.interpolate_na(dim="x", method="linear") - Size: 200B - Dimensions: (x: 5) + A (x) float64 nan 2.0 nan nan 5.0 nan 0.0 + B (x) float64 nan 2.0 nan nan 5.0 6.0 nan + >>> ds.interpolate_na( + ... dim="x", + ... method="linear", + ... limit_direction="both", + ... fill_value="extrapolate", + ... ) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B nan 2.0 3.0 1.5 0.0 - B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 - C (x) float64 40B nan nan nan 5.0 0.0 - D (x) float64 40B nan 3.0 1.0 -1.0 4.0 - - >>> ds.interpolate_na(dim="x", method="linear", fill_value="extrapolate") - Size: 200B - Dimensions: (x: 5) + A (x) float64 1.0 2.0 3.0 4.0 5.0 2.5 0.0 + B (x) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 + >>> ds.interpolate_na( + ... 
dim="x", method="linear", limit=1, limit_direction="forward" + ... ) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B 1.0 2.0 3.0 1.5 0.0 - B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 - C (x) float64 40B 20.0 15.0 10.0 5.0 0.0 - D (x) float64 40B 5.0 3.0 1.0 -1.0 4.0 + A (x) float64 nan 2.0 3.0 nan 5.0 2.5 0.0 + B (x) float64 nan 2.0 3.0 nan 5.0 6.0 nan """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b4ca36b31df..d7d69b32872 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -12,7 +12,6 @@ import numpy as np import pandas as pd -from xarray.core import utils from xarray.core.common import _contains_datetime_like_objects, ones_like from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( @@ -25,7 +24,13 @@ transpose, ) from xarray.core.options import _get_keep_attrs -from xarray.core.types import Interp1dOptions, InterpnOptions, InterpOptions +from xarray.core.types import ( + Interp1dOptions, + InterpnOptions, + InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, +) from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import ( Variable, @@ -45,6 +50,84 @@ T = TypeVar("T") +def _get_gap_left_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False +): + arange = ones_like(obj) * index + left = arange.where(~obj.isnull()).ffill(dim) + if outside: + return left.fillna(index[0]) + return left + + +def _get_gap_right_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False +): + arange = ones_like(obj) * index + right = arange.where(~obj.isnull()).bfill(dim) + if outside: + return right.fillna(index[-1]) + return right + + +def _get_gap_dist_to_left_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable +): + arange = ones_like(obj) * 
index + return arange - _get_gap_left_edge(obj, dim, index) + + +def _get_gap_dist_to_right_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable +): + arange = ones_like(obj) * index + return _get_gap_right_edge(obj, dim, index) - arange + + +def _get_limit_fill_mask( + obj: Dataset | DataArray | Variable, + dim: Hashable, + index: Variable, + limit, + limit_direction, +): + if limit_direction == "forward": + limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) <= limit + elif limit_direction == "backward": + limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) <= limit + elif limit_direction == "both": + limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) <= limit) | ( + _get_gap_dist_to_right_edge(obj, dim, index) <= limit + ) + else: + raise ValueError( + f"limit_direction must be one of 'forward', 'backward', 'both'. Got {limit_direction}" + ) + return limit_mask + + +def _get_limit_area_mask( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, limit_area +): + if limit_area == "inside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).notnull() + & _get_gap_right_edge(obj, dim, index).notnull() + ) + area_mask = area_mask | obj.notnull() + elif limit_area == "outside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).isnull() + | _get_gap_right_edge(obj, dim, index).isnull() + ) + area_mask = area_mask | obj.notnull() + else: + raise ValueError( + f"limit_area must be one of 'inside', 'outside' or None. Got {limit_area}" + ) + return area_mask + + def _get_nan_block_lengths( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable ): @@ -52,26 +135,78 @@ def _get_nan_block_lengths( Return an object where each NaN element in 'obj' is replaced by the length of the gap the element is in. 
""" + return _get_gap_right_edge(obj, dim, index, outside=True) - _get_gap_left_edge( + obj, dim, index, outside=True + ) - # make variable so that we get broadcasting for free - index = Variable([dim], index) - # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 - arange = ones_like(obj) * index - valid = obj.notnull() - valid_arange = arange.where(valid) - cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) - - nan_block_lengths = ( - cumulative_nans.diff(dim=dim, label="upper") - .reindex({dim: obj[dim]}) - .where(valid) - .bfill(dim=dim) - .where(~valid, 0) - .fillna(index[-1] - valid_arange.max(dim=[dim])) +def _get_max_gap_mask( + obj: Dataset | DataArray | Variable, + dim: Hashable, + index: Variable, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, +): + nan_block_lengths = _get_nan_block_lengths(obj, dim, index) + return nan_block_lengths <= max_gap + + +def _get_gap_masks( + obj: Dataset | DataArray | Variable, + dim: Hashable, + limit=None, + limit_direction="both", + limit_area=None, + limit_use_coordinate=False, + max_gap=None, + max_gap_use_coordinate=False, +): + # Input checking + ##Limit + if not is_scalar(limit): + raise ValueError("limit must be a scalar.") + + if limit is None: + limit = np.inf + else: + if limit_use_coordinate is False: + if not isinstance(limit, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point limit since limit_use_coordinate=False. Received {type(limit).__name__}." 
+ ) + if _is_time_index(_get_raw_interp_index(obj, dim, limit_use_coordinate)): + limit = timedelta_to_numeric(limit) + + ## Max_gap + if max_gap is not None: + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): + max_gap = timedelta_to_numeric(max_gap) + + if not max_gap_use_coordinate: + if not isinstance(max_gap, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." + ) + # Calculate indexes + index_limit = get_clean_interp_index(obj, dim, use_coordinate=limit_use_coordinate) + index_max_gap = get_clean_interp_index( + obj, dim, use_coordinate=max_gap_use_coordinate ) + # Calculate fill masks + limit_mask = None + if limit != np.inf or limit_direction != "both": + limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) - return nan_block_lengths + limit_area_mask = None + if limit_area is not None: + limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) + + max_gap_mask = None + if max_gap is not None: + max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) + return limit_mask, limit_area_mask, max_gap_mask class BaseInterpolator: @@ -243,8 +378,39 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds +def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = True): + """Return index to use for x values in interpolation or curve fitting. 
+ In comparison to get_clean_interp_index, this function does not convert + to numeric values.""" + + if dim not in arr.dims: + raise ValueError(f"{dim} is not a valid dimension") + + if use_coordinate is False: + return pd.RangeIndex(arr.sizes[dim], name=dim) + + elif use_coordinate is True: + coordinate = arr.coords[ + dim + ] # this will default to a linear coordinate, if no index is present + else: # string/hashable + coordinate = arr.coords[use_coordinate] + if dim not in coordinate.dims: + raise ValueError( + f"Coordinate given by {use_coordinate} must have dimension {dim}." + ) + + if coordinate.ndim != 1: + raise ValueError( + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {coordinate.ndim}D." + ) + index = coordinate.to_index() + return index + + def get_clean_interp_index( - arr, dim: Hashable, use_coordinate: Hashable | bool = True, strict: bool = True + arr, dim: Hashable, use_coordinate: bool | Hashable = True, strict: bool = True ): """Return index to use for x values in interpolation or curve fitting. @@ -254,7 +420,7 @@ def get_clean_interp_index( Array to interpolate or fit to a curve. dim : str Name of dimension along which to fit. - use_coordinate : str or bool + use_coordinate : bool or hashable If use_coordinate is True, the coordinate that shares the name of the dimension along which interpolation is being performed will be used as the x values. If False, the x values are set as an equally spaced sequence. @@ -272,26 +438,10 @@ def get_clean_interp_index( to time deltas with respect to 1970-01-01. """ - # Question: If use_coordinate is a string, what role does `dim` play? 
from xarray.coding.cftimeindex import CFTimeIndex - if use_coordinate is False: - axis = arr.get_axis_num(dim) - return np.arange(arr.shape[axis], dtype=np.float64) - - if use_coordinate is True: - index = arr.get_index(dim) - - else: # string - index = arr.coords[use_coordinate] - if index.ndim != 1: - raise ValueError( - f"Coordinates used for interpolation must be 1D, " - f"{use_coordinate} is {index.ndim}D." - ) - index = index.to_index() - - # TODO: index.name is None for multiindexes + index = _get_raw_interp_index(arr, dim, use_coordinate) + # index.name is None for multiindexes # set name for nice error messages below if isinstance(index, pd.MultiIndex): index.name = dim @@ -324,51 +474,52 @@ def get_clean_interp_index( f"Index {index.name!r} must be castable to float64 to support " f"interpolation or curve fitting, got {type(index).__name__}." ) from err - + index = Variable([dim], index) return index +def _is_time_index(index): + from xarray.coding.cftimeindex import CFTimeIndex + + return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) + + def interp_na( self, dim: Hashable | None = None, - use_coordinate: bool | str = True, method: InterpOptions = "linear", - limit: int | None = None, - max_gap: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, + use_coordinate: bool | str = True, + limit: int + | float + | str + | pd.Timedelta + | np.timedelta64 + | dt.timedelta + | None = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool + | str = False, # backward compatibility + pandas (2.1.4) compatibility + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None = None, keep_attrs: bool | None = None, **kwargs, ): """Interpolate values according to different methods.""" - from xarray.coding.cftimeindex import CFTimeIndex + # Preprocess arguments and do consistency checks if dim is None: raise 
NotImplementedError("dim is a required argument") - if limit is not None: - valids = _get_valid_fill_mask(self, dim, limit) - - if max_gap is not None: - max_type = type(max_gap).__name__ - if not is_scalar(max_gap): - raise ValueError("max_gap must be a scalar.") - - if ( - dim in self._indexes - and isinstance( - self._indexes[dim].to_pandas_index(), pd.DatetimeIndex | CFTimeIndex - ) - and use_coordinate - ): - # Convert to float - max_gap = timedelta_to_numeric(max_gap) - - if not use_coordinate: - if not isinstance(max_gap, Number | np.number): - raise TypeError( - f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." - ) + masks = _get_gap_masks( + self, + dim, + limit, + limit_direction, + limit_area, + limit_use_coordinate, + max_gap, + use_coordinate, + ) # method index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) @@ -384,7 +535,7 @@ def interp_na( arr = apply_ufunc( interpolator, self, - index, + index.values, input_core_dims=[[dim], [dim]], output_core_dims=[[dim]], output_dtypes=[self.dtype], @@ -393,16 +544,9 @@ def interp_na( keep_attrs=keep_attrs, ).transpose(*self.dims) - if limit is not None: - arr = arr.where(valids) - - if max_gap is not None: - if dim not in self.coords: - raise NotImplementedError( - "max_gap not implemented for unlabeled coordinates yet." - ) - nan_block_lengths = _get_nan_block_lengths(self, dim, index) - arr = arr.where(nan_block_lengths <= max_gap) + for m in masks: + if m is not None: + arr = arr.where(m) return arr @@ -563,20 +707,6 @@ def _get_interpolator_nd(method, **kwargs): return interp_class, kwargs -def _get_valid_fill_mask(arr, dim, limit): - """helper function to determine values that can be filled when limit is not - None""" - kw = {dim: limit + 1} - # we explicitly use construct method to avoid copy. 
- new_dim = utils.get_temp_dimname(arr.dims, "_window") - return ( - arr.isnull() - .rolling(min_periods=1, **kw) - .construct(new_dim, fill_value=False) - .sum(new_dim, skipna=False) - ) <= limit - - def _localize(obj: T, indexes_coords: SourceDest) -> tuple[T, SourceDest]: """Speed up for linear and nearest neighbor method. Only consider a subspace that is needed for the interpolation diff --git a/xarray/core/types.py b/xarray/core/types.py index d4224dcead9..ca538cbf19c 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -244,6 +244,8 @@ def copy( ] InterpnOptions = Literal["linear", "nearest", "slinear", "cubic", "quintic", "pchip"] InterpOptions = Union[Interp1dOptions, InterpolantOptions, InterpnOptions] +LimitDirectionOptions = Literal["forward", "backward", "both"] +LimitAreaOptions = Literal["inside", "outside"] DatetimeUnitOptions = Literal[ "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", None diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index eb21cca0861..ddd818d0a3b 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -11,6 +11,12 @@ NumpyInterpolator, ScipyInterpolator, SplineInterpolator, + _get_gap_dist_to_left_edge, + _get_gap_dist_to_right_edge, + _get_gap_left_edge, + _get_gap_right_edge, + _get_limit_area_mask, + _get_limit_fill_mask, _get_nan_block_lengths, get_clean_interp_index, ) @@ -108,12 +114,9 @@ def test_interpolate_pd_compat(method, fill_value) -> None: for dim in ["time", "x"]: actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) - # need limit_direction="both" here, to let pandas fill - # in both directions instead of default forward direction only expected = df.interpolate( method=method, axis=da.get_axis_num(dim), - limit_direction="both", fill_value=fill_value, ) @@ -193,6 +196,48 @@ def test_interpolate_pd_compat_polynomial(): np.testing.assert_allclose(actual.values, expected.values) +@requires_scipy +def 
test_interpolate_pd_compat_limits(): + shapes = [(7, 7)] + frac_nan = 0.5 + method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' + limits = [ + None, + 1, + 3, + ] # pandas 2.1.4 is currently unable to handle coordinate based limits! + limit_directions = [ + "forward", + "backward", + ] # xarray does not support 'None' (pandas: None='forward', unless method='bfill') + limit_areas = [None, "outside", "inside"] + + for shape, limit, limit_direction, limit_area in itertools.product( + shapes, limits, limit_directions, limit_areas + ): + da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) + for dim in ["time", "x"]: + actual = da.interpolate_na( + method=method, + dim=dim, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + use_coordinate=True, + limit_use_coordinate=False, + fill_value="extrapolate", + ) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value="extrapolate", + ) + np.testing.assert_allclose(actual.values, expected.values) + + @requires_scipy def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) @@ -201,12 +246,6 @@ def test_interpolate_unsorted_index_raises(): expected.interpolate_na(dim="x", method="index") -def test_interpolate_no_dim_raises(): - da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") - with pytest.raises(NotImplementedError, match=r"dim is a required argument"): - da.interpolate_na(method="linear") - - def test_interpolate_invalid_interpolator_raises(): da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") with pytest.raises(ValueError, match=r"not a valid"): @@ -307,18 +346,57 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): - da = xr.DataArray( - np.array([1, 2, np.nan, np.nan, np.nan, 6], 
dtype=np.float64), dims="x" + n = np.nan + coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(8) * 2) + coords = {"yt": ("y", pd.Timestamp("2000-01-01") + coord_deltas)} + da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + + actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") + expected = da.copy(data=[n, n, 2, 3, 4, 5, 6, 7]) + assert_equal(actual, expected) + + actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") + expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.interpolate_na( + dim="y", + limit=pd.Timedelta("3H"), + limit_use_coordinate="yt", + fill_value="extrapolate", ) + expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + assert_equal(actual, expected) - actual = da.interpolate_na(dim="x", limit=None) - assert actual.isnull().sum() == 0 - actual = da.interpolate_na(dim="x", limit=2) - expected = xr.DataArray( - np.array([1, 2, 3, 4, np.nan, 6], dtype=np.float64), dims="x" +def test_interpolate_double_coordinate(): + # Check if limit is using 'limit_use_coordinate' and max_gap is using 'use_coordinate' + n = np.nan + da = xr.DataArray( + [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], + dims=["x", "y"], + coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, + ) + actual = da.interpolate_na( + "y", + limit=1, + max_gap=4, + limit_use_coordinate="y1", + use_coordinate="y2", + fill_value="extrapolate", ) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + assert_equal(actual, expected) + actual = da.interpolate_na( + "y", + limit=3, + max_gap=3, + limit_use_coordinate="y2", + use_coordinate="y1", + fill_value="extrapolate", + ) + expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) assert_equal(actual, expected) @@ -563,6 +641,114 @@ def test_bfill_dataset(ds): ds.ffill(dim="time") +def test_get_gap_left_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, 
n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 3, 3, 3, 3, 3, 3, 3, 24], [n, n, n, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + actual = _get_gap_left_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[0, 3, 3, 3, 3, 3, 3, 3, 24], [0, 0, 0, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 2, 2, 2, 2, 2, 2, 2, 14], [n, n, n, 6, 6, 6, 10, 10, 10]] + ) + + +def test_get_gap_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, n, n]] + ) + assert_equal(actual, expected) + + actual = _get_gap_right_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, 24, 24]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[2, 2, 14, 14, 14, 14, 14, 14, 14], [6, 6, 6, 6, 10, 10, 10, n, n]] + ) + + +def test_get_gap_dist_to_left_edge(): + n = np.nan + arr = [ + [n, 1, n, n, 
n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 0, 3, 6, 9, 12, 15, 18, 0], [n, n, n, 0, 3, 6, 0, 3, 6]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy(data=[[n, 0, 3, 4, 5, 6, 8, 10, 0], [n, n, n, 0, 1, 2, 0, 2, 4]]) + + +def test_get_gap_dist_to_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 0, 18, 15, 12, 9, 6, 3, 0], [9, 6, 3, 0, 6, 3, 0, n, n]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy(data=[[2, 0, 9, 8, 7, 6, 4, 2, 0], [5, 3, 0, 4, 3, 2, 0, n, n]]) + + @requires_bottleneck @pytest.mark.parametrize( "y, lengths_expected", @@ -590,6 +776,82 @@ def test_interpolate_na_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +def test_get_limit_fill_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) 
+ index = get_clean_interp_index(da, dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_direction must be one of"): + _get_limit_fill_mask(da, dim="y", index=index, limit=3, limit_direction="cat") + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="forward" + ) + expected = da.copy(data=[[F, T, T, F, F, F, F, F, T], [F, F, F, T, T, T, T, T, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="backward" + ) + expected = da.copy(data=[[T, T, F, F, F, F, F, T, T], [F, F, T, T, T, T, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="both" + ) + expected = da.copy(data=[[T, T, T, F, F, F, F, T, T], [F, F, T, T, T, T, T, T, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="forward" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, F, T, T, F, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="backward" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, F, F, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="both" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, T, F, T, F, F]]) + assert_equal(actual, expected) + + +def test_get_area_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, 5, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_area must be one of"): + _get_limit_area_mask(da, dim="y", index=index, 
limit_area="cow") + + actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="inside") + expected = da.copy(data=[[F, T, T, T, T, T, T, T, T], [F, F, F, T, T, T, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="outside") + expected = da.copy(data=[[T, T, F, F, T, F, F, F, T], [T, T, T, T, F, F, T, T, T]]) + assert_equal(actual, expected) + + @requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_get_clean_interp_index_cf_calendar(cf_da, calendar): @@ -638,6 +900,31 @@ def test_get_clean_interp_index_strict(index): assert clean.dtype == np.float64 +def test_get_clean_interp_index_double_coordinate(): + da = xr.DataArray( + np.ones((2, 7)), + dims=["x", "y"], + coords={ + "x": ("x", [10, 20]), + "y1": ("y", np.arange(7) * 2), + "y2": ("y", np.arange(7) * 3), + }, + ) + with pytest.raises(ValueError, match=r"not a valid dimension"): + get_clean_interp_index(da, "y1", use_coordinate=True) + + actual = get_clean_interp_index(da, "y", use_coordinate=True) + expected = xr.Variable(["y"], np.arange(7)) + assert_equal(actual, expected) + + actual = get_clean_interp_index(da, "y", use_coordinate="y1") + expected = xr.Variable(["y"], np.arange(7) * 2) + assert_equal(actual, expected) + + with pytest.raises(ValueError, match=r"must have dimension"): + get_clean_interp_index(da, "x", use_coordinate="y1") + + @pytest.fixture def da_time(): return xr.DataArray( @@ -647,11 +934,6 @@ def da_time(): def test_interpolate_na_max_gap_errors(da_time): - with pytest.raises( - NotImplementedError, match=r"max_gap not implemented for unlabeled coordinates" - ): - da_time.interpolate_na("t", max_gap=1) - with pytest.raises(ValueError, match=r"max_gap must be a scalar."): da_time.interpolate_na("t", max_gap=(1,)) @@ -690,12 +972,16 @@ def test_interpolate_na_max_gap_time_specifier( @pytest.mark.parametrize( "coords", [ - pytest.param(None, marks=pytest.mark.xfail()), + None, {"x": 
np.arange(4), "y": np.arange(12)}, ], ) -def test_interpolate_na_2d(coords): +def test_interpolate_na_max_gap_2d(coords): n = np.nan + if coords is None: + use_coordinate = False + else: + use_coordinate = True da = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -707,7 +993,7 @@ def test_interpolate_na_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", max_gap=2) + actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2) expected_y = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], @@ -718,18 +1004,20 @@ def test_interpolate_na_2d(coords): ) assert_equal(actual, expected_y) - actual = da.interpolate_na("y", max_gap=1, fill_value="extrapolate") + actual = da.interpolate_na( + "y", use_coordinate=use_coordinate, max_gap=1, fill_value="extrapolate" + ) expected_y_extra = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], [n, n, 3, n, n, 6, n, n, n, 10, n, n], [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", max_gap=3) + actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3) expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -743,6 +1031,124 @@ def test_interpolate_na_2d(coords): assert_equal(actual, expected_x) +def test_interpolate_na_limit_2d(): + n = np.nan + coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(12) * 3) + coords = { + "x": np.arange(3) * 2, + "time": (pd.Timestamp("2000-01-01") + coord_deltas), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + actual = da.interpolate_na("time", limit=1, fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + [n, n, 3, 4, n, 6, 7, n, n, 10, 11, n], + [n, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + 
] + ) + assert_equal(actual, expected) + + actual = da.interpolate_na( + "time", limit=2, limit_direction="backward", fill_value="extrapolate" + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, n, n], + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + actual = da.interpolate_na( + "time", + limit=pd.Timedelta("3H"), + limit_direction="backward", + limit_area="inside", + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n], + [n, n, 3, n, 5, 6, n, n, 9, 10, n, n], + [n, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n], + ] + ) + + actual = da.interpolate_na( + "time", + limit=pd.Timedelta("3H"), + limit_direction="backward", + limit_area="outside", + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, 2, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + actual = da.interpolate_na( + "time", + limit=None, + limit_direction="backward", + limit_area="outside", + limit_use_coordinate=True, + fill_value=8, + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [8, 8, 3, n, n, 6, n, n, n, 10, n, n], + [8, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + da = xr.DataArray( + [ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, n], + [n, n, 3, 3, n, n], + [n, n, n, 4, 4, 4], + ], + dims=["x", "y"], + coords={"x": np.arange(4) * 2}, + ) + actual = da.interpolate_na( + method="linear", + dim="x", + limit=3, + limit_direction="forward", + limit_area=None, + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, 2], + [n, 3, 3, 3, 3, n], + [n, n, 4, 4, 4, 4], + ] + ) + assert_equal(actual, expected) + + @requires_scipy def 
test_interpolators_complex_out_of_bounds(): """Ensure complex nans are used for complex data""" From 63dabc90c4c3e50f09be6566cba180b95421ebb6 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:30:08 +0200 Subject: [PATCH 02/46] Use internal broadcasting and transpose instead of ones_like --- xarray/core/missing.py | 38 +++++++++++++++++++++--------------- xarray/tests/test_missing.py | 38 +++++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index d7d69b32872..de45c379da2 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -12,7 +12,7 @@ import numpy as np import pandas as pd -from xarray.core.common import _contains_datetime_like_objects, ones_like +from xarray.core.common import _contains_datetime_like_objects from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( datetime_to_numeric, @@ -53,8 +53,7 @@ def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): - arange = ones_like(obj) * index - left = arange.where(~obj.isnull()).ffill(dim) + left = index.where(~obj.isnull()).ffill(dim).transpose(*obj.dims) if outside: return left.fillna(index[0]) return left @@ -63,8 +62,7 @@ def _get_gap_left_edge( def _get_gap_right_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): - arange = ones_like(obj) * index - right = arange.where(~obj.isnull()).bfill(dim) + right = index.where(~obj.isnull()).bfill(dim).transpose(*obj.dims) if outside: return right.fillna(index[-1]) return right @@ -73,15 +71,13 @@ def _get_gap_right_edge( def _get_gap_dist_to_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable ): - arange = ones_like(obj) * index - return arange - _get_gap_left_edge(obj, dim, index) + return (index - _get_gap_left_edge(obj, dim, 
index)).transpose(*obj.dims) def _get_gap_dist_to_right_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable ): - arange = ones_like(obj) * index - return _get_gap_right_edge(obj, dim, index) - arange + return (_get_gap_right_edge(obj, dim, index) - index).transpose(*obj.dims) def _get_limit_fill_mask( @@ -189,22 +185,32 @@ def _get_gap_masks( raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." ) + # Which masks are really needed? + need_limit_mask = limit != np.inf or limit_direction != "both" + need_area_mask = limit_area is not None + need_max_gap_mask = max_gap is not None # Calculate indexes - index_limit = get_clean_interp_index(obj, dim, use_coordinate=limit_use_coordinate) - index_max_gap = get_clean_interp_index( - obj, dim, use_coordinate=max_gap_use_coordinate - ) + if need_limit_mask or need_area_mask: + index_limit = get_clean_interp_index( + obj, dim, use_coordinate=limit_use_coordinate + ) + # index_limit = ones_like(obj) * index_limit + if need_max_gap_mask: + index_max_gap = get_clean_interp_index( + obj, dim, use_coordinate=max_gap_use_coordinate + ) + # index_max_gap = ones_like(obj) * index_max_gap # Calculate fill masks limit_mask = None - if limit != np.inf or limit_direction != "both": + if need_limit_mask: limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) limit_area_mask = None - if limit_area is not None: + if need_area_mask: limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) max_gap_mask = None - if max_gap is not None: + if need_max_gap_mask: max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) return limit_mask, limit_area_mask, max_gap_mask diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index ddd818d0a3b..46d27e83874 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -764,7 +764,7 @@ def test_get_gap_dist_to_right_edge(): 
], ], ) -def test_interpolate_na_nan_block_lengths(y, lengths_expected): +def test_get_nan_block_lengths(y, lengths_expected): arr = [ [np.nan, 1, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 4], [np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan], @@ -776,6 +776,42 @@ def test_interpolate_na_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +def test_get_nan_block_lengths_2d(): + n = np.nan + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + dims=["x", "y"], + coords={"x": np.arange(4), "y": np.arange(12) ** 2}, + ) + index = get_clean_interp_index(da, dim="y", use_coordinate=False) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [1, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + ] + ) + assert_equal(actual, expected_y) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [1, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + ] + ) + assert_equal(actual, expected_y) + + def test_get_limit_fill_mask(): T = True F = False From fdd3ca7514381330b7451d3da964be779eaeae56 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:30:17 +0200 Subject: [PATCH 03/46] Typo: Default False in doc for limit_use_coordinates --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1d862e27e57..8f757b3dc92 100644 --- 
a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3609,7 +3609,7 @@ def interpolate_na( - "inside": Only fill NaNs surrounded by valid values (interpolate). - "outside": Only fill NaNs outside valid values (extrapolate). - limit_use_coordinate : bool or Hashable, default: True + limit_use_coordinate : bool or Hashable, default: False Specifies which index to use for the ``limit`` distance. - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index eafcb005992..f5a038b4e39 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6796,7 +6796,7 @@ def interpolate_na( - "inside": Only fill NaNs surrounded by valid values (interpolate). - "outside": Only fill NaNs outside valid values (extrapolate). - limit_use_coordinate : bool or Hashable, default: True + limit_use_coordinate : bool or Hashable, default: False Specifies which index to use for the ``limit`` distance. - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). 
From 8393d722d2c970855d04ddfd24edd3828a537418 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 18:31:27 +0000 Subject: [PATCH 04/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index de45c379da2..a6e7f3af125 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -495,13 +495,9 @@ def interp_na( dim: Hashable | None = None, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: int - | float - | str - | pd.Timedelta - | np.timedelta64 - | dt.timedelta - | None = None, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, limit_direction: LimitDirectionOptions = "forward", limit_area: LimitAreaOptions | None = None, limit_use_coordinate: bool From e7250087b0a2181a1d182c1e5d8fc9fb22f17d00 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:57:54 +0200 Subject: [PATCH 05/46] Towards masked implementation --- xarray/core/missing.py | 52 +++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index a6e7f3af125..b9dd0474f23 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -50,6 +50,37 @@ T = TypeVar("T") +class MaskedDataArray: + def __init__(self, da: DataArray, mask: np.ndarray): + self.da = da + self.mask = mask + + +def mask_gaps( + self, + dim: Hashable | None = None, + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + max_gap: int | float | str 
| pd.Timedelta | np.timedelta64 | dt.timedelta = None, +): + """Mask continues gaps in the data, providing functionality to control gap length and offsets""" + + masks = _get_gap_masks( + self, + dim, + limit, + limit_direction, + limit_area, + max_gap, + use_coordinate, + ) + return masks # tbd + + def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): @@ -88,12 +119,12 @@ def _get_limit_fill_mask( limit_direction, ): if limit_direction == "forward": - limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) <= limit + limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) > limit elif limit_direction == "backward": - limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) <= limit + limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) > limit elif limit_direction == "both": - limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) <= limit) | ( - _get_gap_dist_to_right_edge(obj, dim, index) <= limit + limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) > limit) & ( + _get_gap_dist_to_right_edge(obj, dim, index) > limit ) else: raise ValueError( @@ -107,16 +138,15 @@ def _get_limit_area_mask( ): if limit_area == "inside": area_mask = ( - _get_gap_left_edge(obj, dim, index).notnull() - & _get_gap_right_edge(obj, dim, index).notnull() + _get_gap_left_edge(obj, dim, index).isnull() + | _get_gap_right_edge(obj, dim, index).isnull() ) - area_mask = area_mask | obj.notnull() elif limit_area == "outside": area_mask = ( - _get_gap_left_edge(obj, dim, index).isnull() - | _get_gap_right_edge(obj, dim, index).isnull() + _get_gap_left_edge(obj, dim, index).notnull() + & _get_gap_right_edge(obj, dim, index).notnull() ) - area_mask = area_mask | obj.notnull() + area_mask = area_mask & obj.isnull() else: raise ValueError( f"limit_area must be one of 'inside', 'outside' or None. 
Got {limit_area}" @@ -143,7 +173,7 @@ def _get_max_gap_mask( max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, ): nan_block_lengths = _get_nan_block_lengths(obj, dim, index) - return nan_block_lengths <= max_gap + return nan_block_lengths > max_gap def _get_gap_masks( From d5466f580dfd1b508f06a6d21d29d435051e4c18 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Thu, 20 Jun 2024 21:04:16 +0200 Subject: [PATCH 06/46] Working fill_gaps implementation --- xarray/core/dataarray.py | 308 +++++++++++++++++++----------- xarray/core/dataset.py | 291 ++++++++++++++++++++-------- xarray/core/missing.py | 209 +++++++++++++-------- xarray/tests/test_missing.py | 354 +++++++++++++++++++++++++---------- 4 files changed, 795 insertions(+), 367 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8f757b3dc92..38f380c32cd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -99,6 +99,7 @@ from xarray.core.groupby import DataArrayGroupBy from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling + from xarray.core.missing import GapMask from xarray.core.types import ( CoarsenBoundaryOptions, DatetimeLike, @@ -3520,6 +3521,7 @@ def fillna(self, value: Any) -> Self: out = ops.fillna(self, value) return out + def interpolate_na( self, dim: Hashable, @@ -3534,9 +3536,6 @@ def interpolate_na( | np.timedelta64 | datetime.timedelta ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool | Hashable = False, max_gap: ( None | int @@ -3560,62 +3559,25 @@ def interpolate_na( String indicating which method to use for interpolation: - 'linear': linear interpolation. 
Additional keyword - arguments are passed to :py:func:`numpy.interp` + arguments are passed to :py:func:`numpy.interp` - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': - are passed to :py:func:`scipy.interpolate.interp1d`. If - ``method='polynomial'``, the ``order`` keyword argument must also be - provided. + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be + provided. - 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their - respective :py:class:`scipy.interpolate` classes. + respective :py:class:`scipy.interpolate` classes. - use_coordinate : bool or Hashable, default: True + use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - - limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None - Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. When interpolating along a datetime64 dimension - and ``limit_use_coordinate=True``, ``limit`` can be one of the following: - - - a string that is valid input for pandas.to_timedelta - - a :py:class:`numpy.timedelta64` object - - a :py:class:`pandas.Timedelta` object - - a :py:class:`datetime.timedelta` object - - Otherwise, ``limit`` must be an int or a float. - If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined - as the difference between the coordinate at a NaN value and the coordinate of the next valid value - to the left (right for ``limit_direction=backward``). 
- For example, consider:: - - - array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 - - For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. - To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. If False, values are treated as if + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a + coordinate variable to use as the index. + limit : int or None, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" - Consecutive NaNs will be filled in this direction. - limit_area: {"inside", "outside"} or None: default: None - Consecutive NaNs will be filled with this restriction. - - - None: No fill restriction. - - "inside": Only fill NaNs surrounded by valid values (interpolate). - - "outside": Only fill NaNs outside valid values (extrapolate). - - limit_use_coordinate : bool or Hashable, default: False - Specifies which index to use for the ``limit`` distance. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3626,8 +3588,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. 
If ``use_coordinate=False``, a linear integer - index is created. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -3636,7 +3598,7 @@ def interpolate_na( array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 + * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively keep_attrs : bool or None, default: None @@ -3651,78 +3613,47 @@ def interpolate_na( interpolated: DataArray Filled in DataArray. - Warning - -------- - When passing fill_value as a keyword argument with method="linear", it does not use - ``numpy.interp`` but it uses ``scipy.interpolate.interp1d``, which provides the fill_value parameter. - See Also -------- numpy.interp scipy.interpolate - pandas.DataFrame.interpolate - - Notes - ----- - ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. Examples -------- >>> da = xr.DataArray( - ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], - ... dims="x", - ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} ... ) >>> da - - array([nan, 2., nan, nan, 5., nan, 0.]) + Size: 40B + array([nan, 2., 3., nan, 0.]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 + >>> da.interpolate_na(dim="x", method="linear") - - array([nan, 2. , 3. , 4. , 5. , 2.5, 0. 
]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", - ... method="linear", - ... limit_direction="both", - ... fill_value="extrapolate", - ... ) - - array([1. , 2. , 3. , 4. , 5. , 2.5, 0. ]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", method="linear", limit=1, limit_direction="forward" - ... ) - - array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Size: 40B + array([nan, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", method="linear", max_gap=2, limit_direction="forward" - ... ) - - array([nan, 2. , nan, nan, 5. , 2.5, 0. ]) + * x (x) int64 40B 0 1 2 3 4 + + >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") + Size: 40B + array([1. , 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 """ from xarray.core.missing import interp_na return interp_na( - self, - dim=dim, - method=method, - use_coordinate=use_coordinate, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - limit_use_coordinate=limit_use_coordinate, - max_gap=max_gap, - keep_attrs=keep_attrs, - **kwargs, - ) + self, + dim=dim, + method=method, + limit=limit, + use_coordinate=use_coordinate, + max_gap=max_gap, + keep_attrs=keep_attrs, + **kwargs, + ) + def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward @@ -3891,6 +3822,161 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: from xarray.core.missing import bfill return bfill(self, dim, limit=limit) + + def fill_gaps( + self, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions | None = None, + max_gap: ( + None + | int + | float + | str + | pd.Timedelta + | 
np.timedelta64 + | datetime.timedelta + ) = None, + ) -> GapMask: + """Fill in gaps in the data using one of several filling methods. + Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. + use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction: {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. 
+ + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values + - "outside": Only fill NaNs outside valid values (extrapolate). + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. + + Returns + ------- + Gap Mask: GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. + + See Also + -------- + DataArray.fillna + DataArray.ffill + DataArray.bfill + DataArray.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). 
``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> da = xr.DataArray( + ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], + ... dims="x", + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... ) + >>> da + Size: 56B + array([nan, 2., nan, nan, 5., nan, 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", limit=1, limit_direction="forward" + ... ).interpolate_na(dim="x") + Size: 56B + array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", max_gap=2, limit_direction="forward" + ... ).ffill(dim="x") + Size: 56B + array([nan, 2., nan, nan, 5., 5., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", limit_area="inside" + ... ).fillna(9) + Size: 56B + array([nan, 2., 9., 9., 5., 9., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + """ + from xarray.core.missing import mask_gaps + + return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) + def combine_first(self, other: Self) -> Self: """Combine two DataArray objects, with union of coordinates. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f5a038b4e39..87c7448ba5e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6740,7 +6740,7 @@ def interpolate_na( Parameters ---------- - dim : Hashable + dim : Hashable or None, optional Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -6757,52 +6757,15 @@ def interpolate_na( use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). 
- - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - - limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None - Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. When interpolating along a datetime64 dimension - and ``limit_use_coordinate=True``, ``limit`` can be one of the following: - - - a string that is valid input for pandas.to_timedelta - - a :py:class:`numpy.timedelta64` object - - a :py:class:`pandas.Timedelta` object - - a :py:class:`datetime.timedelta` object - - Otherwise, ``limit`` must be an int or a float. - If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined - as the difference between the coordinate at a NaN value and the coordinate of the next valid value - to the left (right for ``limit_direction=backward``). - For example, consider:: - - - array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 - - For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. - To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. If False, values are treated as if + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a + coordinate variable to use as the index. + limit : int, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" - Consecutive NaNs will be filled in this direction. - limit_area: {"inside", "outside"} or None: default: None - Consecutive NaNs will be filled with this restriction. - - - None: No fill restriction. 
- - "inside": Only fill NaNs surrounded by valid values (interpolate). - - "outside": Only fill NaNs outside valid values (extrapolate). - - limit_use_coordinate : bool or Hashable, default: False - Specifies which index to use for the ``limit`` distance. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ or None, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. @@ -6814,8 +6777,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer - index is created. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -6827,10 +6790,6 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. 
**kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6849,50 +6808,49 @@ def interpolate_na( numpy.interp scipy.interpolate - Notes - ----- - ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. - Examples -------- >>> ds = xr.Dataset( ... { - ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), - ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), + ... "A": ("x", [np.nan, 2, 3, np.nan, 0]), + ... "B": ("x", [3, 4, np.nan, 1, 7]), + ... "C": ("x", [np.nan, np.nan, np.nan, 5, 0]), + ... "D": ("x", [np.nan, 3, np.nan, -1, 4]), ... }, - ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... coords={"x": [0, 1, 2, 3, 4]}, ... ) >>> ds - - Dimensions: (x: 7) + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 nan 2.0 nan nan 5.0 nan 0.0 - B (x) float64 nan 2.0 nan nan 5.0 6.0 nan - >>> ds.interpolate_na( - ... dim="x", - ... method="linear", - ... limit_direction="both", - ... fill_value="extrapolate", - ... ) - - Dimensions: (x: 7) + A (x) float64 40B nan 2.0 3.0 nan 0.0 + B (x) float64 40B 3.0 4.0 nan 1.0 7.0 + C (x) float64 40B nan nan nan 5.0 0.0 + D (x) float64 40B nan 3.0 nan -1.0 4.0 + + >>> ds.interpolate_na(dim="x", method="linear") + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 1.0 2.0 3.0 4.0 5.0 2.5 0.0 - B (x) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 - >>> ds.interpolate_na( - ... dim="x", method="linear", limit=1, limit_direction="forward" - ... 
) - - Dimensions: (x: 7) + A (x) float64 40B nan 2.0 3.0 1.5 0.0 + B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 + C (x) float64 40B nan nan nan 5.0 0.0 + D (x) float64 40B nan 3.0 1.0 -1.0 4.0 + + >>> ds.interpolate_na(dim="x", method="linear", fill_value="extrapolate") + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 nan 2.0 3.0 nan 5.0 2.5 0.0 - B (x) float64 nan 2.0 3.0 nan 5.0 6.0 nan + A (x) float64 40B 1.0 2.0 3.0 1.5 0.0 + B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 + C (x) float64 40B 20.0 15.0 10.0 5.0 0.0 + D (x) float64 40B 5.0 3.0 1.0 -1.0 4.0 """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na @@ -7037,6 +6995,173 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: new = _apply_over_vars_with_dim(bfill, self, dim=dim, limit=limit) return new + def fill_gaps( + self, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions | None = None, + max_gap: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + ) -> GapMask: + """Fill in gaps in the data using one of several filling methods. + Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. + use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. 
+ + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction: {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values + - "outside": Only fill NaNs outside valid values (extrapolate). + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. 
When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. + + Returns + ------- + Gap Mask: GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. + + See Also + -------- + DataArray.fillna + DataArray.ffill + DataArray.bfill + DataArray.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> ds = xr.Dataset( + ... { + ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), + ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), + ... }, + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... 
) + >>> ds + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan + >>> ds.fill_gaps( + ... dim="x", limit=1, limit_direction="forward" + ... ).interpolate_na(dim="x") + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 + B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan + >>> ds.fill_gaps( + ... dim="x", max_gap=2, limit_direction="forward" + ... ).ffill(dim="x") + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 + >>> ds.fill_gaps( + ... dim="x", limit_area="inside" + ... ).fillna(9) + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 + B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan + """ + from xarray.core.missing import mask_gaps + + return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) def combine_first(self, other: Self) -> Self: """Combine two Datasets, default to data_vars of self. 
diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b9dd0474f23..78397a94f7c 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -50,36 +50,23 @@ T = TypeVar("T") -class MaskedDataArray: - def __init__(self, da: DataArray, mask: np.ndarray): - self.da = da - self.mask = mask - - -def mask_gaps( - self, - dim: Hashable | None = None, - use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, -): - """Mask continues gaps in the data, providing functionality to control gap length and offsets""" - - masks = _get_gap_masks( - self, - dim, - limit, - limit_direction, - limit_area, - max_gap, - use_coordinate, - ) - return masks # tbd - +_FILL_MISSING_DOCSTRING_TEMPLATE = """\ +Partly fill nan values in this object's data by applying `{name}` to all unmasked values. + +Parameters +---------- +keep_attrs : bool, default: None + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False, the new object will be returned + without attributes. If None uses the global default. +**kwargs : dict + Additional keyword arguments passed on to `{name}`. + +Returns +------- +filled : same type as caller + New object with `{name}` applied to all unmasked values. +""" def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False @@ -118,13 +105,15 @@ def _get_limit_fill_mask( limit, limit_direction, ): + #At the left boundary, distance to left is nan. 
+ #For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": - limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) > limit + limit_mask = ~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit) elif limit_direction == "backward": - limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) > limit + limit_mask = ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) elif limit_direction == "both": - limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) > limit) & ( - _get_gap_dist_to_right_edge(obj, dim, index) > limit + limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & (~( + _get_gap_dist_to_right_edge(obj, dim, index) <= limit) ) else: raise ValueError( @@ -176,7 +165,7 @@ def _get_max_gap_mask( return nan_block_lengths > max_gap -def _get_gap_masks( +def _get_gap_mask( obj: Dataset | DataArray | Variable, dim: Hashable, limit=None, @@ -230,20 +219,24 @@ def _get_gap_masks( obj, dim, use_coordinate=max_gap_use_coordinate ) # index_max_gap = ones_like(obj) * index_max_gap - # Calculate fill masks - limit_mask = None + if not (need_limit_mask or need_area_mask or need_max_gap_mask): + return None + + # Calculate individual masks + masks=[] if need_limit_mask: - limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) + masks.append(_get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction)) - limit_area_mask = None if need_area_mask: - limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) + masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) - max_gap_mask = None if need_max_gap_mask: - max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) - return limit_mask, limit_area_mask, max_gap_mask - + masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) + #Combine masks + mask=masks[0] + for m in masks[1:]: + mask|=m + return mask class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods""" @@ 
-519,41 +512,15 @@ def _is_time_index(index): return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) - -def interp_na( +def _interp_na_all( self, dim: Hashable | None = None, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool - | str = False, # backward compatibility + pandas (2.1.4) compatibility - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None = None, keep_attrs: bool | None = None, **kwargs, ): - """Interpolate values according to different methods.""" - - # Preprocess arguments and do consistency checks - if dim is None: - raise NotImplementedError("dim is a required argument") - - masks = _get_gap_masks( - self, - dim, - limit, - limit_direction, - limit_area, - limit_use_coordinate, - max_gap, - use_coordinate, - ) - - # method + """Interpolate all nan values, without restrictions regarding the gap size.""" index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) interpolator = partial(func_interpolate_na, interp_class, **kwargs) @@ -575,14 +542,106 @@ def interp_na( vectorize=True, keep_attrs=keep_attrs, ).transpose(*self.dims) + return arr + +class GapMask: + """An object that allows for flexible masking of gaps.""" + def __init__(self, content: DataArray | Dataset, mask: np.ndarray): + self.content = content + self.mask = mask + + def _fill_method(name: str, _fill_function: Callable | None = None): + def method(self, *args, _fill_function=_fill_function, **kwargs): + if _fill_function is None: + _fill_function=getattr(self.content, name) + filled=_fill_function(*args, **kwargs) + else: + filled=_fill_function(self.content, *args, **kwargs) + + if self.mask is not None: + filled=filled.where(~self.mask, 
other=self.content) + return filled + method.__name__ = name + method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) + return method - for m in masks: - if m is not None: - arr = arr.where(m) + ffill=_fill_method('ffill') + bfill=_fill_method('bfill') + fillna=_fill_method('fillna') + interpolate_na=_fill_method('interpolate_na') +def mask_gaps( + self, + dim: Hashable | None = None, + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + limit_direction: LimitDirectionOptions ="both", + limit_area: LimitAreaOptions | None = None, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, +) -> GapMask: + """Mask continuous gaps in the data, providing functionality to control gap length and offsets""" + + mask = _get_gap_mask( + self, + dim, + limit, + limit_direction, + limit_area, + use_coordinate, + max_gap, + use_coordinate, + ) + return GapMask(self, mask) + + + + +def interp_na( + self, + dim: Hashable | None = None, + method: InterpOptions = "linear", + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, + keep_attrs: bool | None = None, + **kwargs, +): + """Interpolate values according to different methods.""" + # Preprocess arguments and do consistency checks + if dim is None: + raise NotImplementedError("dim is a required argument") + + #This was the original behaviour of interp_na and is kept for backward compatibility + #Limit=None: Fill everything, including both boundaries + #Limit!=None: Do forward interpolation until limit + limit_use_coordinate=False + if limit is None: + limit_direction = "both" + else: + limit_direction = "forward" + limit_area = None + mask = _get_gap_mask( + self, + dim, + limit, + limit_direction, + limit_area, + limit_use_coordinate, + max_gap, + 
use_coordinate, + ) + + arr=_interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + if mask is not None: + arr = arr.where(~mask) return arr + def func_interpolate_na(interpolator, y, x, **kwargs): """helper function to apply interpolation along 1 dimension""" # reversed arguments are so that attrs are preserved from da, not index diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 46d27e83874..a2639a05dac 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -104,34 +104,33 @@ def make_interpolate_example_data(shape, frac_nan, seed=12345, non_uniform=False @pytest.mark.parametrize( "method", ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"] ) +@pytest.mark.parametrize("dim", ["time", "x"]) +@pytest.mark.parametrize("shape", [(8, 8), (1, 20), (20, 1), (100, 100)]) +@pytest.mark.parametrize("frac_nan", [0, 0.5, 1]) @requires_scipy -def test_interpolate_pd_compat(method, fill_value) -> None: - shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] - frac_nans = [0, 0.5, 1] +def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None: - for shape, frac_nan in itertools.product(shapes, frac_nans): - da, df = make_interpolate_example_data(shape, frac_nan) + da, df = make_interpolate_example_data(shape, frac_nan) - for dim in ["time", "x"]: - actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) - expected = df.interpolate( - method=method, - axis=da.get_axis_num(dim), - fill_value=fill_value, - ) - - if method == "linear": - # Note, Pandas does not take left/right fill_value into account - # for the numpy linear methods. 
- # see https://github.com/pandas-dev/pandas/issues/55144 - # This aligns the pandas output with the xarray output - fixed = expected.values.copy() - fixed[pd.isnull(actual.values)] = np.nan - fixed[actual.values == fill_value] = fill_value - else: - fixed = expected.values + actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + fill_value=fill_value, + limit_direction='both' + ) - np.testing.assert_allclose(actual.values, fixed) + if method == "linear": + # Note, Pandas does not take left/right fill_value into account + # for the numpy linear methods. + # see https://github.com/pandas-dev/pandas/issues/55144 + # This aligns the pandas output with the xarray output + fixed = expected.values.copy() + fixed[pd.isnull(actual.values)] = np.nan + fixed[actual.values == fill_value] = fill_value + else: + fixed = expected.values + np.testing.assert_allclose(actual.values, fixed) @requires_scipy @@ -217,14 +216,16 @@ def test_interpolate_pd_compat_limits(): ): da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) for dim in ["time", "x"]: - actual = da.interpolate_na( - method=method, + actual = da.fill_gaps( dim=dim, limit=limit, limit_direction=limit_direction, limit_area=limit_area, + use_coordinate=False, + ).interpolate_na( + dim=dim, + method=method, use_coordinate=True, - limit_use_coordinate=False, fill_value="extrapolate", ) expected = df.interpolate( @@ -347,30 +348,30 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): n = np.nan - coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(8) * 2) - coords = {"yt": ("y", pd.Timestamp("2000-01-01") + coord_deltas)} - da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + times=pd.date_range("2000-01-01", periods=9, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n,n,3, n, n, 6, n, 8, n], dims=["y"], coords=coords) actual 
= da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") - expected = da.copy(data=[n, n, 2, 3, 4, 5, 6, 7]) + #With no limit, everything should be interpolated. Introduced in xarray due to a bug (GH7665), but kept for backward compatibility + expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9]) assert_equal(actual, expected) - actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") - expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + actual = da.interpolate_na(dim="y", limit=None, max_gap=2, fill_value="extrapolate") + expected = da.copy(data=[1, 2, 3, n, n, 6, 7, 8, 9]) assert_equal(actual, expected) - actual = da.interpolate_na( - dim="y", - limit=pd.Timedelta("3H"), - limit_use_coordinate="yt", - fill_value="extrapolate", - ) - expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") + expected = da.copy(data=[n, n, 3, 4, n, 6, 7, 8, 9]) assert_equal(actual, expected) + actual = da.interpolate_na(dim="y", limit=1, max_gap=2, fill_value="extrapolate") + expected = da.copy(data=[n, n, 3, n, n, 6, 7, 8, 9]) + assert_equal(actual, expected) def test_interpolate_double_coordinate(): - # Check if limit is using 'limit_use_coordinate' and max_gap is using 'use_coordinate' + # Check if max_gap is able to handle string coordinate names + # Limit is always refering to an index n = np.nan da = xr.DataArray( [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], @@ -378,25 +379,22 @@ def test_interpolate_double_coordinate(): coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) actual = da.interpolate_na( - "y", + dim="y", limit=1, max_gap=4, - limit_use_coordinate="y1", - use_coordinate="y2", - fill_value="extrapolate", - ) - expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + use_coordinate="y1", + fill_value="extrapolate") + expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, 2, n, n, 5, 6, n]]) assert_equal(actual, expected) actual = 
da.interpolate_na( "y", - limit=3, - max_gap=3, - limit_use_coordinate="y2", - use_coordinate="y1", + limit=2, + max_gap=4, + use_coordinate="y2", fill_value="extrapolate", ) - expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, 7]]) assert_equal(actual, expected) @@ -830,37 +828,37 @@ def test_get_limit_fill_mask(): actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="forward" ) - expected = da.copy(data=[[F, T, T, F, F, F, F, F, T], [F, F, F, T, T, T, T, T, F]]) + expected = da.copy(data=[[T, F, F, T, T, T, T, T, F], [T, T, T, F, F, F, F, F, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="backward" ) - expected = da.copy(data=[[T, T, F, F, F, F, F, T, T], [F, F, T, T, T, T, T, F, F]]) + expected = da.copy(data=[[F, F, T, T, T, T, T, F, F], [T, T, F, F, F, F, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="both" ) - expected = da.copy(data=[[T, T, T, F, F, F, F, T, T], [F, F, T, T, T, T, T, T, F]]) + expected = da.copy(data=[[F, F, F, T, T, T, T, F, F], [T, T, F, F, F, F, F, F, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="forward" ) - expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, F, T, T, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, T, F, F, T, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="backward" ) - expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, F, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, T, T, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="both" ) - expected = 
da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, T, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, F, T, F, T, T]]) assert_equal(actual, expected) @@ -880,11 +878,11 @@ def test_get_area_mask(): _get_limit_area_mask(da, dim="y", index=index, limit_area="cow") actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="inside") - expected = da.copy(data=[[F, T, T, T, T, T, T, T, T], [F, F, F, T, T, T, T, F, F]]) + expected = da.copy(data=[[T, F, F, F, F, F, F, F, F], [T, T, T, F, F, F, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="outside") - expected = da.copy(data=[[T, T, F, F, T, F, F, F, T], [T, T, T, T, F, F, T, T, T]]) + expected = da.copy(data=[[F, F, T, T, F, T, T, T, F], [F, F, F, F, T, T, F, F, F]]) assert_equal(actual, expected) @@ -1029,13 +1027,13 @@ def test_interpolate_na_max_gap_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2) + actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2, fill_value='extrapolate') expected_y = da.copy( data=[ - [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y) @@ -1048,12 +1046,12 @@ def test_interpolate_na_max_gap_2d(coords): [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], [n, n, 3, n, n, 6, n, n, n, 10, n, n], [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3) + actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3, 
fill_value="extrapolate") expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -1066,13 +1064,12 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_x) - def test_interpolate_na_limit_2d(): n = np.nan - coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(12) * 3) + times=pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, - "time": (pd.Timestamp("2000-01-01") + coord_deltas), + "time": (times), } da = xr.DataArray( [ @@ -1093,8 +1090,105 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( - "time", limit=2, limit_direction="backward", fill_value="extrapolate" +@requires_scipy +def test_interpolators_complex_out_of_bounds(): + """Ensure complex nans are used for complex data""" + + xi = np.array([-1, 0, 1, 2, 5], dtype=np.float64) + yi = np.exp(1j * xi) + x = np.array([-2, 1, 6], dtype=np.float64) + + expected = np.array( + [np.nan + np.nan * 1j, np.exp(1j), np.nan + np.nan * 1j], dtype=yi.dtype + ) + + for method, interpolator in [ + ("linear", NumpyInterpolator), + ("linear", ScipyInterpolator), + ]: + f = interpolator(xi, yi, method=method) + actual = f(x) + assert_array_equal(actual, expected) + +####Masking Functionality +def test_fill_gaps_limit(): + n = np.nan + times=pd.date_range("2000-01-01", periods=8, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + + actual = da.fill_gaps(dim='y', limit=None).interpolate_na(dim="y", fill_value="extrapolate") + expected = da.copy(data=[0, 1, 2, 3, 4, 5, 6, 7]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim='y', limit=1).interpolate_na(dim="y", fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + use_coordinate="yt", + ).interpolate_na(dim='y', fill_value="extrapolate") + expected = 
da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + limit_direction="backward", + use_coordinate="yt", + ).interpolate_na(dim='y', fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, n, 4, 5, n, n]) + assert_equal(actual, expected) + +def test_mask_gap_limit_2d(): + n = np.nan + times=pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + mask = da.fill_gaps('time', limit=1, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + [n, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + actual=mask.ffill(dim="time") + expected = da.copy( + data=[ + [1, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], + [n, n, 3, 3, 3, 6, 6, n, 6, 10, 10, n], + [n, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], + ] + ) + assert_equal(actual, expected) + actual=mask.fillna(0) + expected = da.copy( + data=[ + [1, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + [n, 0, 3, 0, 0, 6, 0, n, 0, 10, 0, n], + [0, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + ] + ) + assert_equal(actual, expected) + + actual = da.fill_gaps('time', limit=2, use_coordinate=False, limit_direction='backward').interpolate_na( + "time", fill_value="extrapolate" ) expected = da.copy( data=[ @@ -1105,12 +1199,14 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", - limit=pd.Timedelta("3H"), + limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="inside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value="extrapolate", ) 
expected = da.copy( @@ -1121,12 +1217,14 @@ def test_interpolate_na_limit_2d(): ] ) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", - limit=pd.Timedelta("3H"), + limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="outside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value="extrapolate", ) expected = da.copy( @@ -1138,12 +1236,14 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", limit=None, limit_direction="backward", limit_area="outside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value=8, ) expected = da.copy( @@ -1165,14 +1265,16 @@ def test_interpolate_na_limit_2d(): dims=["x", "y"], coords={"x": np.arange(4) * 2}, ) - actual = da.interpolate_na( - method="linear", + actual = da.fill_gaps( dim="x", limit=3, limit_direction="forward", limit_area=None, - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'x', fill_value="extrapolate", + method="linear", ) expected = da.copy( data=[ @@ -1184,23 +1286,79 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) +def test_mask_gap_max_gap_2d(): + n = np.nan + times=pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) -@requires_scipy -def test_interpolators_complex_out_of_bounds(): - """Ensure complex nans are used for complex data""" + mask = da.fill_gaps('time', max_gap=1, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + 
mask = da.fill_gaps('time', max_gap=2, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) - xi = np.array([-1, 0, 1, 2, 5], dtype=np.float64) - yi = np.exp(1j * xi) - x = np.array([-2, 1, 6], dtype=np.float64) + mask = da.fill_gaps('time', max_gap=pd.Timedelta("3h"), use_coordinate=True) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) - expected = np.array( - [np.nan + np.nan * 1j, np.exp(1j), np.nan + np.nan * 1j], dtype=yi.dtype +def test_mask_double_coordinate(): + # Check if limit and max_gap are able to handle string coordinate names + n = np.nan + da = xr.DataArray( + [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], + dims=["x", "y"], + coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) + actual = da.fill_gaps( + "y", + limit=1, + max_gap=4, + use_coordinate="y1", + ).interpolate_na("y", fill_value="extrapolate") + expected = da.copy(data=[[1, 2, 3, 4, 5, 6, 7], [1, 2, n, 4, 5, 6, n]]) + assert_equal(actual, expected) - for method, interpolator in [ - ("linear", NumpyInterpolator), - ("linear", ScipyInterpolator), - ]: - f = interpolator(xi, yi, method=method) - actual = f(x) - assert_array_equal(actual, expected) + actual = da.fill_gaps( + "y", + limit=2, + max_gap=4, + use_coordinate="y2" + ).interpolate_na( + "y", + fill_value="extrapolate", + ) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + assert_equal(actual, expected) \ No newline at end of file From 1b8ea9ed3310b3f83a4d8b6b7158f743c474eebb Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss 
<42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 23 Aug 2024 17:01:20 +0200 Subject: [PATCH 07/46] Remove keep_attrs from docstring of filling functions --- xarray/core/missing.py | 70 ++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 78397a94f7c..af2e60365e5 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -55,10 +55,6 @@ Parameters ---------- -keep_attrs : bool, default: None - If True, the attributes (``attrs``) will be copied from the original - object to the new one. If False, the new object will be returned - without attributes. If None uses the global default. **kwargs : dict Additional keyword arguments passed on to `{name}`. @@ -68,6 +64,7 @@ New object with `{name}` applied to all unmasked values. """ + def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): @@ -105,15 +102,15 @@ def _get_limit_fill_mask( limit, limit_direction, ): - #At the left boundary, distance to left is nan. - #For nan, a<=b and ~(a>b) behave differently + # At the left boundary, distance to left is nan. 
+ # For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": limit_mask = ~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit) elif limit_direction == "backward": limit_mask = ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) elif limit_direction == "both": - limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & (~( - _get_gap_dist_to_right_edge(obj, dim, index) <= limit) + limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & ( + ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) ) else: raise ValueError( @@ -184,7 +181,7 @@ def _get_gap_mask( limit = np.inf else: if limit_use_coordinate is False: - if not isinstance(limit, (Number, np.number)): + if not isinstance(limit, Number | np.number): raise TypeError( f"Expected integer or floating point limit since limit_use_coordinate=False. Received {type(limit).__name__}." ) @@ -200,7 +197,7 @@ def _get_gap_mask( max_gap = timedelta_to_numeric(max_gap) if not max_gap_use_coordinate: - if not isinstance(max_gap, (Number, np.number)): + if not isinstance(max_gap, Number | np.number): raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." 
) @@ -223,21 +220,24 @@ def _get_gap_mask( return None # Calculate individual masks - masks=[] + masks = [] if need_limit_mask: - masks.append(_get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction)) + masks.append( + _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) + ) if need_area_mask: masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) if need_max_gap_mask: masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) - #Combine masks - mask=masks[0] + # Combine masks + mask = masks[0] for m in masks[1:]: - mask|=m + mask |= m return mask + class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods""" @@ -510,7 +510,8 @@ def get_clean_interp_index( def _is_time_index(index): from xarray.coding.cftimeindex import CFTimeIndex - return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) + return isinstance(index, pd.DatetimeIndex | CFTimeIndex) + def _interp_na_all( self, @@ -544,31 +545,35 @@ def _interp_na_all( ).transpose(*self.dims) return arr + class GapMask: """An object that allows for flexible masking of gaps.""" + def __init__(self, content: DataArray | Dataset, mask: np.ndarray): self.content = content self.mask = mask - + def _fill_method(name: str, _fill_function: Callable | None = None): def method(self, *args, _fill_function=_fill_function, **kwargs): if _fill_function is None: - _fill_function=getattr(self.content, name) - filled=_fill_function(*args, **kwargs) + _fill_function = getattr(self.content, name) + filled = _fill_function(*args, **kwargs) else: - filled=_fill_function(self.content, *args, **kwargs) + filled = _fill_function(self.content, *args, **kwargs) if self.mask is not None: - filled=filled.where(~self.mask, other=self.content) + filled = filled.where(~self.mask, other=self.content) return filled + method.__name__ = name method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) return method - ffill=_fill_method('ffill') - 
bfill=_fill_method('bfill') - fillna=_fill_method('fillna') - interpolate_na=_fill_method('interpolate_na') + ffill = _fill_method("ffill") + bfill = _fill_method("bfill") + fillna = _fill_method("fillna") + interpolate_na = _fill_method("interpolate_na") + def mask_gaps( self, @@ -577,7 +582,7 @@ def mask_gaps( limit: ( int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None ) = None, - limit_direction: LimitDirectionOptions ="both", + limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, ) -> GapMask: @@ -596,8 +601,6 @@ def mask_gaps( return GapMask(self, mask) - - def interp_na( self, dim: Hashable | None = None, @@ -615,10 +618,10 @@ def interp_na( if dim is None: raise NotImplementedError("dim is a required argument") - #This was the original behaviour of interp_na and is kept for backward compatibility - #Limit=None: Fill everything, including both boundaries - #Limit!=None: Do forward interpolation until limit - limit_use_coordinate=False + # This was the original behaviour of interp_na and is kept for backward compatibility + # Limit=None: Fill everything, including both boundaries + # Limit!=None: Do forward interpolation until limit + limit_use_coordinate = False if limit is None: limit_direction = "both" else: @@ -635,13 +638,12 @@ def interp_na( use_coordinate, ) - arr=_interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + arr = _interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) if mask is not None: arr = arr.where(~mask) return arr - def func_interpolate_na(interpolator, y, x, **kwargs): """helper function to apply interpolation along 1 dimension""" # reversed arguments are so that attrs are preserved from da, not index From b956e14468e5936a9110ae2bb5646b628bf8ad68 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 23 Aug 2024 
18:11:22 +0200 Subject: [PATCH 08/46] Fix typos, undo empty spaces, remove temporarily introduced arguments --- xarray/core/dataarray.py | 91 ++++++++++++++++++---------------------- xarray/core/dataset.py | 60 +++++++++++++------------- xarray/core/missing.py | 2 +- 3 files changed, 71 insertions(+), 82 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 38f380c32cd..ce2d79580e8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -97,9 +97,9 @@ from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy + from xarray.core.missing import GapMask from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling - from xarray.core.missing import GapMask from xarray.core.types import ( CoarsenBoundaryOptions, DatetimeLike, @@ -3521,21 +3521,12 @@ def fillna(self, value: Any) -> Self: out = ops.fillna(self, value) return out - def interpolate_na( self, dim: Hashable, method: InterpOptions = "linear", + limit: int | None = None, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, max_gap: ( None | int @@ -3559,25 +3550,25 @@ def interpolate_na( String indicating which method to use for interpolation: - 'linear': linear interpolation. Additional keyword - arguments are passed to :py:func:`numpy.interp` + arguments are passed to :py:func:`numpy.interp` - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': - are passed to :py:func:`scipy.interpolate.interp1d`. If - ``method='polynomial'``, the ``order`` keyword argument must also be - provided. + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be + provided. 
- 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their - respective :py:class:`scipy.interpolate` classes. + respective :py:class:`scipy.interpolate` classes. + limit : int or None, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if equally-spaced along ``dim``. If True, the IndexVariable `dim` is used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. - limit : int or None, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, - see ``max_gap``. max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3598,7 +3589,7 @@ def interpolate_na( array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 + * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively keep_attrs : bool or None, default: None @@ -3627,33 +3618,32 @@ def interpolate_na( Size: 40B array([nan, 2., 3., nan, 0.]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 >>> da.interpolate_na(dim="x", method="linear") Size: 40B array([nan, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") Size: 40B array([1. 
, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 """ from xarray.core.missing import interp_na return interp_na( - self, - dim=dim, - method=method, - limit=limit, - use_coordinate=use_coordinate, - max_gap=max_gap, - keep_attrs=keep_attrs, - **kwargs, - ) - + self, + dim=dim, + method=method, + limit=limit, + use_coordinate=use_coordinate, + max_gap=max_gap, + keep_attrs=keep_attrs, + **kwargs, + ) def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward @@ -3822,7 +3812,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: from xarray.core.missing import bfill return bfill(self, dim, limit=limit) - + def fill_gaps( self, dim: Hashable, @@ -3917,10 +3907,6 @@ def fill_gaps( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. Returns ------- @@ -3951,23 +3937,19 @@ def fill_gaps( array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", limit=1, limit_direction="forward" - ... ).interpolate_na(dim="x") + >>> da.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) Size: 56B array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", max_gap=2, limit_direction="forward" - ... ).ffill(dim="x") + >>> da.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 56B array([nan, 2., nan, nan, 5., 5., 0.]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", limit_area="inside" - ... 
).fillna(9) + >>> da.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 56B array([nan, 2., 9., 9., 5., 9., 0.]) Coordinates: @@ -3975,8 +3957,15 @@ def fill_gaps( """ from xarray.core.missing import mask_gaps - return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) - + return mask_gaps( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) def combine_first(self, other: Self) -> Self: """Combine two DataArray objects, with union of coordinates. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 87c7448ba5e..8b9ed014a39 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -142,6 +142,7 @@ from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy from xarray.core.merge import CoercibleMapping, CoercibleValue, _MergeResult + from xarray.core.missing import GapMask from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling from xarray.core.types import ( @@ -6711,19 +6712,8 @@ def interpolate_na( self, dim: Hashable, method: InterpOptions = "linear", + limit: int | None = None, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool | Hashable = False, max_gap: ( int | float @@ -6790,6 +6780,10 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. 
**kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6854,6 +6848,9 @@ def interpolate_na( """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + new = _apply_over_vars_with_dim( interp_na, self, @@ -6862,8 +6859,10 @@ def interpolate_na( limit=limit, use_coordinate=use_coordinate, max_gap=max_gap, + keep_attrs=keep_attrs, **kwargs, ) + new.attrs = self.attrs if keep_attrs else None return new def ffill(self, dim: Hashable, limit: int | None = None) -> Self: @@ -7089,10 +7088,6 @@ def fill_gaps( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. Returns ------- @@ -7101,10 +7096,10 @@ def fill_gaps( See Also -------- - DataArray.fillna - DataArray.ffill - DataArray.bfill - DataArray.interpolate_na + Dataset.fillna + Dataset.ffill + Dataset.bfill + Dataset.interpolate_na pandas.DataFrame.interpolate Notes @@ -7128,9 +7123,9 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan - >>> ds.fill_gaps( - ... dim="x", limit=1, limit_direction="forward" - ... ).interpolate_na(dim="x") + >>> ds.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) Size: 168B Dimensions: (x: 7) Coordinates: @@ -7138,9 +7133,7 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan - >>> ds.fill_gaps( - ... dim="x", max_gap=2, limit_direction="forward" - ... 
).ffill(dim="x") + >>> ds.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 168B Dimensions: (x: 7) Coordinates: @@ -7148,9 +7141,7 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 - >>> ds.fill_gaps( - ... dim="x", limit_area="inside" - ... ).fillna(9) + >>> ds.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 168B Dimensions: (x: 7) Coordinates: @@ -7161,7 +7152,16 @@ def fill_gaps( """ from xarray.core.missing import mask_gaps - return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) + return mask_gaps( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) + def combine_first(self, other: Self) -> Self: """Combine two Datasets, default to data_vars of self. diff --git a/xarray/core/missing.py b/xarray/core/missing.py index af2e60365e5..687d7902452 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -470,7 +470,7 @@ def get_clean_interp_index( from xarray.coding.cftimeindex import CFTimeIndex index = _get_raw_interp_index(arr, dim, use_coordinate) - # index.name is None for multiindexes + # TODO: index.name is None for multiindexes # set name for nice error messages below if isinstance(index, pd.MultiIndex): index.name = dim From d717dd9493a1f2c3934edaef06ea2bcf78ecbc8e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:22:12 +0000 Subject: [PATCH 09/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_missing.py | 101 +++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index a2639a05dac..d9d04b953a3 
100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -117,7 +117,7 @@ def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None method=method, axis=da.get_axis_num(dim), fill_value=fill_value, - limit_direction='both' + limit_direction="both", ) if method == "linear": @@ -348,12 +348,12 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): n = np.nan - times=pd.date_range("2000-01-01", periods=9, freq="2h") + times = pd.date_range("2000-01-01", periods=9, freq="2h") coords = {"yt": ("y", times)} - da = xr.DataArray([n,n,3, n, n, 6, n, 8, n], dims=["y"], coords=coords) + da = xr.DataArray([n, n, 3, n, n, 6, n, 8, n], dims=["y"], coords=coords) actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") - #With no limit, everything should be interpolated. Introduced in xarray due to a bug (GH7665), but kept for backward compatibility + # With no limit, everything should be interpolated. 
Introduced in xarray due to a bug (GH7665), but kept for backward compatibility expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9]) assert_equal(actual, expected) @@ -369,6 +369,7 @@ def test_interpolate_limits(): expected = da.copy(data=[n, n, 3, n, n, 6, 7, 8, 9]) assert_equal(actual, expected) + def test_interpolate_double_coordinate(): # Check if max_gap is able to handle string coordinate names # Limit is always refering to an index @@ -379,11 +380,8 @@ def test_interpolate_double_coordinate(): coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) actual = da.interpolate_na( - dim="y", - limit=1, - max_gap=4, - use_coordinate="y1", - fill_value="extrapolate") + dim="y", limit=1, max_gap=4, use_coordinate="y1", fill_value="extrapolate" + ) expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, 2, n, n, 5, 6, n]]) assert_equal(actual, expected) @@ -1027,7 +1025,9 @@ def test_interpolate_na_max_gap_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2, fill_value='extrapolate') + actual = da.interpolate_na( + "y", use_coordinate=use_coordinate, max_gap=2, fill_value="extrapolate" + ) expected_y = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], @@ -1051,7 +1051,9 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3, fill_value="extrapolate") + actual = da.interpolate_na( + "x", use_coordinate=use_coordinate, max_gap=3, fill_value="extrapolate" + ) expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -1064,9 +1066,10 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_x) + def test_interpolate_na_limit_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1090,6 +1093,7 @@ def 
test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) + @requires_scipy def test_interpolators_complex_out_of_bounds(): """Ensure complex nans are used for complex data""" @@ -1110,18 +1114,23 @@ def test_interpolators_complex_out_of_bounds(): actual = f(x) assert_array_equal(actual, expected) + ####Masking Functionality def test_fill_gaps_limit(): n = np.nan - times=pd.date_range("2000-01-01", periods=8, freq="2h") + times = pd.date_range("2000-01-01", periods=8, freq="2h") coords = {"yt": ("y", times)} da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) - actual = da.fill_gaps(dim='y', limit=None).interpolate_na(dim="y", fill_value="extrapolate") + actual = da.fill_gaps(dim="y", limit=None).interpolate_na( + dim="y", fill_value="extrapolate" + ) expected = da.copy(data=[0, 1, 2, 3, 4, 5, 6, 7]) assert_equal(actual, expected) - actual = da.fill_gaps(dim='y', limit=1).interpolate_na(dim="y", fill_value="extrapolate") + actual = da.fill_gaps(dim="y", limit=1).interpolate_na( + dim="y", fill_value="extrapolate" + ) expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) assert_equal(actual, expected) @@ -1129,7 +1138,7 @@ def test_fill_gaps_limit(): dim="y", limit=pd.Timedelta("3h"), use_coordinate="yt", - ).interpolate_na(dim='y', fill_value="extrapolate") + ).interpolate_na(dim="y", fill_value="extrapolate") expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) assert_equal(actual, expected) @@ -1138,13 +1147,14 @@ def test_fill_gaps_limit(): limit=pd.Timedelta("3h"), limit_direction="backward", use_coordinate="yt", - ).interpolate_na(dim='y', fill_value="extrapolate") + ).interpolate_na(dim="y", fill_value="extrapolate") expected = da.copy(data=[n, 1, 2, n, 4, 5, n, n]) assert_equal(actual, expected) + def test_mask_gap_limit_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1158,8 +1168,8 @@ def 
test_mask_gap_limit_2d(): coords=coords, ) - mask = da.fill_gaps('time', limit=1, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", limit=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], @@ -1168,7 +1178,7 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) - actual=mask.ffill(dim="time") + actual = mask.ffill(dim="time") expected = da.copy( data=[ [1, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], @@ -1177,7 +1187,7 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) - actual=mask.fillna(0) + actual = mask.fillna(0) expected = da.copy( data=[ [1, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], @@ -1187,9 +1197,9 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) - actual = da.fill_gaps('time', limit=2, use_coordinate=False, limit_direction='backward').interpolate_na( - "time", fill_value="extrapolate" - ) + actual = da.fill_gaps( + "time", limit=2, use_coordinate=False, limit_direction="backward" + ).interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], @@ -1204,9 +1214,9 @@ def test_mask_gap_limit_2d(): limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="inside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value="extrapolate", ) expected = da.copy( @@ -1222,9 +1232,9 @@ def test_mask_gap_limit_2d(): limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="outside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value="extrapolate", ) expected = da.copy( @@ -1241,9 +1251,9 @@ def test_mask_gap_limit_2d(): limit=None, limit_direction="backward", limit_area="outside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value=8, ) expected = da.copy( @@ -1270,9 +1280,9 
@@ def test_mask_gap_limit_2d(): limit=3, limit_direction="forward", limit_area=None, - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'x', + "x", fill_value="extrapolate", method="linear", ) @@ -1286,9 +1296,10 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) + def test_mask_gap_max_gap_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1302,8 +1313,8 @@ def test_mask_gap_max_gap_2d(): coords=coords, ) - mask = da.fill_gaps('time', max_gap=1, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], @@ -1312,8 +1323,8 @@ def test_mask_gap_max_gap_2d(): ] ) assert_equal(actual, expected) - mask = da.fill_gaps('time', max_gap=2, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=2, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], @@ -1323,8 +1334,8 @@ def test_mask_gap_max_gap_2d(): ) assert_equal(actual, expected) - mask = da.fill_gaps('time', max_gap=pd.Timedelta("3h"), use_coordinate=True) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=pd.Timedelta("3h"), use_coordinate=True) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], @@ -1334,6 +1345,7 @@ def test_mask_gap_max_gap_2d(): ) assert_equal(actual, expected) + def test_mask_double_coordinate(): # Check if limit and max_gap are able to handle string coordinate names n = np.nan @@ -1351,14 +1363,9 @@ def 
test_mask_double_coordinate(): expected = da.copy(data=[[1, 2, 3, 4, 5, 6, 7], [1, 2, n, 4, 5, 6, n]]) assert_equal(actual, expected) - actual = da.fill_gaps( - "y", - limit=2, - max_gap=4, - use_coordinate="y2" - ).interpolate_na( + actual = da.fill_gaps("y", limit=2, max_gap=4, use_coordinate="y2").interpolate_na( "y", fill_value="extrapolate", ) expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) - assert_equal(actual, expected) \ No newline at end of file + assert_equal(actual, expected) From 0c4fdab0f6d896d80c1ebde2c3e116b9d7f597af Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:13:01 +0200 Subject: [PATCH 10/46] Add line break for readability --- xarray/core/dataarray.py | 13 +++++++++---- xarray/core/dataset.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index ce2d79580e8..e1d1ffb1290 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3614,6 +3614,7 @@ def interpolate_na( >>> da = xr.DataArray( ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} ... ) + >>> da Size: 40B array([nan, 2., 3., nan, 0.]) @@ -3932,28 +3933,32 @@ def fill_gaps( ... dims="x", ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) + >>> da Size: 56B array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( ... dim="x" ... ) Size: 56B array([nan, 2. , 3. , nan, 5. , 2.5, 0. 
]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 56B array([nan, 2., nan, nan, 5., 5., 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 56B array([nan, 2., 9., 9., 5., 9., 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 """ from xarray.core.missing import mask_gaps diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8b9ed014a39..149c6fc3b9a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6813,6 +6813,7 @@ def interpolate_na( ... }, ... coords={"x": [0, 1, 2, 3, 4]}, ... ) + >>> ds Size: 200B Dimensions: (x: 5) @@ -7115,37 +7116,41 @@ def fill_gaps( ... }, ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) + >>> ds Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan + >>> ds.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( ... dim="x" ... 
) Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan + >>> ds.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 + >>> ds.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan From 7f06b3ac0eb69de0bb4905ea561c3328804757cc Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:47:58 +0200 Subject: [PATCH 11/46] Enforce kwargs to be passed by name --- xarray/core/dataarray.py | 1 + xarray/core/dataset.py | 1 + 2 files changed, 2 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e1d1ffb1290..294801f7522 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3817,6 +3817,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: def fill_gaps( self, dim: Hashable, + *, use_coordinate: bool | Hashable = True, limit: ( None diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 149c6fc3b9a..6fd7ef55265 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6998,6 +6998,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: def fill_gaps( self, dim: Hashable, + *, use_coordinate: bool | Hashable = True, limit: ( None From 6090a4d3af00c9dd904cf4c2e110002202be118d Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 22:16:23 +0200 
Subject: [PATCH 12/46] Keep_Attrs: Default to True --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6fd7ef55265..57f72084242 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6850,7 +6850,7 @@ def interpolate_na( from xarray.core.missing import _apply_over_vars_with_dim, interp_na if keep_attrs is None: - keep_attrs = _get_keep_attrs(default=False) + keep_attrs = _get_keep_attrs(default=True) new = _apply_over_vars_with_dim( interp_na, From f8cc0c515702a8c4eb75f4ddfc526ee4c31e1985 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 24 Aug 2024 20:17:16 +0000 Subject: [PATCH 13/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 57f72084242..fa1e9028a72 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6813,7 +6813,7 @@ def interpolate_na( ... }, ... coords={"x": [0, 1, 2, 3, 4]}, ... ) - + >>> ds Size: 200B Dimensions: (x: 5) @@ -7117,7 +7117,7 @@ def fill_gaps( ... }, ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) - + >>> ds Size: 168B Dimensions: (x: 7) From 6de764041f73d1575ff926f63a3d3ef38a7f0c12 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:06:41 +0200 Subject: [PATCH 14/46] Explicitly add fill functions in GapMask object - Allows for more flexibility, e.g. 
optional dim arguments - Better static typing --- xarray/core/missing.py | 132 ++++++++++++++++++++++++++++------- xarray/tests/test_missing.py | 19 ++++- 2 files changed, 124 insertions(+), 27 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 687d7902452..3f5921347be 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -549,35 +549,117 @@ def _interp_na_all( class GapMask: """An object that allows for flexible masking of gaps.""" - def __init__(self, content: DataArray | Dataset, mask: np.ndarray): + def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): self.content = content self.mask = mask - - def _fill_method(name: str, _fill_function: Callable | None = None): - def method(self, *args, _fill_function=_fill_function, **kwargs): - if _fill_function is None: - _fill_function = getattr(self.content, name) - filled = _fill_function(*args, **kwargs) - else: - filled = _fill_function(self.content, *args, **kwargs) - - if self.mask is not None: - filled = filled.where(~self.mask, other=self.content) - return filled - - method.__name__ = name - method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) - return method - - ffill = _fill_method("ffill") - bfill = _fill_method("bfill") - fillna = _fill_method("fillna") - interpolate_na = _fill_method("interpolate_na") - + self.dim=dim + + def _apply_mask(self, filled): + if self.mask is not None: + filled = filled.where(~self.mask, other=self.content) + return filled + + def ffill(self, dim: Hashable | None = None): + """Partly fill missing values in this object's data by applying ffill to all unmasked values. + + Parameters + ---------- + dim : Hashable or None, default None + Dimension along which to fill missing values. If None, the dimension used to create the mask is used. + + Returns + ------- + filled : same type as caller + New object with ffill applied to all unmasked values. 
+ + See Also + -------- + DataArray.ffill + Dataset.ffill + """ + if dim is None: + dim = self.dim + return self._apply_mask(self.content.ffill(dim)) + + def bfill(self, dim: Hashable | None = None): + """Partly fill missing values in this object's data by applying bfill to all unmasked values. + + Parameters + ---------- + dim : Hashable or None, default None + Dimension along which to fill missing values. If None, the dimension used to create the mask is used. + + Returns + ------- + filled : same type as caller + New object with bfill applied to all unmasked values. + + See Also + -------- + DataArray.bfill + Dataset.bfill + """ + if dim is None: + dim = self.dim + return self._apply_mask(self.content.bfill(dim)) + + def fillna(self, value): + """Partly fill missing values in this object's data by applying fillna to all unmasked values. + + Parameters + ---------- + value : scalar, ndarray or DataArray + Used to fill all unmasked values. If the + argument is a DataArray, it is first aligned with (reindexed to) + this array. + + Returns + ------- + filled : same type as caller + New object with fillna applied to all unmasked values. + + See Also + -------- + DataArray.fillna + Dataset.fillna + """ + return self._apply_mask(self.content.fillna(value)) + + def interpolate_na( + self, + dim: Hashable | None = None, + method: InterpOptions = "linear", + use_coordinate: bool | str = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ): + """Partly fill missing values in this object's data by applying interpolate_na to all unmasked values. + + Parameters + ---------- + See DataArray.interpolate_na and Dataset.interpolate_na for explanation of parameters. + + Returns + ------- + filled : same type as caller + New object with interpolate_na applied to all unmasked values. 
+ + See Also + -------- + DataArray.interpolate_na + Dataset.interpolate_na + """ + if dim is None: + dim = self.dim + return self._apply_mask( + self.content.interpolate_na( + dim=dim, method=method, use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs + ) + ) def mask_gaps( self, - dim: Hashable | None = None, + dim: Hashable, use_coordinate: bool | str = True, limit: ( int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None @@ -598,7 +680,7 @@ def mask_gaps( max_gap, use_coordinate, ) - return GapMask(self, mask) + return GapMask(self, mask, dim) def interp_na( diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index d9d04b953a3..06d412645ee 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -1275,13 +1275,14 @@ def test_mask_gap_limit_2d(): dims=["x", "y"], coords={"x": np.arange(4) * 2}, ) - actual = da.fill_gaps( + mask = da.fill_gaps( dim="x", limit=3, limit_direction="forward", limit_area=None, use_coordinate=True, - ).interpolate_na( + ) + actual=mask.interpolate_na( "x", fill_value="extrapolate", method="linear", @@ -1295,6 +1296,20 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) + # Test: Dim argument from mask should be used + actual=mask.interpolate_na( + fill_value="extrapolate", + method="linear", + ) + expected = da.copy( + data=[ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, 2], + [n, 3, 3, 3, 3, n], + [n, n, 4, 4, 4, 4], + ] + ) + assert_equal(actual, expected) def test_mask_gap_max_gap_2d(): From 07a0d01321357e3824fac89ea2d018dd3fc95db9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:09:09 +0000 Subject: [PATCH 15/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 19 ++++++++++++------- xarray/tests/test_missing.py | 4 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git 
a/xarray/core/missing.py b/xarray/core/missing.py index 3f5921347be..2db3d1cd7b2 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -552,7 +552,7 @@ class GapMask: def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): self.content = content self.mask = mask - self.dim=dim + self.dim = dim def _apply_mask(self, filled): if self.mask is not None: @@ -580,7 +580,7 @@ def ffill(self, dim: Hashable | None = None): if dim is None: dim = self.dim return self._apply_mask(self.content.ffill(dim)) - + def bfill(self, dim: Hashable | None = None): """Partly fill missing values in this object's data by applying bfill to all unmasked values. @@ -602,7 +602,7 @@ def bfill(self, dim: Hashable | None = None): if dim is None: dim = self.dim return self._apply_mask(self.content.bfill(dim)) - + def fillna(self, value): """Partly fill missing values in this object's data by applying fillna to all unmasked values. @@ -612,7 +612,7 @@ def fillna(self, value): Used to fill all unmasked values. If the argument is a DataArray, it is first aligned with (reindexed to) this array. - + Returns ------- filled : same type as caller @@ -624,7 +624,7 @@ def fillna(self, value): Dataset.fillna """ return self._apply_mask(self.content.fillna(value)) - + def interpolate_na( self, dim: Hashable | None = None, @@ -643,7 +643,7 @@ def interpolate_na( ------- filled : same type as caller New object with interpolate_na applied to all unmasked values. 
- + See Also -------- DataArray.interpolate_na @@ -653,10 +653,15 @@ def interpolate_na( dim = self.dim return self._apply_mask( self.content.interpolate_na( - dim=dim, method=method, use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs + dim=dim, + method=method, + use_coordinate=use_coordinate, + keep_attrs=keep_attrs, + **kwargs, ) ) + def mask_gaps( self, dim: Hashable, diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 06d412645ee..ce4e754bc6a 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -1282,7 +1282,7 @@ def test_mask_gap_limit_2d(): limit_area=None, use_coordinate=True, ) - actual=mask.interpolate_na( + actual = mask.interpolate_na( "x", fill_value="extrapolate", method="linear", @@ -1297,7 +1297,7 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) # Test: Dim argument from mask should be used - actual=mask.interpolate_na( + actual = mask.interpolate_na( fill_value="extrapolate", method="linear", ) From 274168c24e4e8f619a749364586c5ae3e28bc274 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:28:09 +0200 Subject: [PATCH 16/46] Add type hints to most arguments, return types --- xarray/core/dataarray.py | 33 ++------- xarray/core/dataset.py | 33 ++------- xarray/core/missing.py | 148 ++++++++++++++++++++------------------- xarray/core/types.py | 2 + 4 files changed, 88 insertions(+), 128 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 294801f7522..f67e429cb2c 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -123,6 +123,7 @@ SideOptions, T_ChunkDimFreq, T_ChunksFreq, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DataArrayWeighted @@ -3527,15 +3528,7 @@ def interpolate_na( method: InterpOptions = "linear", limit: int | None = None, use_coordinate: bool | Hashable = True, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | 
np.timedelta64 - | datetime.timedelta - ) = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: @@ -3819,27 +3812,11 @@ def fill_gaps( dim: Hashable, *, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - ) -> GapMask: + max_gap: T_GapLength | None = None, + ) -> GapMask[DataArray]: """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fa1e9028a72..c1d5bf33637 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -173,6 +173,7 @@ T_ChunkDimFreq, T_Chunks, T_DatasetPadConstantValues, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DatasetWeighted @@ -6714,15 +6715,7 @@ def interpolate_na( method: InterpOptions = "linear", limit: int | None = None, use_coordinate: bool | Hashable = True, - max_gap: ( - int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - | None - ) = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: @@ -7000,27 +6993,11 @@ def fill_gaps( dim: Hashable, *, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - ) -> 
GapMask: + max_gap: T_GapLength | None = None, + ) -> GapMask[Dataset]: """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 2db3d1cd7b2..dbe603bf457 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,13 +1,12 @@ from __future__ import annotations -import datetime as dt import itertools import warnings from collections import ChainMap from collections.abc import Callable, Generator, Hashable, Sequence from functools import partial from numbers import Number -from typing import TYPE_CHECKING, Any, TypeVar, get_args +from typing import TYPE_CHECKING, Any, TypeVar, Generic, get_args import numpy as np import pandas as pd @@ -30,6 +29,8 @@ InterpOptions, LimitAreaOptions, LimitDirectionOptions, + T_GapLength, + T_Xarray, ) from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import ( @@ -39,9 +40,6 @@ from xarray.namedarray.pycompat import is_chunked_array if TYPE_CHECKING: - from xarray.core.dataarray import DataArray - from xarray.core.dataset import Dataset - InterpCallable = Callable[..., np.ndarray] # interpn Interpolator = Callable[..., Callable[..., np.ndarray]] # *Interpolator # interpolator objects return callables that can be evaluated @@ -66,8 +64,8 @@ def _get_gap_left_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False -): + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: left = index.where(~obj.isnull()).ffill(dim).transpose(*obj.dims) if outside: return left.fillna(index[0]) @@ -75,8 +73,8 @@ def _get_gap_left_edge( def _get_gap_right_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False -): + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: right = 
index.where(~obj.isnull()).bfill(dim).transpose(*obj.dims) if outside: return right.fillna(index[-1]) @@ -84,24 +82,24 @@ def _get_gap_right_edge( def _get_gap_dist_to_left_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: return (index - _get_gap_left_edge(obj, dim, index)).transpose(*obj.dims) def _get_gap_dist_to_right_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: return (_get_gap_right_edge(obj, dim, index) - index).transpose(*obj.dims) def _get_limit_fill_mask( - obj: Dataset | DataArray | Variable, + obj: T_Xarray, dim: Hashable, index: Variable, - limit, - limit_direction, -): + limit: T_GapLength, + limit_direction: LimitDirectionOptions, +) -> T_Xarray: # At the left boundary, distance to left is nan. # For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": @@ -120,8 +118,8 @@ def _get_limit_fill_mask( def _get_limit_area_mask( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, limit_area -): + obj: T_Xarray, dim: Hashable, index: Variable, limit_area +) -> T_Xarray: if limit_area == "inside": area_mask = ( _get_gap_left_edge(obj, dim, index).isnull() @@ -140,9 +138,7 @@ def _get_limit_area_mask( return area_mask -def _get_nan_block_lengths( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): +def _get_nan_block_lengths(obj: T_Xarray, dim: Hashable, index: Variable) -> T_Xarray: """ Return an object where each NaN element in 'obj' is replaced by the length of the gap the element is in. 
@@ -153,25 +149,22 @@ def _get_nan_block_lengths( def _get_max_gap_mask( - obj: Dataset | DataArray | Variable, - dim: Hashable, - index: Variable, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, -): + obj: T_Xarray, dim: Hashable, index: Variable, max_gap: T_GapLength +) -> T_Xarray: nan_block_lengths = _get_nan_block_lengths(obj, dim, index) return nan_block_lengths > max_gap def _get_gap_mask( - obj: Dataset | DataArray | Variable, + obj: T_Xarray, dim: Hashable, - limit=None, - limit_direction="both", - limit_area=None, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions = None, limit_use_coordinate=False, - max_gap=None, + max_gap: T_GapLength = None, max_gap_use_coordinate=False, -): +) -> T_Xarray: # Input checking ##Limit if not is_scalar(limit): @@ -407,7 +400,9 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds -def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = True): +def _get_raw_interp_index( + arr: T_Xarray, dim: Hashable, use_coordinate: bool | Hashable = True +) -> pd.Index: """Return index to use for x values in interpolation or curve fitting. In comparison to get_clean_interp_index, this function does not convert to numeric values.""" @@ -439,8 +434,11 @@ def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = def get_clean_interp_index( - arr, dim: Hashable, use_coordinate: bool | Hashable = True, strict: bool = True -): + arr: T_Xarray, + dim: Hashable, + use_coordinate: bool | Hashable = True, + strict: bool = True, +) -> Variable: """Return index to use for x values in interpolation or curve fitting. 
Parameters @@ -507,22 +505,22 @@ def get_clean_interp_index( return index -def _is_time_index(index): +def _is_time_index(index) -> bool: from xarray.coding.cftimeindex import CFTimeIndex return isinstance(index, pd.DatetimeIndex | CFTimeIndex) def _interp_na_all( - self, - dim: Hashable | None = None, + obj: T_Xarray, + dim: Hashable, method: InterpOptions = "linear", use_coordinate: bool | str = True, keep_attrs: bool | None = None, **kwargs, -): +) -> T_Xarray: """Interpolate all nan values, without restrictions regarding the gap size.""" - index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) + index = get_clean_interp_index(obj, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) interpolator = partial(func_interpolate_na, interp_class, **kwargs) @@ -534,32 +532,37 @@ def _interp_na_all( warnings.filterwarnings("ignore", "invalid value", RuntimeWarning) arr = apply_ufunc( interpolator, - self, + obj, index.values, input_core_dims=[[dim], [dim]], output_core_dims=[[dim]], - output_dtypes=[self.dtype], + output_dtypes=[obj.dtype], dask="parallelized", vectorize=True, keep_attrs=keep_attrs, - ).transpose(*self.dims) + ).transpose(*obj.dims) return arr -class GapMask: +class GapMask(Generic[T_Xarray]): + content: T_Xarray + mask: np.ndarray + dim: Hashable + """An object that allows for flexible masking of gaps.""" - def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): + def __init__(self, content: T_Xarray, mask: np.ndarray, dim: Hashable) -> None: self.content = content self.mask = mask self.dim = dim + self.dim = dim - def _apply_mask(self, filled): + def _apply_mask(self, filled: T_Xarray) -> T_Xarray: if self.mask is not None: filled = filled.where(~self.mask, other=self.content) return filled - def ffill(self, dim: Hashable | None = None): + def ffill(self, dim: Hashable | None = None) -> T_Xarray: """Partly fill missing values in this object's data by applying ffill 
to all unmasked values. Parameters @@ -581,7 +584,7 @@ def ffill(self, dim: Hashable | None = None): dim = self.dim return self._apply_mask(self.content.ffill(dim)) - def bfill(self, dim: Hashable | None = None): + def bfill(self, dim: Hashable | None = None) -> T_Xarray: """Partly fill missing values in this object's data by applying bfill to all unmasked values. Parameters @@ -603,7 +606,7 @@ def bfill(self, dim: Hashable | None = None): dim = self.dim return self._apply_mask(self.content.bfill(dim)) - def fillna(self, value): + def fillna(self, value) -> T_Xarray: """Partly fill missing values in this object's data by applying fillna to all unmasked values. Parameters @@ -613,6 +616,7 @@ def fillna(self, value): argument is a DataArray, it is first aligned with (reindexed to) this array. + Returns ------- filled : same type as caller @@ -625,6 +629,7 @@ def fillna(self, value): """ return self._apply_mask(self.content.fillna(value)) + def interpolate_na( self, dim: Hashable | None = None, @@ -632,7 +637,7 @@ def interpolate_na( use_coordinate: bool | str = True, keep_attrs: bool | None = None, **kwargs: Any, - ): + ) -> T_Xarray: """Partly fill missing values in this object's data by applying interpolate_na to all unmasked values. Parameters @@ -644,6 +649,7 @@ def interpolate_na( filled : same type as caller New object with interpolate_na applied to all unmasked values. 
+ See Also -------- DataArray.interpolate_na @@ -658,25 +664,29 @@ def interpolate_na( use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs, + dim=dim, + method=method, + use_coordinate=use_coordinate, + keep_attrs=keep_attrs, + **kwargs, ) ) + def mask_gaps( - self, + obj: T_Xarray, dim: Hashable, use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, -) -> GapMask: + max_gap: T_GapLength | None = None, +) -> GapMask[T_Xarray]: """Mask continuous gaps in the data, providing functionality to control gap length and offsets""" mask = _get_gap_mask( - self, + obj, dim, limit, limit_direction, @@ -685,26 +695,20 @@ def mask_gaps( max_gap, use_coordinate, ) - return GapMask(self, mask, dim) + return GapMask(obj, mask, dim) def interp_na( - self, - dim: Hashable | None = None, + obj: T_Xarray, + dim: Hashable, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, + limit: T_GapLength | None = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs, -): +) -> T_Xarray: """Interpolate values according to different methods.""" - # Preprocess arguments and do consistency checks - if dim is None: - raise NotImplementedError("dim is a required argument") - # This was the original behaviour of interp_na and is kept for backward compatibility # Limit=None: Fill everything, including both boundaries # Limit!=None: Do forward interpolation until limit @@ -715,7 +719,7 @@ def interp_na( limit_direction = "forward" limit_area = None mask = _get_gap_mask( - self, + 
obj, dim, limit, limit_direction, @@ -725,7 +729,7 @@ def interp_na( use_coordinate, ) - arr = _interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + arr = _interp_na_all(obj, dim, method, use_coordinate, keep_attrs, **kwargs) if mask is not None: arr = arr.where(~mask) return arr diff --git a/xarray/core/types.py b/xarray/core/types.py index ca538cbf19c..d20c27d75d2 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -113,6 +113,7 @@ DatetimeLike: TypeAlias = ( pd.Timestamp | datetime.datetime | np.datetime64 | CFTimeDatetime ) +TimedeltaLike: TypeAlias = pd.Timedelta | datetime.timedelta | np.timedelta64 class Alignable(Protocol): @@ -244,6 +245,7 @@ def copy( ] InterpnOptions = Literal["linear", "nearest", "slinear", "cubic", "quintic", "pchip"] InterpOptions = Union[Interp1dOptions, InterpolantOptions, InterpnOptions] +T_GapLength = Union[int, float, str, TimedeltaLike] LimitDirectionOptions = Literal["forward", "backward", "both"] LimitAreaOptions = Literal["inside", "outside"] From e455f5b5ff10e126019cf0b6593242896fc0f908 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:30:49 +0000 Subject: [PATCH 17/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index dbe603bf457..d67706dba3d 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -629,7 +629,6 @@ def fillna(self, value) -> T_Xarray: """ return self._apply_mask(self.content.fillna(value)) - def interpolate_na( self, dim: Hashable | None = None, @@ -673,7 +672,6 @@ def interpolate_na( ) - def mask_gaps( obj: T_Xarray, dim: Hashable, From d58b0ac7e59497033c970641dd4eb46d3fc9db7d Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:46:44 
+0200 Subject: [PATCH 18/46] Fix accidental double pasting of arguments --- xarray/core/missing.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index d67706dba3d..a4029649271 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -663,11 +663,6 @@ def interpolate_na( use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs, - dim=dim, - method=method, - use_coordinate=use_coordinate, - keep_attrs=keep_attrs, - **kwargs, ) ) From 97001194b7f0264ebae43fa7e4d6d69e4e79029e Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 18:50:18 +0200 Subject: [PATCH 19/46] Fix more mypy errors --- xarray/core/missing.py | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index a4029649271..0a9682569ae 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -97,7 +97,7 @@ def _get_limit_fill_mask( obj: T_Xarray, dim: Hashable, index: Variable, - limit: T_GapLength, + limit: int | float | np.number, limit_direction: LimitDirectionOptions, ) -> T_Xarray: # At the left boundary, distance to left is nan. 
@@ -149,7 +149,7 @@ def _get_nan_block_lengths(obj: T_Xarray, dim: Hashable, index: Variable) -> T_X def _get_max_gap_mask( - obj: T_Xarray, dim: Hashable, index: Variable, max_gap: T_GapLength + obj: T_Xarray, dim: Hashable, index: Variable, max_gap: int | float | np.number ) -> T_Xarray: nan_block_lengths = _get_nan_block_lengths(obj, dim, index) return nan_block_lengths > max_gap @@ -160,11 +160,11 @@ def _get_gap_mask( dim: Hashable, limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", - limit_area: LimitAreaOptions = None, + limit_area: LimitAreaOptions | None = None, limit_use_coordinate=False, - max_gap: T_GapLength = None, + max_gap: T_GapLength | None = None, max_gap_use_coordinate=False, -) -> T_Xarray: +) -> T_Xarray | None: # Input checking ##Limit if not is_scalar(limit): @@ -182,22 +182,25 @@ def _get_gap_mask( limit = timedelta_to_numeric(limit) ## Max_gap - if max_gap is not None: - if not is_scalar(max_gap): - raise ValueError("max_gap must be a scalar.") - - if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): - max_gap = timedelta_to_numeric(max_gap) + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + if max_gap is None: + max_gap = np.inf + else: if not max_gap_use_coordinate: if not isinstance(max_gap, Number | np.number): raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." ) + + if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): + max_gap = timedelta_to_numeric(max_gap) + # Which masks are really needed? 
need_limit_mask = limit != np.inf or limit_direction != "both" need_area_mask = limit_area is not None - need_max_gap_mask = max_gap is not None + need_max_gap_mask = max_gap != np.inf # Calculate indexes if need_limit_mask or need_area_mask: index_limit = get_clean_interp_index( @@ -515,7 +518,7 @@ def _interp_na_all( obj: T_Xarray, dim: Hashable, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, keep_attrs: bool | None = None, **kwargs, ) -> T_Xarray: @@ -546,12 +549,12 @@ def _interp_na_all( class GapMask(Generic[T_Xarray]): content: T_Xarray - mask: np.ndarray + mask: T_Xarray | None dim: Hashable """An object that allows for flexible masking of gaps.""" - def __init__(self, content: T_Xarray, mask: np.ndarray, dim: Hashable) -> None: + def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> None: self.content = content self.mask = mask self.dim = dim @@ -633,7 +636,7 @@ def interpolate_na( self, dim: Hashable | None = None, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, keep_attrs: bool | None = None, **kwargs: Any, ) -> T_Xarray: @@ -670,7 +673,7 @@ def interpolate_na( def mask_gaps( obj: T_Xarray, dim: Hashable, - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, @@ -695,7 +698,7 @@ def interp_na( obj: T_Xarray, dim: Hashable, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, @@ -706,10 +709,7 @@ def interp_na( # Limit=None: Fill everything, including both boundaries # Limit!=None: Do forward interpolation until limit limit_use_coordinate = False - if limit is None: - limit_direction = "both" - 
else: - limit_direction = "forward" + limit_direction: LimitDirectionOptions = "both" if limit is None else "forward" limit_area = None mask = _get_gap_mask( obj, From 4a360fa9d9f82d6e5b3a383cc4e9e84a25a041ba Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:07:09 +0200 Subject: [PATCH 20/46] Bottleneck is required for limit functionality --- xarray/core/dataarray.py | 2 ++ xarray/core/dataset.py | 2 ++ xarray/tests/test_missing.py | 13 +++++++++++++ 3 files changed, 17 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f67e429cb2c..cd097605cdc 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3820,6 +3820,8 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + *Requires bottleneck.* + Parameters ---------- dim : Hashable diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c1d5bf33637..8ec00c68a88 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7001,6 +7001,8 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
+ *Requires bottleneck.* + Parameters ---------- dim : Hashable diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index ce4e754bc6a..51912b1efff 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -370,6 +370,7 @@ def test_interpolate_limits(): assert_equal(actual, expected) +@requires_bottleneck def test_interpolate_double_coordinate(): # Check if max_gap is able to handle string coordinate names # Limit is always refering to an index @@ -637,6 +638,7 @@ def test_bfill_dataset(ds): ds.ffill(dim="time") +@requires_bottleneck def test_get_gap_left_edge(): n = np.nan arr = [ @@ -668,6 +670,7 @@ def test_get_gap_left_edge(): ) +@requires_bottleneck def test_get_gap_right_edge(): n = np.nan arr = [ @@ -699,6 +702,7 @@ def test_get_gap_right_edge(): ) +@requires_bottleneck def test_get_gap_dist_to_left_edge(): n = np.nan arr = [ @@ -722,6 +726,7 @@ def test_get_gap_dist_to_left_edge(): expected = da.copy(data=[[n, 0, 3, 4, 5, 6, 8, 10, 0], [n, n, n, 0, 1, 2, 0, 2, 4]]) +@requires_bottleneck def test_get_gap_dist_to_right_edge(): n = np.nan arr = [ @@ -772,6 +777,7 @@ def test_get_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +@requires_bottleneck def test_get_nan_block_lengths_2d(): n = np.nan da = xr.DataArray( @@ -808,6 +814,7 @@ def test_get_nan_block_lengths_2d(): assert_equal(actual, expected_y) +@requires_bottleneck def test_get_limit_fill_mask(): T = True F = False @@ -860,6 +867,7 @@ def test_get_limit_fill_mask(): assert_equal(actual, expected) +@requires_bottleneck def test_get_area_mask(): T = True F = False @@ -1067,6 +1075,7 @@ def test_interpolate_na_max_gap_2d(coords): assert_equal(actual, expected_x) +@requires_bottleneck def test_interpolate_na_limit_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1116,6 +1125,7 @@ def test_interpolators_complex_out_of_bounds(): ####Masking Functionality +@requires_bottleneck def test_fill_gaps_limit(): n = np.nan 
times = pd.date_range("2000-01-01", periods=8, freq="2h") @@ -1152,6 +1162,7 @@ def test_fill_gaps_limit(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_gap_limit_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1312,6 +1323,7 @@ def test_mask_gap_limit_2d(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_gap_max_gap_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1361,6 +1373,7 @@ def test_mask_gap_max_gap_2d(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_double_coordinate(): # Check if limit and max_gap are able to handle string coordinate names n = np.nan From 7389bf71eda2672e1f95ddb8cc6ed4e5ab91e87d Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:21:43 +0200 Subject: [PATCH 21/46] Docs: Require numbagg or bottleneck for ffill/bfill/fill_gaps --- xarray/core/dataarray.py | 6 +++--- xarray/core/dataset.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index cd097605cdc..8d424771cfa 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3642,7 +3642,7 @@ def interpolate_na( def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3726,7 +3726,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3820,7 +3820,7 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
- *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8ec00c68a88..ef9628e5331 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6862,7 +6862,7 @@ def interpolate_na( def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6926,7 +6926,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6998,10 +6998,10 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[Dataset]: - """Fill in gaps in the data using one of several filling methods. + """Fill in gaps (consecutive missing values) in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
- *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- From 93c72f59659b4955617b481e9f9c2dd6877f47fd Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:14:32 +0200 Subject: [PATCH 22/46] Rework index conversion to have consistent typing --- xarray/core/dataarray.py | 2 +- xarray/core/missing.py | 36 +++++++++++++++++----------------- xarray/tests/test_dataarray.py | 8 +++++--- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8d424771cfa..d557f60cffb 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3817,7 +3817,7 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[DataArray]: - """Fill in gaps in the data using one of several filling methods. + """Fill in gaps (consecutive missing values) in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. *Requires numbagg or bottleneck.* diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 0a9682569ae..b1b7fc46260 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -488,24 +488,24 @@ def get_clean_interp_index( if isinstance(index, CFTimeIndex | pd.DatetimeIndex): offset = type(index[0])(1970, 1, 1) if isinstance(index, CFTimeIndex): - index = index.values - index = Variable( - data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"), - dims=(dim,), - ) - - # raise if index cannot be cast to a float (e.g. MultiIndex) - try: - index = index.values.astype(np.float64) - except (TypeError, ValueError) as err: - # pandas raises a TypeError - # xarray/numpy raise a ValueError - raise TypeError( - f"Index {index.name!r} must be castable to float64 to support " - f"interpolation or curve fitting, got {type(index).__name__}." 
- ) from err - index = Variable([dim], index) - return index + values = datetime_to_numeric( + index.values, offset=offset, datetime_unit="ns" + ) + else: + values = datetime_to_numeric(index, offset=offset, datetime_unit="ns") + else: # if numeric or standard calendar index: try to cast to float + try: + values = index.values.astype(np.float64) + # raise if index cannot be cast to a float (e.g. MultiIndex) + except (TypeError, ValueError) as err: + # pandas raises a TypeError + # xarray/numpy raise a ValueError + raise TypeError( + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation or curve fitting, got {type(index).__name__}." + ) from err + var = Variable([dim], values) + return var def _is_time_index(index) -> bool: diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c94eefd74ea..915b497d51e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4253,11 +4253,13 @@ def test_rank(self) -> None: def test_polyfit(self, use_dask, use_datetime) -> None: if use_dask and not has_dask: pytest.skip("requires dask") - xcoord = xr.DataArray( + da_times = xr.DataArray( pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" ) - x = xr.core.missing.get_clean_interp_index(xcoord, "x") - if not use_datetime: + x = xr.core.missing.get_clean_interp_index(da_times, "x").values + if use_datetime: + xcoord = da_times.values + else: xcoord = x da_raw = DataArray( From 72c76dbfc15850cbd9464a964b255172760d9895 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:30:02 +0200 Subject: [PATCH 23/46] Add new method to api.rst --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index f731ac1c59a..e825b786549 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -167,6 +167,7 @@ Missing value handling Dataset.fillna Dataset.ffill Dataset.bfill + Dataset.fill_gaps Dataset.interpolate_na 
Dataset.where Dataset.isin @@ -357,6 +358,7 @@ Missing value handling DataArray.fillna DataArray.ffill DataArray.bfill + DataArray.fill_gaps DataArray.interpolate_na DataArray.where DataArray.isin From 6631aeb79c5fabf14edc3b89529cef6543c65622 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:56:15 +0000 Subject: [PATCH 24/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 4 ++-- xarray/tests/test_missing.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b1b7fc46260..4e4ff7f1a84 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -6,7 +6,7 @@ from collections.abc import Callable, Generator, Hashable, Sequence from functools import partial from numbers import Number -from typing import TYPE_CHECKING, Any, TypeVar, Generic, get_args +from typing import TYPE_CHECKING, Any, Generic, TypeVar, get_args import numpy as np import pandas as pd @@ -25,7 +25,7 @@ from xarray.core.options import _get_keep_attrs from xarray.core.types import ( Interp1dOptions, - InterpnOptions, + InterpnOptions, InterpOptions, LimitAreaOptions, LimitDirectionOptions, diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 51912b1efff..8e50c48397c 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -109,7 +109,6 @@ def make_interpolate_example_data(shape, frac_nan, seed=12345, non_uniform=False @pytest.mark.parametrize("frac_nan", [0, 0.5, 1]) @requires_scipy def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None: - da, df = make_interpolate_example_data(shape, frac_nan) actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) From 710238134721b508e309eb1d54c6b90407ca46dc Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss 
<42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 10:55:41 +0100 Subject: [PATCH 25/46] Reimport utils (deleted during rebase) --- xarray/core/missing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 4e4ff7f1a84..2aab733d4ed 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +from xarray.core import utils from xarray.core.common import _contains_datetime_like_objects from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( From 5fe9febe0be8ed96619480380eed02817f6c698f Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:15:46 +0100 Subject: [PATCH 26/46] Remove typo (double line) --- xarray/core/missing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 2aab733d4ed..6c9d235c480 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -559,7 +559,6 @@ def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> N self.content = content self.mask = mask self.dim = dim - self.dim = dim def _apply_mask(self, filled: T_Xarray) -> T_Xarray: if self.mask is not None: From a74a840ecda317849092e196db9aa92a41aab893 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:23:51 +0100 Subject: [PATCH 27/46] Add a hint in the interpolate_na docs about the forward (legacy) behaviour. 
--- xarray/core/dataarray.py | 3 ++- xarray/core/dataset.py | 3 ++- xarray/tests/test_missing.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d557f60cffb..54f42269557 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3553,7 +3553,7 @@ def interpolate_na( limit : int or None, default: None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of + or None for no limit. This filling is done in the forward direction, regardless of the size of the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. use_coordinate : bool or str, default: True @@ -3599,6 +3599,7 @@ def interpolate_na( See Also -------- + DataArray.fill_gaps numpy.interp scipy.interpolate diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ef9628e5331..e66477d60ca 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6746,7 +6746,7 @@ def interpolate_na( coordinate variable to use as the index. limit : int, default: None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of + or None for no limit. This filling is done in the forward direction, regardless of the size of the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. 
max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ @@ -6792,6 +6792,7 @@ def interpolate_na( See Also -------- + Dataset.fill_gaps numpy.interp scipy.interpolate diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 8e50c48397c..ff84f102a55 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -352,7 +352,7 @@ def test_interpolate_limits(): da = xr.DataArray([n, n, 3, n, n, 6, n, 8, n], dims=["y"], coords=coords) actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") - # With no limit, everything should be interpolated. Introduced in xarray due to a bug (GH7665), but kept for backward compatibility + # With no limit, all gaps should be interpolated (forward+backward, including boundaries). Introduced in xarray due to a bug (GH7665), but kept for backward compatibility. With limit, do only forward interpolation. expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9]) assert_equal(actual, expected) From 3b7d6bccbff488cdcfd61d07fea9136ad64c0fff Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:13:57 +0100 Subject: [PATCH 28/46] Add example in User guide --- doc/user-guide/computation.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index 5d7002484c2..8079b227b3b 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -203,6 +203,21 @@ Xarray also provides the ``max_gap`` keyword argument to limit the interpolation data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na` for more. +All of the above methods by default fill gaps of any size in the data. If you want fine control over the size of the gaps that are filled, you can use :py:meth:`~xarray.DataArray.fill_gaps`. For example, consider a series of air temperature measurements with gaps: + +.. 
ipython:: python + + n = np.nan + temperature = xr.DataArray( + [n, 1.1, n, n, n, 2, n, n, n, n, 2.3], + coords={"time": xr.Variable("time", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}, + ) + temperature.fill_gaps( + "time", limit=1, limit_direction="both", max_gap=4 + ).interpolate_na("time") + +In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. :py:metho:`~xarray.DataArray.fill_gaps` works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. + .. _agg: Aggregation From 9a5bc29222c7df0f497793776936df71cf7e2185 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:03:42 +0100 Subject: [PATCH 29/46] Fix typo in documentation --- doc/user-guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index 8079b227b3b..57985fde771 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -216,7 +216,7 @@ All of the above methods by default fill gaps of any size in the data. If you wa "time", limit=1, limit_direction="both", max_gap=4 ).interpolate_na("time") -In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. :py:metho:`~xarray.DataArray.fill_gaps` works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. 
+In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. :py:meth:`~xarray.DataArray.fill_gaps` works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. .. _agg: From 511323d2db6876a5059830565918cb1fb999b68f Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:12:30 +0100 Subject: [PATCH 30/46] Fix typing errors (ignore types for limit argument internally) --- xarray/core/missing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 6c9d235c480..8cc8c061a11 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -219,15 +219,16 @@ def _get_gap_mask( # Calculate individual masks masks = [] if need_limit_mask: + # due to the dynamic typing of limit, mypy cannot infer the correct type masks.append( - _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) + _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) # type: ignore[arg-type] ) if need_area_mask: masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) if need_max_gap_mask: - masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) + masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) # type: ignore[arg-type] # Combine masks mask = masks[0] for m in masks[1:]: From 2463ded26ceb5e1ce4018ce462248cae86fe2d0f Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:13:24 +0100 Subject: [PATCH 31/46] Remove return type of interp_na to avoid mypy error --- xarray/core/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 8cc8c061a11..bebc89dee1d 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -704,7 +704,7 @@ def interp_na( max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs, -) -> T_Xarray: +): """Interpolate values according to different methods.""" # This was the original behaviour of interp_na and is kept for backward compatibility # Limit=None: Fill everything, including both boundaries From ee34faf3efed6a4dfe281e31edef62d4f638e49d Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:24:47 +0100 Subject: [PATCH 32/46] Include documentation for GapMask Object --- doc/api.rst | 14 ++++++++++++ xarray/core/dataarray.py | 4 +++- xarray/core/missing.py | 49 ++++++++++++++++++++++++++++------------ 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index e825b786549..1e64757b0e0 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1494,6 +1494,20 @@ DataArray DataArrayResample.dims DataArrayResample.groups +GapMask object +=============== + +.. currentmodule:: xarray.core.missing + +.. autosummary:: + :toctree: generated/ + + GapMask + GapMask.fillna + GapMask.ffill + GapMask.bfill + GapMask.interpolate_na + Accessors ========= diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 54f42269557..7ba23b73583 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3892,11 +3892,13 @@ def fill_gaps( Returns ------- - Gap Mask: GapMask + Gap Mask: core.missing.GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. 
See Also -------- + :ref:`missing_values` DataArray.fillna DataArray.ffill DataArray.bfill diff --git a/xarray/core/missing.py b/xarray/core/missing.py index bebc89dee1d..27ef486de65 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -550,20 +550,41 @@ def _interp_na_all( class GapMask(Generic[T_Xarray]): - content: T_Xarray - mask: T_Xarray | None - dim: Hashable + """An object that allows for flexible masking of gaps. You should use DataArray.fill_gaps() or Dataset.fill_gaps() to construct this object instead of constructing it directly.""" - """An object that allows for flexible masking of gaps.""" + # Attributes + # ---------- + # mask : DataArray or Dataset or None + _content: T_Xarray + _dim: Hashable + mask: T_Xarray | None + """Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps after applying a filling method.""" def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> None: - self.content = content + """An object that allows for flexible masking of gaps. You should use DataArray.fill_gaps() or Dataset.fill_gaps() to construct this object instead of calling this constructor directly. + + Parameters + ---------- + content : DataArray or Dataset + The object to be masked. + mask : DataArray or Dataset or None + Boolean gap mask to be applied to the content. If None, the content is not masked. + dim : Hashable + The dimension along which the mask was created. When filling gaps and no dimension is specified for the filling method, this dimension is used. 
+ + See Also + -------- + xarray.DataArray.fill_gaps + xarray.Dataset.fill_gaps + + """ + self._content = content self.mask = mask - self.dim = dim + self._dim = dim def _apply_mask(self, filled: T_Xarray) -> T_Xarray: if self.mask is not None: - filled = filled.where(~self.mask, other=self.content) + filled = filled.where(~self.mask, other=self._content) return filled def ffill(self, dim: Hashable | None = None) -> T_Xarray: @@ -585,8 +606,8 @@ def ffill(self, dim: Hashable | None = None) -> T_Xarray: Dataset.ffill """ if dim is None: - dim = self.dim - return self._apply_mask(self.content.ffill(dim)) + dim = self._dim + return self._apply_mask(self._content.ffill(dim)) def bfill(self, dim: Hashable | None = None) -> T_Xarray: """Partly fill missing values in this object's data by applying bfill to all unmasked values. @@ -607,8 +628,8 @@ def bfill(self, dim: Hashable | None = None) -> T_Xarray: Dataset.bfill """ if dim is None: - dim = self.dim - return self._apply_mask(self.content.bfill(dim)) + dim = self._dim + return self._apply_mask(self._content.bfill(dim)) def fillna(self, value) -> T_Xarray: """Partly fill missing values in this object's data by applying fillna to all unmasked values. 
@@ -631,7 +652,7 @@ def fillna(self, value) -> T_Xarray: DataArray.fillna Dataset.fillna """ - return self._apply_mask(self.content.fillna(value)) + return self._apply_mask(self._content.fillna(value)) def interpolate_na( self, @@ -659,9 +680,9 @@ def interpolate_na( Dataset.interpolate_na """ if dim is None: - dim = self.dim + dim = self._dim return self._apply_mask( - self.content.interpolate_na( + self._content.interpolate_na( dim=dim, method=method, use_coordinate=use_coordinate, From ec221ddd31cde2036572c615a4be0ad00a2e7ce9 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:45:42 +0100 Subject: [PATCH 33/46] Include references in docs between filling functions --- xarray/core/dataarray.py | 15 ++++++++++++++ xarray/core/dataset.py | 44 ++++++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7ba23b73583..4ac6e72cb7e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3479,6 +3479,11 @@ def fillna(self, value: Any) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> da = xr.DataArray( @@ -3661,6 +3666,11 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> temperature = np.array( @@ -3745,6 +3755,11 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> temperature = np.array( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e66477d60ca..d7c5f36d9a7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6650,6 +6650,11 @@ def fillna(self, value: Any) -> Self: ------- Dataset + See Also + -------- + 
:ref:`missing_values` + Dataset.fill_gaps + Examples -------- >>> ds = xr.Dataset( @@ -6876,6 +6881,16 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Returns + ------- + Dataset + + See Also + -------- + :ref:`missing_values` + Dataset.fill_gaps + Dataset.bfill + Examples -------- >>> time = pd.date_range("2023-01-01", periods=10, freq="D") @@ -6910,14 +6925,6 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: * time (time) datetime64[ns] 80B 2023-01-01 2023-01-02 ... 2023-01-10 Data variables: data (time) float64 80B 1.0 1.0 1.0 nan 5.0 5.0 5.0 8.0 8.0 10.0 - - Returns - ------- - Dataset - - See Also - -------- - Dataset.bfill """ from xarray.core.missing import _apply_over_vars_with_dim, ffill @@ -6941,6 +6948,16 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Returns + ------- + Dataset + + See Also + -------- + :ref:`missing_values` + Dataset.fill_gaps + Dataset.ffill + Examples -------- >>> time = pd.date_range("2023-01-01", periods=10, freq="D") @@ -6975,14 +6992,6 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: * time (time) datetime64[ns] 80B 2023-01-01 2023-01-02 ... 2023-01-10 Data variables: data (time) float64 80B 1.0 nan 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 - - Returns - ------- - Dataset - - See Also - -------- - Dataset.ffill """ from xarray.core.missing import _apply_over_vars_with_dim, bfill @@ -7073,11 +7082,12 @@ def fill_gaps( Returns ------- - Gap Mask: GapMask + Gap Mask: core.missing.GapMask An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. 
See Also -------- + :ref:`missing_values` Dataset.fillna Dataset.ffill Dataset.bfill From 41f2bf8ed49aae94033637b0552150f339a02f67 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 22 Jan 2025 18:03:43 +0100 Subject: [PATCH 34/46] Doc-Bug: Default for direction is both --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4ac6e72cb7e..3f7ca7dc939 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3873,7 +3873,7 @@ def fill_gaps( For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. To only fill gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" + limit_direction: {"forward", "backward", "both"}, default: "both" Consecutive NaNs will be filled in this direction. limit_area: {"inside", "outside"} or None: default: None Consecutive NaNs will be filled with this restriction. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d7c5f36d9a7..5e3e042bebf 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7048,7 +7048,7 @@ def fill_gaps( For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. To only fill gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" + limit_direction: {"forward", "backward", "both"}, default: "both" Consecutive NaNs will be filled in this direction. limit_area: {"inside", "outside"} or None: default: None Consecutive NaNs will be filled with this restriction. 
From 2c0d37567b179fb5e6bc893890601ca782d9fc05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Ockenfu=C3=9F?= <42680748+Ockenfuss@users.noreply.github.com> Date: Thu, 23 Jan 2025 10:21:02 +0100 Subject: [PATCH 35/46] Typo in Documentation Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/core/dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3f7ca7dc939..1da81af0500 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3922,7 +3922,7 @@ def fill_gaps( Notes ----- - ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. 
Examples -------- From 9ae4b2670dd0649aa72ec90acde58f766e627277 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Thu, 23 Jan 2025 11:02:02 +0100 Subject: [PATCH 36/46] Do not allow further limit or max_gap specification when calling interpolate_na on a GapMask object --- xarray/core/missing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 27ef486de65..8ca57ddb4bc 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -686,6 +686,8 @@ def interpolate_na( dim=dim, method=method, use_coordinate=use_coordinate, + limit=None, + max_gap=None, keep_attrs=keep_attrs, **kwargs, ) From 5cdc2233553c92f9cee72b1a2983232bd8e75fdc Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Thu, 23 Jan 2025 13:54:47 +0100 Subject: [PATCH 37/46] Make two stages of filling clear in fill_gaps documentation --- doc/user-guide/computation.rst | 2 +- xarray/core/dataarray.py | 11 +++++++---- xarray/core/dataset.py | 9 ++++++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index 57985fde771..8ec01284126 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -216,7 +216,7 @@ All of the above methods by default fill gaps of any size in the data. If you wa "time", limit=1, limit_direction="both", max_gap=4 ).interpolate_na("time") -In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. :py:meth:`~xarray.DataArray.fill_gaps` works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. 
+In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. :py:meth:`~xarray.DataArray.fill_gaps` returns a :py:class:`~xarray.core.missing.GapMask` object that works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. .. _agg: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1da81af0500..8ff898b93fa 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3833,8 +3833,11 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[DataArray]: - """Fill in gaps (consecutive missing values) in the data using one of several filling methods. - Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + """Fill in gaps (consecutive missing values) in the data. + + - Firstly, ``fill_gaps`` determines **which** values to fill, with options for fine control over how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + - Secondly, calling one of several filling methods determines **how** to fill the selected values. + *Requires numbagg or bottleneck.* @@ -3851,7 +3854,7 @@ def fill_gaps( limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. When interpolating along a datetime64 dimension + Use None for no limit.
When filling along a datetime64 dimension and ``use_coordinate=True``, ``limit`` can be one of the following: - a string that is valid input for pandas.to_timedelta @@ -3922,7 +3925,7 @@ def fill_gaps( Notes ----- - ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. Examples -------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5e3e042bebf..56f478ae1e1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7008,8 +7008,11 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[Dataset]: - """Fill in gaps (consecutive missing values) in the data using one of several filling methods. - Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + """Fill in gaps (consecutive missing values) in the data. + + - Firstly, ``fill_gaps`` determines **which** values to fill, with options for fine control over how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + - Secondly, calling one of several filling methods determines **how** to fill the selected values. + *Requires numbagg or bottleneck.* @@ -7026,7 +7029,7 @@ def fill_gaps( limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. When interpolating along a datetime64 dimension + Use None for no limit.
When filling along a datetime64 dimension and ``use_coordinate=True``, ``limit`` can be one of the following: - a string that is valid input for pandas.to_timedelta From 078e546d50d5008a3c9537e1064e5c7669b45bce Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:05:40 +0100 Subject: [PATCH 38/46] Default to forward for ffill and backward to bfill. Raise error if this is not the case. This implies that the mask creation needs to be delayed until the filling stage. --- xarray/core/dataarray.py | 15 +++-- xarray/core/dataset.py | 15 +++-- xarray/core/missing.py | 120 ++++++++++++++++++++--------------- xarray/tests/test_missing.py | 86 ++++++++++++++++++------- 4 files changed, 155 insertions(+), 81 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8ff898b93fa..5869b31da7f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3829,7 +3829,7 @@ def fill_gaps( *, use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, - limit_direction: LimitDirectionOptions = "both", + limit_direction: LimitDirectionOptions | None = None, limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[DataArray]: @@ -3876,8 +3876,15 @@ def fill_gaps( For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. To only fill gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "both" + limit_direction: {"forward", "backward", "both"}, default: None Consecutive NaNs will be filled in this direction. + If not specified, the default is + + - "forward" if ``ffill`` is used + - "backward" if ``bfill`` is used + - "both" otherwise + + raises ValueError if not "forward" and ``ffill`` is used or not "backward" and ``bfill`` is used. limit_area: {"inside", "outside"} or None: default: None Consecutive NaNs will be filled with this restriction. 
@@ -3961,9 +3968,9 @@ def fill_gaps( Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 """ - from xarray.core.missing import mask_gaps + from xarray.core.missing import GapMask - return mask_gaps( + return GapMask( self, dim, use_coordinate=use_coordinate, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 56f478ae1e1..694696ff937 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7004,7 +7004,7 @@ def fill_gaps( *, use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, - limit_direction: LimitDirectionOptions = "both", + limit_direction: LimitDirectionOptions | None = None, limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[Dataset]: @@ -7051,8 +7051,15 @@ def fill_gaps( For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. To only fill gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "both" + limit_direction: {"forward", "backward", "both"}, default: None Consecutive NaNs will be filled in this direction. + If not specified, the default is + + - "forward" if ``ffill`` is used + - "backward" if ``bfill`` is used + - "both" otherwise + + raises ValueError if not "forward" and ``ffill`` is used or not "backward" and ``bfill`` is used. limit_area: {"inside", "outside"} or None: default: None Consecutive NaNs will be filled with this restriction. 
@@ -7149,9 +7156,9 @@ def fill_gaps( A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan """ - from xarray.core.missing import mask_gaps + from xarray.core.missing import GapMask - return mask_gaps( + return GapMask( self, dim, use_coordinate=use_coordinate, diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 8ca57ddb4bc..129e93a225b 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -560,17 +560,24 @@ class GapMask(Generic[T_Xarray]): mask: T_Xarray | None """Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps after applying a filling method.""" - def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> None: + def __init__( + self, + content: T_Xarray, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions | None = None, + limit_area: LimitAreaOptions | None = None, + max_gap: T_GapLength | None = None, + ) -> None: """An object that allows for flexible masking of gaps. You should use DataArray.fill_gaps() or Dataset.fill_gaps() to construct this object instead of calling this constructor directly. Parameters ---------- content : DataArray or Dataset The object to be masked. - mask : DataArray or Dataset or None - Boolean gap mask to be applied to the content. If None, the content is not masked. - dim : Hashable - The dimension along which the mask was created. When filling gaps and no dimension is specified for the filling method, this dimension is used. + + See xarray.DataArray.fill_gaps or xarray.Dataset.fill_gaps for an explanation of the remaining parameters. 
See Also -------- @@ -579,21 +586,50 @@ def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> N """ self._content = content - self.mask = mask self._dim = dim + self._use_coordinate = use_coordinate + self._limit = limit + self._limit_direction = limit_direction + self._limit_area = limit_area + self._max_gap = max_gap + + def _get_mask(self, limit_direction) -> T_Xarray: + mask = _get_gap_mask( + obj=self._content, + dim=self._dim, + limit=self._limit, + limit_direction=limit_direction, + limit_area=self._limit_area, + limit_use_coordinate=self._use_coordinate, + max_gap=self._max_gap, + max_gap_use_coordinate=self._use_coordinate, + ) + return mask - def _apply_mask(self, filled: T_Xarray) -> T_Xarray: - if self.mask is not None: - filled = filled.where(~self.mask, other=self._content) + def _apply_mask(self, filled: T_Xarray, mask: T_Xarray) -> T_Xarray: + if mask is not None: + filled = filled.where(~mask, other=self._content) return filled - def ffill(self, dim: Hashable | None = None) -> T_Xarray: + def get_mask(self) -> T_Xarray: + """Return the gap mask. + + Returns + ------- + mask : DataArray or Dataset + Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps after applying a filling method. + """ + limit_direction = self._limit_direction + if limit_direction is None: + limit_direction = "both" + mask = self._get_mask(limit_direction) + return mask + + def ffill(self) -> T_Xarray: """Partly fill missing values in this object's data by applying ffill to all unmasked values. Parameters ---------- - dim : Hashable or None, default None - Dimension along which to fill missing values. If None, the dimension used to create the mask is used. 
Returns ------- @@ -605,18 +641,18 @@ def ffill(self, dim: Hashable | None = None) -> T_Xarray: DataArray.ffill Dataset.ffill """ - if dim is None: - dim = self._dim - return self._apply_mask(self._content.ffill(dim)) + if self._limit_direction is None: + limit_direction = "forward" + elif self._limit_direction != "forward": + raise ValueError( + f"limit_direction='{self._limit_direction}' is not allowed with ffill, must be 'forward'." + ) + mask = self._get_mask(limit_direction) + return self._apply_mask(self._content.ffill(self._dim), mask) - def bfill(self, dim: Hashable | None = None) -> T_Xarray: + def bfill(self) -> T_Xarray: """Partly fill missing values in this object's data by applying bfill to all unmasked values. - Parameters - ---------- - dim : Hashable or None, default None - Dimension along which to fill missing values. If None, the dimension used to create the mask is used. - Returns ------- filled : same type as caller @@ -627,9 +663,14 @@ def bfill(self, dim: Hashable | None = None) -> T_Xarray: DataArray.bfill Dataset.bfill """ - if dim is None: - dim = self._dim - return self._apply_mask(self._content.bfill(dim)) + if self._limit_direction is None: + limit_direction = "backward" + elif self._limit_direction != "backward": + raise ValueError( + f"limit_direction='{self._limit_direction}' is not allowed with bfill, must be 'backward'." + ) + mask = self._get_mask(limit_direction) + return self._apply_mask(self._content.bfill(self._dim), mask) def fillna(self, value) -> T_Xarray: """Partly fill missing values in this object's data by applying fillna to all unmasked values. 
@@ -652,7 +693,8 @@ def fillna(self, value) -> T_Xarray: DataArray.fillna Dataset.fillna """ - return self._apply_mask(self._content.fillna(value)) + mask = self.get_mask() + return self._apply_mask(self._content.fillna(value), mask) def interpolate_na( self, @@ -681,6 +723,7 @@ def interpolate_na( """ if dim is None: dim = self._dim + mask = self.get_mask() return self._apply_mask( self._content.interpolate_na( dim=dim, @@ -690,34 +733,11 @@ def interpolate_na( max_gap=None, keep_attrs=keep_attrs, **kwargs, - ) + ), + mask, ) -def mask_gaps( - obj: T_Xarray, - dim: Hashable, - use_coordinate: bool | Hashable = True, - limit: T_GapLength | None = None, - limit_direction: LimitDirectionOptions = "both", - limit_area: LimitAreaOptions | None = None, - max_gap: T_GapLength | None = None, -) -> GapMask[T_Xarray]: - """Mask continuous gaps in the data, providing functionality to control gap length and offsets""" - - mask = _get_gap_mask( - obj, - dim, - limit, - limit_direction, - limit_area, - use_coordinate, - max_gap, - use_coordinate, - ) - return GapMask(obj, mask, dim) - - def interp_na( obj: T_Xarray, dim: Hashable, diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index ff84f102a55..62f3a3aeedf 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -198,16 +198,12 @@ def test_interpolate_pd_compat_polynomial(): def test_interpolate_pd_compat_limits(): shapes = [(7, 7)] frac_nan = 0.5 - method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' limits = [ None, 1, 3, ] # pandas 2.1.4 is currently unable to handle coordinate based limits! 
- limit_directions = [ - "forward", - "backward", - ] # xarray does not support 'None' (pandas: None='forward', unless method='bfill') + limit_directions = ["forward", "backward", None] limit_areas = [None, "outside", "inside"] for shape, limit, limit_direction, limit_area in itertools.product( @@ -221,21 +217,37 @@ def test_interpolate_pd_compat_limits(): limit_direction=limit_direction, limit_area=limit_area, use_coordinate=False, - ).interpolate_na( - dim=dim, - method=method, - use_coordinate=True, - fill_value="extrapolate", - ) - expected = df.interpolate( - method=method, - axis=da.get_axis_num(dim), - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value="extrapolate", ) - np.testing.assert_allclose(actual.values, expected.values) + if ( + limit_direction is None + ): # xarray: 'None'='both', while pandas: None='forward'. But for ffill/bfill, they both should default to forward/backward + filled = actual.ffill() + expected = df.ffill( + axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area + ) + np.testing.assert_allclose(filled.values, expected.values) + filled = actual.bfill() + expected = df.bfill( + axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area + ) + np.testing.assert_allclose(filled.values, expected.values) + else: + method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' + actual = actual.interpolate_na( + dim=dim, + method=method, + use_coordinate=True, + fill_value="extrapolate", + ) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value="extrapolate", + ) + np.testing.assert_allclose(actual.values, expected.values) @requires_scipy @@ -1131,6 +1143,25 @@ def test_fill_gaps_limit(): coords = {"yt": ("y", times)} da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + mask = da.fill_gaps(dim="y", 
limit_direction="backward") + with pytest.raises( + ValueError, + match=r"limit_direction='backward' is not allowed with ffill, must be 'forward'", + ): + mask.ffill() + mask = da.fill_gaps(dim="y", limit_direction="both") + with pytest.raises( + ValueError, + match=r"limit_direction='both' is not allowed with ffill, must be 'forward'", + ): + mask.ffill() + mask = da.fill_gaps(dim="y", limit_direction="forward") + with pytest.raises( + ValueError, + match=r"limit_direction='forward' is not allowed with bfill, must be 'backward'", + ): + mask.bfill() + actual = da.fill_gaps(dim="y", limit=None).interpolate_na( dim="y", fill_value="extrapolate" ) @@ -1188,12 +1219,21 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) - actual = mask.ffill(dim="time") + actual = mask.ffill() + expected = da.copy( + data=[ + [1, 2, 3, 4, 4, 6, 6, n, n, 10, 11, 11], + [n, n, 3, 3, n, 6, 6, n, n, 10, 10, n], + [n, 2, 3, 4, 4, 6, 6, n, n, 10, 11, 11], + ] + ) + assert_equal(actual, expected) + actual = mask.bfill() expected = da.copy( data=[ - [1, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], - [n, n, 3, 3, 3, 6, 6, n, 6, 10, 10, n], - [n, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], + [1, 2, 3, 4, 6, 6, n, n, 10, 10, 11, n], + [n, 3, 3, n, 6, 6, n, n, 10, 10, n, n], + [2, 2, 3, 4, 6, 6, n, n, 10, 10, 11, n], ] ) assert_equal(actual, expected) From e79f90f7b0236adff8d4d39e97757721ab02a7a5 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:20:28 +0100 Subject: [PATCH 39/46] Update api.rst and GapMask Attributes --- doc/api.rst | 1 + xarray/core/missing.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 1e64757b0e0..343ade81c03 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1507,6 +1507,7 @@ GapMask object GapMask.ffill GapMask.bfill GapMask.interpolate_na + GapMask.get_mask Accessors ========= diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 
129e93a225b..78aafd4f39d 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -554,11 +554,13 @@ class GapMask(Generic[T_Xarray]): # Attributes # ---------- - # mask : DataArray or Dataset or None _content: T_Xarray _dim: Hashable - mask: T_Xarray | None - """Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps after applying a filling method.""" + _use_coordinate: bool | Hashable + _limit: T_GapLength | None + _limit_direction: LimitDirectionOptions | None + _limit_area: LimitAreaOptions | None + _max_gap: T_GapLength | None def __init__( self, @@ -577,7 +579,8 @@ def __init__( content : DataArray or Dataset The object to be masked. - See xarray.DataArray.fill_gaps or xarray.Dataset.fill_gaps for an explanation of the remaining parameters. + Other: + See xarray.DataArray.fill_gaps or xarray.Dataset.fill_gaps for an explanation of the remaining parameters. See Also -------- @@ -617,7 +620,7 @@ def get_mask(self) -> T_Xarray: Returns ------- mask : DataArray or Dataset - Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps after applying a filling method. + Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps. 
""" limit_direction = self._limit_direction if limit_direction is None: From e560701d526ff617f3bcf4d4b007686045229375 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:22:03 +0100 Subject: [PATCH 40/46] Split ffill/bfill pandas compatibility test into separate test method --- xarray/tests/test_missing.py | 71 ++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 62f3a3aeedf..1103fb0ab48 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -197,13 +197,14 @@ def test_interpolate_pd_compat_polynomial(): @requires_scipy def test_interpolate_pd_compat_limits(): shapes = [(7, 7)] + method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' frac_nan = 0.5 limits = [ None, 1, 3, ] # pandas 2.1.4 is currently unable to handle coordinate based limits! - limit_directions = ["forward", "backward", None] + limit_directions = ["forward", "backward"] limit_areas = [None, "outside", "inside"] for shape, limit, limit_direction, limit_area in itertools.product( @@ -217,37 +218,69 @@ def test_interpolate_pd_compat_limits(): limit_direction=limit_direction, limit_area=limit_area, use_coordinate=False, + ).interpolate_na( + dim=dim, + method=method, + use_coordinate=True, + fill_value="extrapolate", + ) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value="extrapolate", + ) + np.testing.assert_allclose(actual.values, expected.values) + + +def test_fill_pd_compat_limits(): + shapes = [(7, 7)] + frac_nan = 0.5 + limits = [ + None, + 1, + 3, + ] # pandas 2.1.4 is currently unable to handle coordinate based limits! 
+ limit_areas = [None, "outside", "inside"] + + for shape, limit, limit_area in itertools.product(shapes, limits, limit_areas): + da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) + for dim in ["time", "x"]: + masked = da.fill_gaps( + dim=dim, + limit=limit, + limit_area=limit_area, + use_coordinate=False, ) - if ( - limit_direction is None - ): # xarray: 'None'='both', while pandas: None='forward'. But for ffill/bfill, they both should default to forward/backward - filled = actual.ffill() - expected = df.ffill( + filled_forward = masked.ffill() + filled_backward = masked.bfill() + if pd.__version__ >= "2.2.0": + expected_forward = df.ffill( axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area ) - np.testing.assert_allclose(filled.values, expected.values) - filled = actual.bfill() - expected = df.bfill( + expected_backward = df.bfill( axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area ) - np.testing.assert_allclose(filled.values, expected.values) else: - method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' - actual = actual.interpolate_na( - dim=dim, - method=method, - use_coordinate=True, + expected_forward = df.interpolate( + method="ffill", + axis=da.get_axis_num(dim), + limit=limit, + limit_area=limit_area, fill_value="extrapolate", ) - expected = df.interpolate( - method=method, + expected_backward = df.interpolate( + method="bfill", axis=da.get_axis_num(dim), limit=limit, - limit_direction=limit_direction, limit_area=limit_area, fill_value="extrapolate", ) - np.testing.assert_allclose(actual.values, expected.values) + + np.testing.assert_allclose(filled_forward.values, expected_forward.values) + np.testing.assert_allclose(filled_backward.values, expected_backward.values) @requires_scipy From 281ba6d33d80db6c6cd28af66d501cf3cc218552 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:30:08 
+0100 Subject: [PATCH 41/46] Fix doc examples and correct return type hint --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/core/missing.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 5869b31da7f..b66deb41cf9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3956,7 +3956,7 @@ def fill_gaps( Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") + >>> da.fill_gaps(dim="x", max_gap=2).ffill() Size: 56B array([nan, 2., nan, nan, 5., 5., 0.]) Coordinates: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 694696ff937..e7b9defd358 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7138,7 +7138,7 @@ def fill_gaps( A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan - >>> ds.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") + >>> ds.fill_gaps(dim="x", max_gap=2).ffill() Size: 168B Dimensions: (x: 7) Coordinates: diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 78aafd4f39d..2d36b359ccf 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -596,7 +596,7 @@ def __init__( self._limit_area = limit_area self._max_gap = max_gap - def _get_mask(self, limit_direction) -> T_Xarray: + def _get_mask(self, limit_direction) -> T_Xarray | None: mask = _get_gap_mask( obj=self._content, dim=self._dim, From fd04d5483744739220382d31f76c473d9632fc10 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:36:14 +0100 Subject: [PATCH 42/46] Require bottleneck for ffill test --- xarray/tests/test_missing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 1103fb0ab48..72c0fa4c7be 100644 --- a/xarray/tests/test_missing.py +++ 
b/xarray/tests/test_missing.py @@ -235,6 +235,7 @@ def test_interpolate_pd_compat_limits(): np.testing.assert_allclose(actual.values, expected.values) +@requires_bottleneck def test_fill_pd_compat_limits(): shapes = [(7, 7)] frac_nan = 0.5 From 94c88e30c59c1bcab7791b5ea642eafb54105d45 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:43:21 +0100 Subject: [PATCH 43/46] Fix mask type hint in two positions. --- xarray/core/missing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 2d36b359ccf..8eb2cbe8a44 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -609,12 +609,12 @@ def _get_mask(self, limit_direction) -> T_Xarray | None: ) return mask - def _apply_mask(self, filled: T_Xarray, mask: T_Xarray) -> T_Xarray: + def _apply_mask(self, filled: T_Xarray, mask: T_Xarray | None) -> T_Xarray: if mask is not None: filled = filled.where(~mask, other=self._content) return filled - def get_mask(self) -> T_Xarray: + def get_mask(self) -> T_Xarray | None: """Return the gap mask. 
Returns From 046a5924dab48f415b8691f19bacea021fbcd2a8 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:02:25 +0100 Subject: [PATCH 44/46] Remove fill_value in pandas when method=ffill/bfill --- xarray/tests/test_missing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 72c0fa4c7be..2e84d3f6096 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -270,14 +270,12 @@ def test_fill_pd_compat_limits(): axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area, - fill_value="extrapolate", ) expected_backward = df.interpolate( method="bfill", axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area, - fill_value="extrapolate", ) np.testing.assert_allclose(filled_forward.values, expected_forward.values) From ca4547d52ff36891ce6009d4821a301ef40818a2 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:59:16 +0100 Subject: [PATCH 45/46] Remove ffill check against old pandas version --- xarray/tests/test_missing.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 2e84d3f6096..06ffea540d7 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -257,30 +257,22 @@ def test_fill_pd_compat_limits(): ) filled_forward = masked.ffill() filled_backward = masked.bfill() - if pd.__version__ >= "2.2.0": + if ( + pd.__version__ >= "2.2.0" + ): # limit_area was introduced in pandas ffill in v2.2.0 expected_forward = df.ffill( axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area ) expected_backward = df.bfill( axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area ) - else: - expected_forward = df.interpolate( - method="ffill", - axis=da.get_axis_num(dim), - limit=limit, - limit_area=limit_area, + 
np.testing.assert_allclose( + filled_forward.values, expected_forward.values ) - expected_backward = df.interpolate( - method="bfill", - axis=da.get_axis_num(dim), - limit=limit, - limit_area=limit_area, + np.testing.assert_allclose( + filled_backward.values, expected_backward.values ) - np.testing.assert_allclose(filled_forward.values, expected_forward.values) - np.testing.assert_allclose(filled_backward.values, expected_backward.values) - @requires_scipy def test_interpolate_unsorted_index_raises(): From 76424b1762e4e4f460f3a616ffaf1db445ef32ed Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 24 Jan 2025 22:27:04 +0100 Subject: [PATCH 46/46] Add additional tests for the direction kwarg in combination with ffill/bfill --- xarray/core/missing.py | 8 ++++---- xarray/tests/test_missing.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 8eb2cbe8a44..81da5dc9077 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -644,9 +644,9 @@ def ffill(self) -> T_Xarray: DataArray.ffill Dataset.ffill """ - if self._limit_direction is None: + if self._limit_direction is None or self._limit_direction == "forward": limit_direction = "forward" - elif self._limit_direction != "forward": + else: raise ValueError( f"limit_direction='{self._limit_direction}' is not allowed with ffill, must be 'forward'." ) @@ -666,9 +666,9 @@ def bfill(self) -> T_Xarray: DataArray.bfill Dataset.bfill """ - if self._limit_direction is None: + if self._limit_direction is None or self._limit_direction == "backward": limit_direction = "backward" - elif self._limit_direction != "backward": + else: raise ValueError( f"limit_direction='{self._limit_direction}' is not allowed with bfill, must be 'backward'." 
) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 06ffea540d7..d64426e255b 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -1186,6 +1186,22 @@ def test_fill_gaps_limit(): ): mask.bfill() + actual = da.fill_gaps(dim="y").ffill() + expected = da.copy(data=[n, n, 2, 2, 2, 5, 5, 5]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit_direction="forward").ffill() + expected = da.copy(data=[n, n, 2, 2, 2, 5, 5, 5]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=1).bfill() + expected = da.copy(data=[n, 2, 2, n, 5, 5, n, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=1, limit_direction="backward").bfill() + expected = da.copy(data=[n, 2, 2, n, 5, 5, n, n]) + assert_equal(actual, expected) + actual = da.fill_gaps(dim="y", limit=None).interpolate_na( dim="y", fill_value="extrapolate" )