diff --git a/doc/api.rst b/doc/api.rst index f731ac1c59a..343ade81c03 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -167,6 +167,7 @@ Missing value handling Dataset.fillna Dataset.ffill Dataset.bfill + Dataset.fill_gaps Dataset.interpolate_na Dataset.where Dataset.isin @@ -357,6 +358,7 @@ Missing value handling DataArray.fillna DataArray.ffill DataArray.bfill + DataArray.fill_gaps DataArray.interpolate_na DataArray.where DataArray.isin @@ -1492,6 +1494,21 @@ DataArray DataArrayResample.dims DataArrayResample.groups +GapMask object +=============== + +.. currentmodule:: xarray.core.missing + +.. autosummary:: + :toctree: generated/ + + GapMask + GapMask.fillna + GapMask.ffill + GapMask.bfill + GapMask.interpolate_na + GapMask.get_mask + Accessors ========= diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index 5d7002484c2..8ec01284126 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -203,6 +203,21 @@ Xarray also provides the ``max_gap`` keyword argument to limit the interpolation data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na` for more. +All of the above methods by default fill gaps of any size in the data. If you want fine control over the size of the gaps that are filled, you can use :py:meth:`~xarray.DataArray.fill_gaps`. For example, consider a series of air temperature measurements with gaps: + +.. ipython:: python + + n = np.nan + temperature = xr.DataArray( + [n, 1.1, n, n, n, 2, n, n, n, n, 2.3], + coords={"time": xr.Variable("time", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}, + ) + temperature.fill_gaps( + "time", limit=1, limit_direction="both", max_gap=4 + ).interpolate_na("time") + +In this example, we interpolate valid measurements up to one hour forward and backward in time. However, if a gap is longer than four hours, nothing is interpolated. 
:py:meth:`~xarray.DataArray.fill_gaps` returns a :py:class:`~xarray.core.missing.GapMask` object that works with all filling methods (:py:meth:`~xarray.DataArray.ffill`, :py:meth:`~xarray.DataArray.bfill`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.interpolate_na`). See :py:meth:`~xarray.DataArray.fill_gaps` for more information on the available options. + .. _agg: Aggregation diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d287564cfe5..b66deb41cf9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -97,6 +97,7 @@ from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy + from xarray.core.missing import GapMask from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling from xarray.core.types import ( @@ -109,6 +110,8 @@ GroupIndices, GroupInput, InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QuantileMethods, @@ -120,6 +123,7 @@ SideOptions, T_ChunkDimFreq, T_ChunksFreq, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DataArrayWeighted @@ -3475,6 +3479,11 @@ def fillna(self, value: Any) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> da = xr.DataArray( @@ -3520,19 +3529,11 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", limit: int | None = None, - use_coordinate: bool | str = True, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, + use_coordinate: bool | Hashable = True, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: @@ -3540,7 +3541,7 @@ def interpolate_na( Parameters ---------- - dim : Hashable or 
None, optional + dim : Hashable Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -3555,17 +3556,17 @@ def interpolate_na( - 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their respective :py:class:`scipy.interpolate` classes. + limit : int or None, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done in the forward direction, regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if equally-spaced along ``dim``. If True, the IndexVariable `dim` is used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. - limit : int or None, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, - see ``max_gap``. max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3603,6 +3604,7 @@ def interpolate_na( See Also -------- + DataArray.fill_gaps numpy.interp scipy.interpolate @@ -3611,6 +3613,7 @@ def interpolate_na( >>> da = xr.DataArray( ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} ... 
) + >>> da Size: 40B array([nan, 2., 3., nan, 0.]) @@ -3645,7 +3648,7 @@ def interpolate_na( def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3663,6 +3666,11 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> temperature = np.array( @@ -3729,7 +3737,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3747,6 +3755,11 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: ------- filled : DataArray + See Also + -------- + :ref:`missing_values` + DataArray.fill_gaps + Examples -------- >>> temperature = np.array( @@ -3810,6 +3823,163 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: return bfill(self, dim, limit=limit) + def fill_gaps( + self, + dim: Hashable, + *, + use_coordinate: bool | Hashable = True, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions | None = None, + limit_area: LimitAreaOptions | None = None, + max_gap: T_GapLength | None = None, + ) -> GapMask[DataArray]: + """Fill in gaps (consecutive missing values) in the data. + + - Firstly, ``fill_gaps`` determines **which** values to fill, with options for fine control how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + - Secondly, calling one of several filling methods determines **how** to fill the selected values. + + + *Requires numbagg or bottleneck.* + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. 
+ use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When filling along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction: {"forward", "backward", "both"}, default: None + Consecutive NaNs will be filled in this direction. + If not specified, the default is + + - "forward" if ``ffill`` is used + - "backward" if ``bfill`` is used + - "both" otherwise + + raises ValueError if not "forward" and ``ffill`` is used or not "backward" and ``bfill`` is used. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values + - "outside": Only fill NaNs outside valid values (extrapolate). 
+ max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + + Returns + ------- + Gap Mask: core.missing.GapMask + + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. + + See Also + -------- + :ref:`missing_values` + DataArray.fillna + DataArray.ffill + DataArray.bfill + DataArray.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> da = xr.DataArray( + ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], + ... dims="x", + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... 
) + + >>> da + Size: 56B + array([nan, 2., nan, nan, 5., nan, 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + + >>> da.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) + Size: 56B + array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + + >>> da.fill_gaps(dim="x", max_gap=2).ffill() + Size: 56B + array([nan, 2., nan, nan, 5., 5., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + + >>> da.fill_gaps(dim="x", limit_area="inside").fillna(9) + Size: 56B + array([nan, 2., 9., 9., 5., 9., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + """ + from xarray.core.missing import GapMask + + return GapMask( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) + def combine_first(self, other: Self) -> Self: """Combine two DataArray objects, with union of coordinates. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 74f90ce9eea..4717ce0d566 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -142,6 +142,7 @@ from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy from xarray.core.merge import CoercibleMapping, CoercibleValue, _MergeResult + from xarray.core.missing import GapMask from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling from xarray.core.types import ( @@ -160,6 +161,8 @@ GroupInput, InterpOptions, JoinOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QueryEngineOptions, @@ -170,6 +173,7 @@ T_ChunkDimFreq, T_Chunks, T_DatasetPadConstantValues, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DatasetWeighted @@ -6646,6 +6650,11 @@ def fillna(self, value: Any) -> Self: ------- Dataset + See Also + -------- + :ref:`missing_values` + Dataset.fill_gaps + Examples -------- >>> ds = xr.Dataset( @@ -6707,19 
+6716,12 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", limit: int | None = None, use_coordinate: bool | Hashable = True, - max_gap: ( - int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - | None - ) = None, + max_gap: T_GapLength | None = None, + keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: """Fill in NaNs by interpolating according to different methods. @@ -6749,7 +6751,7 @@ def interpolate_na( coordinate variable to use as the index. limit : int, default: None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of + or None for no limit. This filling is done in the forward direction, regardless of the size of the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ @@ -6776,6 +6778,10 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. **kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6791,6 +6797,7 @@ def interpolate_na( See Also -------- + Dataset.fill_gaps numpy.interp scipy.interpolate @@ -6805,6 +6812,7 @@ def interpolate_na( ... }, ... coords={"x": [0, 1, 2, 3, 4]}, ... 
) + >>> ds Size: 200B Dimensions: (x: 5) @@ -6840,6 +6848,9 @@ def interpolate_na( """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=True) + new = _apply_over_vars_with_dim( interp_na, self, @@ -6848,14 +6859,16 @@ def interpolate_na( limit=limit, use_coordinate=use_coordinate, max_gap=max_gap, + keep_attrs=keep_attrs, **kwargs, ) + new.attrs = self.attrs if keep_attrs else None return new def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6868,6 +6881,16 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Returns + ------- + Dataset + + See Also + -------- + :ref:`missing_values` + Dataset.fill_gaps + Dataset.bfill + Examples -------- >>> time = pd.date_range("2023-01-01", periods=10, freq="D") @@ -6902,14 +6925,6 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: * time (time) datetime64[ns] 80B 2023-01-01 2023-01-02 ... 2023-01-10 Data variables: data (time) float64 80B 1.0 1.0 1.0 nan 5.0 5.0 5.0 8.0 8.0 10.0 - - Returns - ------- - Dataset - - See Also - -------- - Dataset.bfill """ from xarray.core.missing import _apply_over_vars_with_dim, ffill @@ -6919,7 +6934,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6933,6 +6948,16 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). 
+ Returns + ------- + Dataset + + See Also + -------- + :ref:`missing_values` + Dataset.fill_gaps + Dataset.ffill + Examples -------- >>> time = pd.date_range("2023-01-01", periods=10, freq="D") @@ -6967,19 +6992,181 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: * time (time) datetime64[ns] 80B 2023-01-01 2023-01-02 ... 2023-01-10 Data variables: data (time) float64 80B 1.0 nan 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + """ + from xarray.core.missing import _apply_over_vars_with_dim, bfill + + new = _apply_over_vars_with_dim(bfill, self, dim=dim, limit=limit) + return new + + def fill_gaps( + self, + dim: Hashable, + *, + use_coordinate: bool | Hashable = True, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions | None = None, + limit_area: LimitAreaOptions | None = None, + max_gap: T_GapLength | None = None, + ) -> GapMask[Dataset]: + """Fill in gaps (consecutive missing values) in the data. + + - Firstly, ``fill_gaps`` determines **which** values to fill, with options for fine control how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + - Secondly, calling one of several filling methods determines **how** to fill the selected values. + + + *Requires numbagg or bottleneck.* + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. + use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. 
When filling along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction: {"forward", "backward", "both"}, default: None + Consecutive NaNs will be filled in this direction. + If not specified, the default is + + - "forward" if ``ffill`` is used + - "backward" if ``bfill`` is used + - "both" otherwise + + raises ValueError if not "forward" and ``ffill`` is used or not "backward" and ``bfill`` is used. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values + - "outside": Only fill NaNs outside valid values (extrapolate). + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. 
When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively Returns ------- - Dataset + Gap Mask: core.missing.GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. See Also -------- + :ref:`missing_values` + Dataset.fillna Dataset.ffill + Dataset.bfill + Dataset.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> ds = xr.Dataset( + ... { + ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), + ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), + ... }, + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... 
) + + >>> ds + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan + + >>> ds.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 + B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan + + >>> ds.fill_gaps(dim="x", max_gap=2).ffill() + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 + + >>> ds.fill_gaps(dim="x", limit_area="inside").fillna(9) + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 + B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan """ - from xarray.core.missing import _apply_over_vars_with_dim, bfill + from xarray.core.missing import GapMask - new = _apply_over_vars_with_dim(bfill, self, dim=dim, limit=limit) - return new + return GapMask( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) def combine_first(self, other: Self) -> Self: """Combine two Datasets, default to data_vars of self. 
diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b4ca36b31df..81da5dc9077 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,19 +1,18 @@ from __future__ import annotations -import datetime as dt import itertools import warnings from collections import ChainMap from collections.abc import Callable, Generator, Hashable, Sequence from functools import partial from numbers import Number -from typing import TYPE_CHECKING, Any, TypeVar, get_args +from typing import TYPE_CHECKING, Any, Generic, TypeVar, get_args import numpy as np import pandas as pd from xarray.core import utils -from xarray.core.common import _contains_datetime_like_objects, ones_like +from xarray.core.common import _contains_datetime_like_objects from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( datetime_to_numeric, @@ -25,7 +24,15 @@ transpose, ) from xarray.core.options import _get_keep_attrs -from xarray.core.types import Interp1dOptions, InterpnOptions, InterpOptions +from xarray.core.types import ( + Interp1dOptions, + InterpnOptions, + InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, + T_GapLength, + T_Xarray, +) from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import ( Variable, @@ -34,9 +41,6 @@ from xarray.namedarray.pycompat import is_chunked_array if TYPE_CHECKING: - from xarray.core.dataarray import DataArray - from xarray.core.dataset import Dataset - InterpCallable = Callable[..., np.ndarray] # interpn Interpolator = Callable[..., Callable[..., np.ndarray]] # *Interpolator # interpolator objects return callables that can be evaluated @@ -45,33 +49,191 @@ T = TypeVar("T") -def _get_nan_block_lengths( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): +_FILL_MISSING_DOCSTRING_TEMPLATE = """\ +Partly fill nan values in this object's data by applying `{name}` to all unmasked values. 
+ +Parameters +---------- +**kwargs : dict + Additional keyword arguments passed on to `{name}`. + +Returns +------- +filled : same type as caller + New object with `{name}` applied to all unmasked values. +""" + + +def _get_gap_left_edge( + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: + left = index.where(~obj.isnull()).ffill(dim).transpose(*obj.dims) + if outside: + return left.fillna(index[0]) + return left + + +def _get_gap_right_edge( + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: + right = index.where(~obj.isnull()).bfill(dim).transpose(*obj.dims) + if outside: + return right.fillna(index[-1]) + return right + + +def _get_gap_dist_to_left_edge( + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: + return (index - _get_gap_left_edge(obj, dim, index)).transpose(*obj.dims) + + +def _get_gap_dist_to_right_edge( + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: + return (_get_gap_right_edge(obj, dim, index) - index).transpose(*obj.dims) + + +def _get_limit_fill_mask( + obj: T_Xarray, + dim: Hashable, + index: Variable, + limit: int | float | np.number, + limit_direction: LimitDirectionOptions, +) -> T_Xarray: + # At the left boundary, distance to left is nan. + # For nan, a<=b and ~(a>b) behave differently + if limit_direction == "forward": + limit_mask = ~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit) + elif limit_direction == "backward": + limit_mask = ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) + elif limit_direction == "both": + limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & ( + ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) + ) + else: + raise ValueError( + f"limit_direction must be one of 'forward', 'backward', 'both'. 
Got {limit_direction}" + ) + return limit_mask + + +def _get_limit_area_mask( + obj: T_Xarray, dim: Hashable, index: Variable, limit_area +) -> T_Xarray: + if limit_area == "inside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).isnull() + | _get_gap_right_edge(obj, dim, index).isnull() + ) + elif limit_area == "outside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).notnull() + & _get_gap_right_edge(obj, dim, index).notnull() + ) + area_mask = area_mask & obj.isnull() + else: + raise ValueError( + f"limit_area must be one of 'inside', 'outside' or None. Got {limit_area}" + ) + return area_mask + + +def _get_nan_block_lengths(obj: T_Xarray, dim: Hashable, index: Variable) -> T_Xarray: """ Return an object where each NaN element in 'obj' is replaced by the length of the gap the element is in. """ - - # make variable so that we get broadcasting for free - index = Variable([dim], index) - - # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 - arange = ones_like(obj) * index - valid = obj.notnull() - valid_arange = arange.where(valid) - cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) - - nan_block_lengths = ( - cumulative_nans.diff(dim=dim, label="upper") - .reindex({dim: obj[dim]}) - .where(valid) - .bfill(dim=dim) - .where(~valid, 0) - .fillna(index[-1] - valid_arange.max(dim=[dim])) + return _get_gap_right_edge(obj, dim, index, outside=True) - _get_gap_left_edge( + obj, dim, index, outside=True ) - return nan_block_lengths + +def _get_max_gap_mask( + obj: T_Xarray, dim: Hashable, index: Variable, max_gap: int | float | np.number +) -> T_Xarray: + nan_block_lengths = _get_nan_block_lengths(obj, dim, index) + return nan_block_lengths > max_gap + + +def _get_gap_mask( + obj: T_Xarray, + dim: Hashable, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate=False, + max_gap: T_GapLength | None = None, + 
max_gap_use_coordinate=False, +) -> T_Xarray | None: + # Input checking + ##Limit + if not is_scalar(limit): + raise ValueError("limit must be a scalar.") + + if limit is None: + limit = np.inf + else: + if limit_use_coordinate is False: + if not isinstance(limit, Number | np.number): + raise TypeError( + f"Expected integer or floating point limit since limit_use_coordinate=False. Received {type(limit).__name__}." + ) + if _is_time_index(_get_raw_interp_index(obj, dim, limit_use_coordinate)): + limit = timedelta_to_numeric(limit) + + ## Max_gap + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if max_gap is None: + max_gap = np.inf + else: + if not max_gap_use_coordinate: + if not isinstance(max_gap, Number | np.number): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." + ) + + if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): + max_gap = timedelta_to_numeric(max_gap) + + # Which masks are really needed? 
+ need_limit_mask = limit != np.inf or limit_direction != "both" + need_area_mask = limit_area is not None + need_max_gap_mask = max_gap != np.inf + # Calculate indexes + if need_limit_mask or need_area_mask: + index_limit = get_clean_interp_index( + obj, dim, use_coordinate=limit_use_coordinate + ) + # index_limit = ones_like(obj) * index_limit + if need_max_gap_mask: + index_max_gap = get_clean_interp_index( + obj, dim, use_coordinate=max_gap_use_coordinate + ) + # index_max_gap = ones_like(obj) * index_max_gap + if not (need_limit_mask or need_area_mask or need_max_gap_mask): + return None + + # Calculate individual masks + masks = [] + if need_limit_mask: + # due to the dynamic typing of limit, mypy cannot infer the correct type + masks.append( + _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) # type: ignore[arg-type] + ) + + if need_area_mask: + masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) + + if need_max_gap_mask: + masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) # type: ignore[arg-type] + # Combine masks + mask = masks[0] + for m in masks[1:]: + mask |= m + return mask class BaseInterpolator: @@ -243,9 +405,45 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds +def _get_raw_interp_index( + arr: T_Xarray, dim: Hashable, use_coordinate: bool | Hashable = True +) -> pd.Index: + """Return index to use for x values in interpolation or curve fitting. 
+ In comparison to get_clean_interp_index, this function does not convert + to numeric values.""" + + if dim not in arr.dims: + raise ValueError(f"{dim} is not a valid dimension") + + if use_coordinate is False: + return pd.RangeIndex(arr.sizes[dim], name=dim) + + elif use_coordinate is True: + coordinate = arr.coords[ + dim + ] # this will default to a linear coordinate, if no index is present + else: # string/hashable + coordinate = arr.coords[use_coordinate] + if dim not in coordinate.dims: + raise ValueError( + f"Coordinate given by {use_coordinate} must have dimension {dim}." + ) + + if coordinate.ndim != 1: + raise ValueError( + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {coordinate.ndim}D." + ) + index = coordinate.to_index() + return index + + def get_clean_interp_index( - arr, dim: Hashable, use_coordinate: Hashable | bool = True, strict: bool = True -): + arr: T_Xarray, + dim: Hashable, + use_coordinate: bool | Hashable = True, + strict: bool = True, +) -> Variable: """Return index to use for x values in interpolation or curve fitting. Parameters @@ -254,7 +452,7 @@ def get_clean_interp_index( Array to interpolate or fit to a curve. dim : str Name of dimension along which to fit. - use_coordinate : str or bool + use_coordinate : bool or hashable If use_coordinate is True, the coordinate that shares the name of the dimension along which interpolation is being performed will be used as the x values. If False, the x values are set as an equally spaced sequence. @@ -272,25 +470,9 @@ def get_clean_interp_index( to time deltas with respect to 1970-01-01. """ - # Question: If use_coordinate is a string, what role does `dim` play? 
from xarray.coding.cftimeindex import CFTimeIndex - if use_coordinate is False: - axis = arr.get_axis_num(dim) - return np.arange(arr.shape[axis], dtype=np.float64) - - if use_coordinate is True: - index = arr.get_index(dim) - - else: # string - index = arr.coords[use_coordinate] - if index.ndim != 1: - raise ValueError( - f"Coordinates used for interpolation must be 1D, " - f"{use_coordinate} is {index.ndim}D." - ) - index = index.to_index() - + index = _get_raw_interp_index(arr, dim, use_coordinate) # TODO: index.name is None for multiindexes # set name for nice error messages below if isinstance(index, pd.MultiIndex): @@ -308,70 +490,42 @@ def get_clean_interp_index( if isinstance(index, CFTimeIndex | pd.DatetimeIndex): offset = type(index[0])(1970, 1, 1) if isinstance(index, CFTimeIndex): - index = index.values - index = Variable( - data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"), - dims=(dim,), - ) - - # raise if index cannot be cast to a float (e.g. MultiIndex) - try: - index = index.values.astype(np.float64) - except (TypeError, ValueError) as err: - # pandas raises a TypeError - # xarray/numpy raise a ValueError - raise TypeError( - f"Index {index.name!r} must be castable to float64 to support " - f"interpolation or curve fitting, got {type(index).__name__}." - ) from err + values = datetime_to_numeric( + index.values, offset=offset, datetime_unit="ns" + ) + else: + values = datetime_to_numeric(index, offset=offset, datetime_unit="ns") + else: # if numeric or standard calendar index: try to cast to float + try: + values = index.values.astype(np.float64) + # raise if index cannot be cast to a float (e.g. MultiIndex) + except (TypeError, ValueError) as err: + # pandas raises a TypeError + # xarray/numpy raise a ValueError + raise TypeError( + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation or curve fitting, got {type(index).__name__}." 
+ ) from err + var = Variable([dim], values) + return var + + +def _is_time_index(index) -> bool: + from xarray.coding.cftimeindex import CFTimeIndex - return index + return isinstance(index, pd.DatetimeIndex | CFTimeIndex) -def interp_na( - self, - dim: Hashable | None = None, - use_coordinate: bool | str = True, +def _interp_na_all( + obj: T_Xarray, + dim: Hashable, method: InterpOptions = "linear", - limit: int | None = None, - max_gap: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, + use_coordinate: bool | Hashable = True, keep_attrs: bool | None = None, **kwargs, -): - """Interpolate values according to different methods.""" - from xarray.coding.cftimeindex import CFTimeIndex - - if dim is None: - raise NotImplementedError("dim is a required argument") - - if limit is not None: - valids = _get_valid_fill_mask(self, dim, limit) - - if max_gap is not None: - max_type = type(max_gap).__name__ - if not is_scalar(max_gap): - raise ValueError("max_gap must be a scalar.") - - if ( - dim in self._indexes - and isinstance( - self._indexes[dim].to_pandas_index(), pd.DatetimeIndex | CFTimeIndex - ) - and use_coordinate - ): - # Convert to float - max_gap = timedelta_to_numeric(max_gap) - - if not use_coordinate: - if not isinstance(max_gap, Number | np.number): - raise TypeError( - f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." 
- ) - - # method - index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) +) -> T_Xarray: + """Interpolate all nan values, without restrictions regarding the gap size.""" + index = get_clean_interp_index(obj, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) interpolator = partial(func_interpolate_na, interp_class, **kwargs) @@ -383,27 +537,241 @@ def interp_na( warnings.filterwarnings("ignore", "invalid value", RuntimeWarning) arr = apply_ufunc( interpolator, - self, - index, + obj, + index.values, input_core_dims=[[dim], [dim]], output_core_dims=[[dim]], - output_dtypes=[self.dtype], + output_dtypes=[obj.dtype], dask="parallelized", vectorize=True, keep_attrs=keep_attrs, - ).transpose(*self.dims) + ).transpose(*obj.dims) + return arr + + +class GapMask(Generic[T_Xarray]): + """An object that allows for flexible masking of gaps. You should use DataArray.fill_gaps() or Dataset.fill_gaps() to construct this object instead of constructing it directly.""" - if limit is not None: - arr = arr.where(valids) + # Attributes + # ---------- + _content: T_Xarray + _dim: Hashable + _use_coordinate: bool | Hashable + _limit: T_GapLength | None + _limit_direction: LimitDirectionOptions | None + _limit_area: LimitAreaOptions | None + _max_gap: T_GapLength | None - if max_gap is not None: - if dim not in self.coords: - raise NotImplementedError( - "max_gap not implemented for unlabeled coordinates yet." + def __init__( + self, + content: T_Xarray, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions | None = None, + limit_area: LimitAreaOptions | None = None, + max_gap: T_GapLength | None = None, + ) -> None: + """An object that allows for flexible masking of gaps. You should use DataArray.fill_gaps() or Dataset.fill_gaps() to construct this object instead of calling this constructor directly. 
+ + Parameters + ---------- + content : DataArray or Dataset + The object to be masked. + + Other: + See xarray.DataArray.fill_gaps or xarray.Dataset.fill_gaps for an explanation of the remaining parameters. + + See Also + -------- + xarray.DataArray.fill_gaps + xarray.Dataset.fill_gaps + + """ + self._content = content + self._dim = dim + self._use_coordinate = use_coordinate + self._limit = limit + self._limit_direction = limit_direction + self._limit_area = limit_area + self._max_gap = max_gap + + def _get_mask(self, limit_direction) -> T_Xarray | None: + mask = _get_gap_mask( + obj=self._content, + dim=self._dim, + limit=self._limit, + limit_direction=limit_direction, + limit_area=self._limit_area, + limit_use_coordinate=self._use_coordinate, + max_gap=self._max_gap, + max_gap_use_coordinate=self._use_coordinate, + ) + return mask + + def _apply_mask(self, filled: T_Xarray, mask: T_Xarray | None) -> T_Xarray: + if mask is not None: + filled = filled.where(~mask, other=self._content) + return filled + + def get_mask(self) -> T_Xarray | None: + """Return the gap mask. + + Returns + ------- + mask : DataArray or Dataset + Boolean gap mask, created based on the parameters passed to DataArray.fill_gaps() or Dataset.fill_gaps(). True values indicate remaining gaps. + """ + limit_direction = self._limit_direction + if limit_direction is None: + limit_direction = "both" + mask = self._get_mask(limit_direction) + return mask + + def ffill(self) -> T_Xarray: + """Partly fill missing values in this object's data by applying ffill to all unmasked values. + + Parameters + ---------- + + Returns + ------- + filled : same type as caller + New object with ffill applied to all unmasked values. + + See Also + -------- + DataArray.ffill + Dataset.ffill + """ + if self._limit_direction is None or self._limit_direction == "forward": + limit_direction = "forward" + else: + raise ValueError( + f"limit_direction='{self._limit_direction}' is not allowed with ffill, must be 'forward'." 
+ ) + mask = self._get_mask(limit_direction) + return self._apply_mask(self._content.ffill(self._dim), mask) + + def bfill(self) -> T_Xarray: + """Partly fill missing values in this object's data by applying bfill to all unmasked values. + + Returns + ------- + filled : same type as caller + New object with bfill applied to all unmasked values. + + See Also + -------- + DataArray.bfill + Dataset.bfill + """ + if self._limit_direction is None or self._limit_direction == "backward": + limit_direction = "backward" + else: + raise ValueError( + f"limit_direction='{self._limit_direction}' is not allowed with bfill, must be 'backward'." ) - nan_block_lengths = _get_nan_block_lengths(self, dim, index) - arr = arr.where(nan_block_lengths <= max_gap) + mask = self._get_mask(limit_direction) + return self._apply_mask(self._content.bfill(self._dim), mask) + + def fillna(self, value) -> T_Xarray: + """Partly fill missing values in this object's data by applying fillna to all unmasked values. + + Parameters + ---------- + value : scalar, ndarray or DataArray + Used to fill all unmasked values. If the + argument is a DataArray, it is first aligned with (reindexed to) + this array. + + + Returns + ------- + filled : same type as caller + New object with fillna applied to all unmasked values. + + See Also + -------- + DataArray.fillna + Dataset.fillna + """ + mask = self.get_mask() + return self._apply_mask(self._content.fillna(value), mask) + + def interpolate_na( + self, + dim: Hashable | None = None, + method: InterpOptions = "linear", + use_coordinate: bool | Hashable = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> T_Xarray: + """Partly fill missing values in this object's data by applying interpolate_na to all unmasked values. + + Parameters + ---------- + See DataArray.interpolate_na and Dataset.interpolate_na for explanation of parameters. + + Returns + ------- + filled : same type as caller + New object with interpolate_na applied to all unmasked values. 
+ + + See Also + -------- + DataArray.interpolate_na + Dataset.interpolate_na + """ + if dim is None: + dim = self._dim + mask = self.get_mask() + return self._apply_mask( + self._content.interpolate_na( + dim=dim, + method=method, + use_coordinate=use_coordinate, + limit=None, + max_gap=None, + keep_attrs=keep_attrs, + **kwargs, + ), + mask, + ) + + +def interp_na( + obj: T_Xarray, + dim: Hashable, + method: InterpOptions = "linear", + use_coordinate: bool | Hashable = True, + limit: T_GapLength | None = None, + max_gap: T_GapLength | None = None, + keep_attrs: bool | None = None, + **kwargs, +): + """Interpolate values according to different methods.""" + # This was the original behaviour of interp_na and is kept for backward compatibility + # Limit=None: Fill everything, including both boundaries + # Limit!=None: Do forward interpolation until limit + limit_use_coordinate = False + limit_direction: LimitDirectionOptions = "both" if limit is None else "forward" + limit_area = None + mask = _get_gap_mask( + obj, + dim, + limit, + limit_direction, + limit_area, + limit_use_coordinate, + max_gap, + use_coordinate, + ) + arr = _interp_na_all(obj, dim, method, use_coordinate, keep_attrs, **kwargs) + if mask is not None: + arr = arr.where(~mask) return arr @@ -563,20 +931,6 @@ def _get_interpolator_nd(method, **kwargs): return interp_class, kwargs -def _get_valid_fill_mask(arr, dim, limit): - """helper function to determine values that can be filled when limit is not - None""" - kw = {dim: limit + 1} - # we explicitly use construct method to avoid copy. - new_dim = utils.get_temp_dimname(arr.dims, "_window") - return ( - arr.isnull() - .rolling(min_periods=1, **kw) - .construct(new_dim, fill_value=False) - .sum(new_dim, skipna=False) - ) <= limit - - def _localize(obj: T, indexes_coords: SourceDest) -> tuple[T, SourceDest]: """Speed up for linear and nearest neighbor method. 
Only consider a subspace that is needed for the interpolation diff --git a/xarray/core/types.py b/xarray/core/types.py index 186738ed718..7fc6eb516a3 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -120,6 +120,7 @@ DatetimeLike: TypeAlias = ( pd.Timestamp | datetime.datetime | np.datetime64 | CFTimeDatetime ) +TimedeltaLike: TypeAlias = pd.Timedelta | datetime.timedelta | np.timedelta64 class Alignable(Protocol): @@ -251,6 +252,9 @@ def copy( ] InterpnOptions = Literal["linear", "nearest", "slinear", "cubic", "quintic", "pchip"] InterpOptions = Union[Interp1dOptions, InterpolantOptions, InterpnOptions] +T_GapLength = Union[int, float, str, TimedeltaLike] +LimitDirectionOptions = Literal["forward", "backward", "both"] +LimitAreaOptions = Literal["inside", "outside"] DatetimeUnitOptions = Literal[ "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", None diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index bfcfda19df3..9ca86a1170f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4254,11 +4254,13 @@ def test_rank(self) -> None: def test_polyfit(self, use_dask, use_datetime) -> None: if use_dask and not has_dask: pytest.skip("requires dask") - xcoord = xr.DataArray( + da_times = xr.DataArray( pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" ) - x = xr.core.missing.get_clean_interp_index(xcoord, "x") - if not use_datetime: + x = xr.core.missing.get_clean_interp_index(da_times, "x").values + if use_datetime: + xcoord = da_times.values + else: xcoord = x da_raw = DataArray( diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index eb21cca0861..d64426e255b 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -11,6 +11,12 @@ NumpyInterpolator, ScipyInterpolator, SplineInterpolator, + _get_gap_dist_to_left_edge, + _get_gap_dist_to_right_edge, + _get_gap_left_edge, + _get_gap_right_edge, + 
_get_limit_area_mask, + _get_limit_fill_mask, _get_nan_block_lengths, get_clean_interp_index, ) @@ -98,37 +104,32 @@ def make_interpolate_example_data(shape, frac_nan, seed=12345, non_uniform=False @pytest.mark.parametrize( "method", ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"] ) +@pytest.mark.parametrize("dim", ["time", "x"]) +@pytest.mark.parametrize("shape", [(8, 8), (1, 20), (20, 1), (100, 100)]) +@pytest.mark.parametrize("frac_nan", [0, 0.5, 1]) @requires_scipy -def test_interpolate_pd_compat(method, fill_value) -> None: - shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] - frac_nans = [0, 0.5, 1] - - for shape, frac_nan in itertools.product(shapes, frac_nans): - da, df = make_interpolate_example_data(shape, frac_nan) - - for dim in ["time", "x"]: - actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) - # need limit_direction="both" here, to let pandas fill - # in both directions instead of default forward direction only - expected = df.interpolate( - method=method, - axis=da.get_axis_num(dim), - limit_direction="both", - fill_value=fill_value, - ) - - if method == "linear": - # Note, Pandas does not take left/right fill_value into account - # for the numpy linear methods. 
- # see https://github.com/pandas-dev/pandas/issues/55144 - # This aligns the pandas output with the xarray output - fixed = expected.values.copy() - fixed[pd.isnull(actual.values)] = np.nan - fixed[actual.values == fill_value] = fill_value - else: - fixed = expected.values +def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None: + da, df = make_interpolate_example_data(shape, frac_nan) + + actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + fill_value=fill_value, + limit_direction="both", + ) - np.testing.assert_allclose(actual.values, fixed) + if method == "linear": + # Note, Pandas does not take left/right fill_value into account + # for the numpy linear methods. + # see https://github.com/pandas-dev/pandas/issues/55144 + # This aligns the pandas output with the xarray output + fixed = expected.values.copy() + fixed[pd.isnull(actual.values)] = np.nan + fixed[actual.values == fill_value] = fill_value + else: + fixed = expected.values + np.testing.assert_allclose(actual.values, fixed) @requires_scipy @@ -193,6 +194,86 @@ def test_interpolate_pd_compat_polynomial(): np.testing.assert_allclose(actual.values, expected.values) +@requires_scipy +def test_interpolate_pd_compat_limits(): + shapes = [(7, 7)] + method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' + frac_nan = 0.5 + limits = [ + None, + 1, + 3, + ] # pandas 2.1.4 is currently unable to handle coordinate based limits! 
+ limit_directions = ["forward", "backward"] + limit_areas = [None, "outside", "inside"] + + for shape, limit, limit_direction, limit_area in itertools.product( + shapes, limits, limit_directions, limit_areas + ): + da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) + for dim in ["time", "x"]: + actual = da.fill_gaps( + dim=dim, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + use_coordinate=False, + ).interpolate_na( + dim=dim, + method=method, + use_coordinate=True, + fill_value="extrapolate", + ) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value="extrapolate", + ) + np.testing.assert_allclose(actual.values, expected.values) + + +@requires_bottleneck +def test_fill_pd_compat_limits(): + shapes = [(7, 7)] + frac_nan = 0.5 + limits = [ + None, + 1, + 3, + ] # pandas 2.1.4 is currently unable to handle coordinate based limits! 
+ limit_areas = [None, "outside", "inside"] + + for shape, limit, limit_area in itertools.product(shapes, limits, limit_areas): + da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) + for dim in ["time", "x"]: + masked = da.fill_gaps( + dim=dim, + limit=limit, + limit_area=limit_area, + use_coordinate=False, + ) + filled_forward = masked.ffill() + filled_backward = masked.bfill() + if ( + pd.__version__ >= "2.2.0" + ): # limit_area was introduced in pandas ffill in v2.2.0 + expected_forward = df.ffill( + axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area + ) + expected_backward = df.bfill( + axis=da.get_axis_num(dim), limit=limit, limit_area=limit_area + ) + np.testing.assert_allclose( + filled_forward.values, expected_forward.values + ) + np.testing.assert_allclose( + filled_backward.values, expected_backward.values + ) + + @requires_scipy def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) @@ -201,12 +282,6 @@ def test_interpolate_unsorted_index_raises(): expected.interpolate_na(dim="x", method="index") -def test_interpolate_no_dim_raises(): - da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") - with pytest.raises(NotImplementedError, match=r"dim is a required argument"): - da.interpolate_na(method="linear") - - def test_interpolate_invalid_interpolator_raises(): da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") with pytest.raises(ValueError, match=r"not a valid"): @@ -307,18 +382,53 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): - da = xr.DataArray( - np.array([1, 2, np.nan, np.nan, np.nan, 6], dtype=np.float64), dims="x" - ) + n = np.nan + times = pd.date_range("2000-01-01", periods=9, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n, n, 3, n, n, 6, n, 8, n], dims=["y"], coords=coords) + + actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") + # With no 
limit, all gaps should be interpolated (forward+backward, including boundaries). Introduced in xarray due to a bug (GH7665), but kept for backward compatibility. With limit, do only forward interpolation.
+    expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9])
+    assert_equal(actual, expected)
 
-    actual = da.interpolate_na(dim="x", limit=None)
-    assert actual.isnull().sum() == 0
+    actual = da.interpolate_na(dim="y", limit=None, max_gap=2, fill_value="extrapolate")
+    expected = da.copy(data=[1, 2, 3, n, n, 6, 7, 8, 9])
+    assert_equal(actual, expected)
 
-    actual = da.interpolate_na(dim="x", limit=2)
-    expected = xr.DataArray(
-        np.array([1, 2, 3, 4, np.nan, 6], dtype=np.float64), dims="x"
+    actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate")
+    expected = da.copy(data=[n, n, 3, 4, n, 6, 7, 8, 9])
+    assert_equal(actual, expected)
+
+    actual = da.interpolate_na(dim="y", limit=1, max_gap=2, fill_value="extrapolate")
+    expected = da.copy(data=[n, n, 3, n, n, 6, 7, 8, 9])
+    assert_equal(actual, expected)
+
+
+@requires_bottleneck
+def test_interpolate_double_coordinate():
+    # Check if max_gap is able to handle string coordinate names
+    # Limit is always referring to an index
+    n = np.nan
+    da = xr.DataArray(
+        [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]],
+        dims=["x", "y"],
+        coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)},
+    )
+    actual = da.interpolate_na(
+        dim="y", limit=1, max_gap=4, use_coordinate="y1", fill_value="extrapolate"
+    )
+    expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, 2, n, n, 5, 6, n]])
+    assert_equal(actual, expected)
+    actual = da.interpolate_na(
+        "y",
+        limit=2,
+        max_gap=4,
+        use_coordinate="y2",
+        fill_value="extrapolate",
+    )
+    expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, 7]])
     assert_equal(actual, expected)
 
 
@@ -563,6 +673,118 @@ def test_bfill_dataset(ds):
+@requires_bottleneck
+def test_get_gap_left_edge():
+    n = np.nan
+    arr = [
+        [n, 1, n, n, n, n, n, n, 4],
+        [n, n, 
n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 3, 3, 3, 3, 3, 3, 3, 24], [n, n, n, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + actual = _get_gap_left_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[0, 3, 3, 3, 3, 3, 3, 3, 24], [0, 0, 0, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 2, 2, 2, 2, 2, 2, 2, 14], [n, n, n, 6, 6, 6, 10, 10, 10]] + ) + + +@requires_bottleneck +def test_get_gap_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, n, n]] + ) + assert_equal(actual, expected) + + actual = _get_gap_right_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, 24, 24]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[2, 2, 14, 14, 14, 14, 14, 14, 14], [6, 6, 6, 6, 10, 10, 10, n, n]] + ) + + +@requires_bottleneck +def 
test_get_gap_dist_to_left_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 0, 3, 6, 9, 12, 15, 18, 0], [n, n, n, 0, 3, 6, 0, 3, 6]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy(data=[[n, 0, 3, 4, 5, 6, 8, 10, 0], [n, n, n, 0, 1, 2, 0, 2, 4]]) + + +@requires_bottleneck +def test_get_gap_dist_to_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 0, 18, 15, 12, 9, 6, 3, 0], [9, 6, 3, 0, 6, 3, 0, n, n]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy(data=[[2, 0, 9, 8, 7, 6, 4, 2, 0], [5, 3, 0, 4, 3, 2, 0, n, n]]) + + @requires_bottleneck @pytest.mark.parametrize( "y, lengths_expected", @@ -578,7 +800,7 @@ def test_bfill_dataset(ds): ], ], ) -def test_interpolate_na_nan_block_lengths(y, lengths_expected): +def test_get_nan_block_lengths(y, lengths_expected): arr = [ [np.nan, 1, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 4], [np.nan, np.nan, np.nan, 1, 
np.nan, np.nan, 4, np.nan, np.nan], @@ -590,6 +812,121 @@ def test_interpolate_na_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +@requires_bottleneck +def test_get_nan_block_lengths_2d(): + n = np.nan + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + dims=["x", "y"], + coords={"x": np.arange(4), "y": np.arange(12) ** 2}, + ) + index = get_clean_interp_index(da, dim="y", use_coordinate=False) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [1, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + ] + ) + assert_equal(actual, expected_y) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [1, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + ] + ) + assert_equal(actual, expected_y) + + +@requires_bottleneck +def test_get_limit_fill_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_direction must be one of"): + _get_limit_fill_mask(da, dim="y", index=index, limit=3, limit_direction="cat") + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="forward" + ) + expected = da.copy(data=[[T, F, F, T, T, T, T, T, F], [T, T, T, F, F, F, F, F, T]]) + assert_equal(actual, expected) + + actual = 
_get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="backward" + ) + expected = da.copy(data=[[F, F, T, T, T, T, T, F, F], [T, T, F, F, F, F, F, T, T]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="both" + ) + expected = da.copy(data=[[F, F, F, T, T, T, T, F, F], [T, T, F, F, F, F, F, F, T]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="forward" + ) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, T, F, F, T, F, T, T]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="backward" + ) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, T, T, F, T, T]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="both" + ) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, F, T, F, T, T]]) + assert_equal(actual, expected) + + +@requires_bottleneck +def test_get_area_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, 5, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_area must be one of"): + _get_limit_area_mask(da, dim="y", index=index, limit_area="cow") + + actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="inside") + expected = da.copy(data=[[T, F, F, F, F, F, F, F, F], [T, T, T, F, F, F, F, T, T]]) + assert_equal(actual, expected) + + actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="outside") + expected = da.copy(data=[[F, F, T, T, F, T, T, T, F], [F, F, F, F, T, T, F, F, F]]) + assert_equal(actual, expected) + + @requires_cftime 
@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_get_clean_interp_index_cf_calendar(cf_da, calendar): @@ -638,6 +975,31 @@ def test_get_clean_interp_index_strict(index): assert clean.dtype == np.float64 +def test_get_clean_interp_index_double_coordinate(): + da = xr.DataArray( + np.ones((2, 7)), + dims=["x", "y"], + coords={ + "x": ("x", [10, 20]), + "y1": ("y", np.arange(7) * 2), + "y2": ("y", np.arange(7) * 3), + }, + ) + with pytest.raises(ValueError, match=r"not a valid dimension"): + get_clean_interp_index(da, "y1", use_coordinate=True) + + actual = get_clean_interp_index(da, "y", use_coordinate=True) + expected = xr.Variable(["y"], np.arange(7)) + assert_equal(actual, expected) + + actual = get_clean_interp_index(da, "y", use_coordinate="y1") + expected = xr.Variable(["y"], np.arange(7) * 2) + assert_equal(actual, expected) + + with pytest.raises(ValueError, match=r"must have dimension"): + get_clean_interp_index(da, "x", use_coordinate="y1") + + @pytest.fixture def da_time(): return xr.DataArray( @@ -647,11 +1009,6 @@ def da_time(): def test_interpolate_na_max_gap_errors(da_time): - with pytest.raises( - NotImplementedError, match=r"max_gap not implemented for unlabeled coordinates" - ): - da_time.interpolate_na("t", max_gap=1) - with pytest.raises(ValueError, match=r"max_gap must be a scalar."): da_time.interpolate_na("t", max_gap=(1,)) @@ -690,12 +1047,16 @@ def test_interpolate_na_max_gap_time_specifier( @pytest.mark.parametrize( "coords", [ - pytest.param(None, marks=pytest.mark.xfail()), + None, {"x": np.arange(4), "y": np.arange(12)}, ], ) -def test_interpolate_na_2d(coords): +def test_interpolate_na_max_gap_2d(coords): n = np.nan + if coords is None: + use_coordinate = False + else: + use_coordinate = True da = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -707,18 +1068,22 @@ def test_interpolate_na_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", max_gap=2) + actual = da.interpolate_na( + "y", 
use_coordinate=use_coordinate, max_gap=2, fill_value="extrapolate" + ) expected_y = da.copy( data=[ - [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y) - actual = da.interpolate_na("y", max_gap=1, fill_value="extrapolate") + actual = da.interpolate_na( + "y", use_coordinate=use_coordinate, max_gap=1, fill_value="extrapolate" + ) expected_y_extra = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], @@ -729,7 +1094,9 @@ def test_interpolate_na_2d(coords): ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", max_gap=3) + actual = da.interpolate_na( + "x", use_coordinate=use_coordinate, max_gap=3, fill_value="extrapolate" + ) expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -743,6 +1110,34 @@ def test_interpolate_na_2d(coords): assert_equal(actual, expected_x) +@requires_bottleneck +def test_interpolate_na_limit_2d(): + n = np.nan + times = pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + actual = da.interpolate_na("time", limit=1, fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + [n, n, 3, 4, n, 6, 7, n, n, 10, 11, n], + [n, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + + @requires_scipy def test_interpolators_complex_out_of_bounds(): """Ensure complex nans are used for complex data""" @@ -762,3 +1157,322 @@ def test_interpolators_complex_out_of_bounds(): f = interpolator(xi, 
yi, method=method) actual = f(x) assert_array_equal(actual, expected) + + +####Masking Functionality +@requires_bottleneck +def test_fill_gaps_limit(): + n = np.nan + times = pd.date_range("2000-01-01", periods=8, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + + mask = da.fill_gaps(dim="y", limit_direction="backward") + with pytest.raises( + ValueError, + match=r"limit_direction='backward' is not allowed with ffill, must be 'forward'", + ): + mask.ffill() + mask = da.fill_gaps(dim="y", limit_direction="both") + with pytest.raises( + ValueError, + match=r"limit_direction='both' is not allowed with ffill, must be 'forward'", + ): + mask.ffill() + mask = da.fill_gaps(dim="y", limit_direction="forward") + with pytest.raises( + ValueError, + match=r"limit_direction='forward' is not allowed with bfill, must be 'backward'", + ): + mask.bfill() + + actual = da.fill_gaps(dim="y").ffill() + expected = da.copy(data=[n, n, 2, 2, 2, 5, 5, 5]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit_direction="forward").ffill() + expected = da.copy(data=[n, n, 2, 2, 2, 5, 5, 5]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=1).bfill() + expected = da.copy(data=[n, 2, 2, n, 5, 5, n, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=1, limit_direction="backward").bfill() + expected = da.copy(data=[n, 2, 2, n, 5, 5, n, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=None).interpolate_na( + dim="y", fill_value="extrapolate" + ) + expected = da.copy(data=[0, 1, 2, 3, 4, 5, 6, 7]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim="y", limit=1).interpolate_na( + dim="y", fill_value="extrapolate" + ) + expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + use_coordinate="yt", + ).interpolate_na(dim="y", 
fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + limit_direction="backward", + use_coordinate="yt", + ).interpolate_na(dim="y", fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, n, 4, 5, n, n]) + assert_equal(actual, expected) + + +@requires_bottleneck +def test_mask_gap_limit_2d(): + n = np.nan + times = pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + mask = da.fill_gaps("time", limit=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + [n, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + actual = mask.ffill() + expected = da.copy( + data=[ + [1, 2, 3, 4, 4, 6, 6, n, n, 10, 11, 11], + [n, n, 3, 3, n, 6, 6, n, n, 10, 10, n], + [n, 2, 3, 4, 4, 6, 6, n, n, 10, 11, 11], + ] + ) + assert_equal(actual, expected) + actual = mask.bfill() + expected = da.copy( + data=[ + [1, 2, 3, 4, 6, 6, n, n, 10, 10, 11, n], + [n, 3, 3, n, 6, 6, n, n, 10, 10, n, n], + [2, 2, 3, 4, 6, 6, n, n, 10, 10, 11, n], + ] + ) + assert_equal(actual, expected) + actual = mask.fillna(0) + expected = da.copy( + data=[ + [1, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + [n, 0, 3, 0, 0, 6, 0, n, 0, 10, 0, n], + [0, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + ] + ) + assert_equal(actual, expected) + + actual = da.fill_gaps( + "time", limit=2, use_coordinate=False, limit_direction="backward" + ).interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, n, n], + 
[1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n],
+        ]
+    )
+    assert_equal(actual, expected)
+
+    actual = da.fill_gaps(
+        "time",
+        limit=pd.Timedelta("3h"),
+        limit_direction="backward",
+        limit_area="inside",
+        use_coordinate=True,
+    ).interpolate_na(
+        "time",
+        fill_value="extrapolate",
+    )
+    expected = da.copy(
+        data=[
+            [1, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n],
+            [n, n, 3, n, 5, 6, n, n, 9, 10, n, n],
+            [n, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n],
+        ]
+    )
+    assert_equal(actual, expected)
+    actual = da.fill_gaps(
+        "time",
+        limit=pd.Timedelta("3h"),
+        limit_direction="backward",
+        limit_area="outside",
+        use_coordinate=True,
+    ).interpolate_na(
+        "time",
+        fill_value="extrapolate",
+    )
+    expected = da.copy(
+        data=[
+            [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n],
+            [n, 2, 3, n, n, 6, n, n, n, 10, n, n],
+            [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n],
+        ]
+    )
+    assert_equal(actual, expected)
+
+    actual = da.fill_gaps(
+        "time",
+        limit=None,
+        limit_direction="backward",
+        limit_area="outside",
+        use_coordinate=True,
+    ).interpolate_na(
+        "time",
+        fill_value=8,
+    )
+    expected = da.copy(
+        data=[
+            [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n],
+            [8, 8, 3, n, n, 6, n, n, n, 10, n, n],
+            [8, 2, 3, 4, n, 6, n, n, n, 10, 11, n],
+        ]
+    )
+    assert_equal(actual, expected)
+
+    da = xr.DataArray(
+        [
+            [1, 1, n, n, 1, 1],
+            [n, 2, 2, n, 2, n],
+            [n, n, 3, 3, n, n],
+            [n, n, n, 4, 4, 4],
+        ],
+        dims=["x", "y"],
+        coords={"x": np.arange(4) * 2},
+    )
+    mask = da.fill_gaps(
+        dim="x",
+        limit=3,
+        limit_direction="forward",
+        limit_area=None,
+        use_coordinate=True,
+    )
+    actual = mask.interpolate_na(
+        "x",
+        fill_value="extrapolate",
+        method="linear",
+    )
+    expected = da.copy(
+        data=[
+            [1, 1, n, n, 1, 1],
+            [n, 2, 2, n, 2, 2],
+            [n, 3, 3, 3, 3, n],
+            [n, n, 4, 4, 4, 4],
+        ]
+    )
+    assert_equal(actual, expected)
+    # Test: Dim argument from mask should be used
+    actual = mask.interpolate_na(
+        fill_value="extrapolate",
+        method="linear",
+    )
+    expected = da.copy(
+        data=[
+            [1, 1, n, n, 1, 1],
+            [n, 2, 2, n, 2, 2],
+            [n, 3, 3, 3, 3, n],
+ [n, n, 4, 4, 4, 4], + ] + ) + assert_equal(actual, expected) + + +@requires_bottleneck +def test_mask_gap_max_gap_2d(): + n = np.nan + times = pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + mask = da.fill_gaps("time", max_gap=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + mask = da.fill_gaps("time", max_gap=2, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + + mask = da.fill_gaps("time", max_gap=pd.Timedelta("3h"), use_coordinate=True) + actual = mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + + +@requires_bottleneck +def test_mask_double_coordinate(): + # Check if limit and max_gap are able to handle string coordinate names + n = np.nan + da = xr.DataArray( + [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], + dims=["x", "y"], + coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, + ) + actual = da.fill_gaps( + "y", + limit=1, + max_gap=4, + use_coordinate="y1", + ).interpolate_na("y", fill_value="extrapolate") + expected = da.copy(data=[[1, 2, 3, 4, 5, 6, 7], [1, 2, n, 4, 5, 6, n]]) + assert_equal(actual, expected) + + actual = 
da.fill_gaps("y", limit=2, max_gap=4, use_coordinate="y2").interpolate_na( + "y", + fill_value="extrapolate", + ) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + assert_equal(actual, expected)