diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 950a5d16273..1da89ff9a82 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -34,7 +34,7 @@
 )
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
-from xarray.core import indexing
+from xarray.core import dtypes, indexing
 from xarray.core.combine import (
     _infer_concat_order_from_positions,
     _nested_combine,
@@ -49,6 +49,13 @@
 from xarray.core.utils import is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
 from xarray.namedarray.parallelcompat import guess_chunkmanager
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     try:
@@ -1402,14 +1409,16 @@ def open_mfdataset(
         | Sequence[Index]
         | None
     ) = None,
-    compat: CompatOptions = "no_conflicts",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
     preprocess: Callable[[Dataset], Dataset] | None = None,
     engine: T_Engine | None = None,
-    data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
-    coords="different",
+    data_vars: Literal["all", "minimal", "different"]
+    | list[str]
+    | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords=_COORDS_DEFAULT,
     combine: Literal["by_coords", "nested"] = "by_coords",
     parallel: bool = False,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
     **kwargs,
@@ -1596,9 +1605,6 @@ def open_mfdataset(
 
     paths1d: list[str | ReadBuffer]
     if combine == "nested":
-        if isinstance(concat_dim, str | DataArray) or concat_dim is None:
-            concat_dim = [concat_dim]  # type: ignore[assignment]
-
         # This creates a flat list which is easier to iterate over, whilst
         # encoding the originally-supplied structure as "ids".
         # The "ids" are not used at all if combine='by_coords`.
@@ -1647,13 +1653,14 @@ def open_mfdataset(
         # along each dimension, using structure given by "ids"
         combined = _nested_combine(
             datasets,
-            concat_dims=concat_dim,
+            concat_dim=concat_dim,
             compat=compat,
             data_vars=data_vars,
             coords=coords,
             ids=ids,
             join=join,
             combine_attrs=combine_attrs,
+            fill_value=dtypes.NA,
         )
     elif combine == "by_coords":
         # Redo ordering from coordinates, ignoring how they were ordered
diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py
index d6cdd45bb49..dd7edbd88c2 100644
--- a/xarray/core/alignment.py
+++ b/xarray/core/alignment.py
@@ -2,6 +2,7 @@
 
 import functools
 import operator
+import warnings
 from collections import defaultdict
 from collections.abc import Callable, Hashable, Iterable, Mapping
 from contextlib import suppress
@@ -22,6 +23,7 @@
 from xarray.core.types import T_Alignable
 from xarray.core.utils import is_dict_like, is_full_slice
 from xarray.core.variable import Variable, as_compatible_data, calculate_dimensions
+from xarray.util.deprecation_helpers import CombineKwargDefault
 
 if TYPE_CHECKING:
     from xarray.core.dataarray import DataArray
@@ -418,12 +420,35 @@ def align_indexes(self) -> None:
                 else:
                     need_reindex = False
                 if need_reindex:
+                    if (
+                        isinstance(self.join, CombineKwargDefault)
+                        and self.join != "exact"
+                    ):
+                        warnings.warn(
+                            self.join.warning_message(
+                                "This change will result in the following ValueError:"
+                                "cannot be aligned with join='exact' because "
+                                "index/labels/sizes are not equal along "
+                                "these coordinates (dimensions): "
+                                + ", ".join(
+                                    f"{name!r} {dims!r}" for name, dims in key[0]
+                                ),
+                                recommend_set_options=False,
+                            ),
+                            category=FutureWarning,
+                            stacklevel=2,
+                        )
                     if self.join == "exact":
                         raise ValueError(
                             "cannot align objects with join='exact' where "
                             "index/labels/sizes are not equal along "
                             "these coordinates (dimensions): "
                             + ", ".join(f"{name!r} {dims!r}" for name, dims in key[0])
+                            + (
+                                self.join.error_message()
+                                if isinstance(self.join, CombineKwargDefault)
+                                else ""
+                            )
                         )
                     joiner = self._get_index_joiner(index_cls)
                     joined_index = joiner(matching_indexes)
@@ -886,7 +911,7 @@ def align(
 
 def deep_align(
     objects: Iterable[Any],
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index f02d046fff6..b9a0d9f614a 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -12,6 +12,13 @@
 from xarray.core.dataset import Dataset
 from xarray.core.merge import merge
 from xarray.core.utils import iterate_nested
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.types import (
@@ -200,12 +207,12 @@ def _check_shape_tile_ids(combined_tile_ids):
 def _combine_nd(
     combined_ids,
     concat_dims,
-    data_vars="all",
-    coords="different",
-    compat: CompatOptions = "no_conflicts",
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    data_vars,
+    coords,
+    compat: CompatOptions | CombineKwargDefault,
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Combines an N-dimensional structure of datasets into one by applying a
@@ -263,9 +270,9 @@ def _combine_all_along_first_dim(
     data_vars,
     coords,
     compat: CompatOptions,
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     # Group into lines of datasets which must be combined along dim
     grouped = groupby_defaultdict(list(combined_ids.items()), key=_new_tile_id)
@@ -276,7 +283,14 @@ def _combine_all_along_first_dim(
         combined_ids = dict(sorted(group))
         datasets = combined_ids.values()
         new_combined_ids[new_id] = _combine_1d(
-            datasets, dim, compat, data_vars, coords, fill_value, join, combine_attrs
+            datasets,
+            concat_dim=dim,
+            compat=compat,
+            data_vars=data_vars,
+            coords=coords,
+            fill_value=fill_value,
+            join=join,
+            combine_attrs=combine_attrs,
         )
     return new_combined_ids
 
@@ -284,12 +298,12 @@ def _combine_all_along_first_dim(
 def _combine_1d(
     datasets,
     concat_dim,
-    compat: CompatOptions = "no_conflicts",
-    data_vars="all",
-    coords="different",
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    compat: CompatOptions,
+    data_vars,
+    coords,
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Applies either concat or merge to 1D list of datasets depending on value
@@ -338,18 +352,21 @@ def _new_tile_id(single_id_ds_pair):
 
 def _nested_combine(
     datasets,
-    concat_dims,
+    concat_dim,
     compat,
     data_vars,
     coords,
     ids,
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     if len(datasets) == 0:
         return Dataset()
 
+    if isinstance(concat_dim, str | DataArray) or concat_dim is None:
+        concat_dim = [concat_dim]  # type: ignore[assignment]
+
     # Arrange datasets for concatenation
     # Use information from the shape of the user input
     if not ids:
@@ -366,7 +383,7 @@ def _nested_combine(
     # Apply series of concatenate or merge operations along each dimension
     combined = _combine_nd(
         combined_ids,
-        concat_dims,
+        concat_dims=concat_dim,
         compat=compat,
         data_vars=data_vars,
         coords=coords,
@@ -384,11 +401,11 @@ def _nested_combine(
 def combine_nested(
     datasets: DATASET_HYPERCUBE,
     concat_dim: str | DataArray | None | Sequence[str | DataArray | pd.Index | None],
-    compat: str = "no_conflicts",
-    data_vars: str = "all",
-    coords: str = "different",
+    compat: str | CombineKwargDefault = _COMPAT_DEFAULT,
+    data_vars: str | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: str | CombineKwargDefault = _COORDS_DEFAULT,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "drop",
 ) -> Dataset:
     """
@@ -581,13 +598,10 @@ def combine_nested(
     if mixed_datasets_and_arrays:
         raise ValueError("Can't combine datasets with unnamed arrays.")
 
-    if isinstance(concat_dim, str | DataArray) or concat_dim is None:
-        concat_dim = [concat_dim]
-
     # The IDs argument tells _nested_combine that datasets aren't yet sorted
     return _nested_combine(
         datasets,
-        concat_dims=concat_dim,
+        concat_dim=concat_dim,
         compat=compat,
         data_vars=data_vars,
         coords=coords,
@@ -619,12 +633,12 @@ def groupby_defaultdict(
 def _combine_single_variable_hypercube(
     datasets,
-    fill_value=dtypes.NA,
-    data_vars="all",
-    coords="different",
-    compat: CompatOptions = "no_conflicts",
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "no_conflicts",
+    fill_value,
+    data_vars,
+    coords,
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Attempt to combine a list of Datasets into a hypercube using their
@@ -678,11 +692,13 @@ def _combine_single_variable_hypercube(
 
 def combine_by_coords(
     data_objects: Iterable[Dataset | DataArray] = [],
-    compat: CompatOptions = "no_conflicts",
-    data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
-    coords: str = "different",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+    data_vars: Literal["all", "minimal", "different"]
+    | list[str]
+    | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: str | CombineKwargDefault = _COORDS_DEFAULT,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "no_conflicts",
 ) -> Dataset | DataArray:
     """
diff --git a/xarray/core/concat.py b/xarray/core/concat.py
index b824aabbb23..846f52bae17 100644
--- a/xarray/core/concat.py
+++ b/xarray/core/concat.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import warnings
 from collections.abc import Hashable, Iterable
 from typing import TYPE_CHECKING, Any, Union, overload
 
@@ -20,6 +21,13 @@
 from xarray.core.types import T_DataArray, T_Dataset, T_Variable
 from xarray.core.variable import Variable
 from xarray.core.variable import concat as concat_vars
+from xarray.util.deprecation_helpers import (
+    _COMPAT_CONCAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.types import (
@@ -37,12 +45,12 @@
 def concat(
     objs: Iterable[T_Dataset],
     dim: Hashable | T_Variable | T_DataArray | pd.Index | Any,
-    data_vars: T_DataVars = "all",
-    coords: ConcatOptions | list[Hashable] = "different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions: Iterable[Iterable[int]] | None = None,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ) -> T_Dataset: ...
@@ -52,12 +60,12 @@ def concat(
 def concat(
     objs: Iterable[T_DataArray],
     dim: Hashable | T_Variable | T_DataArray | pd.Index | Any,
-    data_vars: T_DataVars = "all",
-    coords: ConcatOptions | list[Hashable] = "different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions: Iterable[Iterable[int]] | None = None,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | None = None,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ) -> T_DataArray: ...
@@ -66,12 +74,12 @@ def concat(
 def concat(
     objs,
     dim,
-    data_vars: T_DataVars = "all",
-    coords="different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions=None,
     fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ):
@@ -255,7 +263,9 @@ def concat(
     except StopIteration as err:
         raise ValueError("must supply at least one object to concatenate") from err
 
-    if compat not in set(_VALID_COMPAT) - {"minimal"}:
+    if not isinstance(compat, CombineKwargDefault) and compat not in set(
+        _VALID_COMPAT
+    ) - {"minimal"}:
         raise ValueError(
             f"compat={compat!r} invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'"
         )
@@ -320,7 +330,14 @@ def _calc_concat_dim_index(
     return dim, index
 
 
-def _calc_concat_over(datasets, dim, dim_names, data_vars: T_DataVars, coords, compat):
+def _calc_concat_over(
+    datasets,
+    dim,
+    dim_names,
+    data_vars: T_DataVars,
+    coords,
+    compat,
+):
     """
     Determine which dataset variables need to be concatenated in the result,
     """
@@ -344,11 +361,32 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars: T_DataVars, coords, c
         concat_dim_lengths.append(ds.sizes.get(dim, 1))
 
     def process_subset_opt(opt, subset):
-        if isinstance(opt, str):
+        original = set(concat_over)
+        compat_str = (
+            compat._value if isinstance(compat, CombineKwargDefault) else compat
+        )
+        if isinstance(opt, str | CombineKwargDefault):
             if opt == "different":
+                if isinstance(compat, CombineKwargDefault) and compat != "override":
+                    if not isinstance(opt, CombineKwargDefault):
+                        warnings.warn(
+                            compat.warning_message(
+                                "This change will result in the following ValueError: "
+                                f"Cannot specify both {subset}='different' and compat='override'.",
+                                recommend_set_options=False,
+                            ),
+                            category=FutureWarning,
+                            stacklevel=2,
+                        )
+                if compat == "override":
                     raise ValueError(
                         f"Cannot specify both {subset}='different' and compat='override'."
+                        + (
+                            compat.error_message()
+                            if isinstance(compat, CombineKwargDefault)
+                            else ""
+                        )
                     )
                 # all nonindexes that are not the same in each dataset
                 for k in getattr(datasets[0], subset):
@@ -372,7 +410,7 @@ def process_subset_opt(opt, subset):
                         # first check without comparing values i.e. no computes
                         for var in variables[1:]:
-                            equals[k] = getattr(variables[0], compat)(
+                            equals[k] = getattr(variables[0], compat_str)(
                                 var, equiv=lazy_array_equiv
                             )
                             if equals[k] is not True:
@@ -395,7 +433,7 @@ def process_subset_opt(opt, subset):
                             for ds_rhs in datasets[1:]:
                                 v_rhs = ds_rhs.variables[k].compute()
                                 computed.append(v_rhs)
-                                if not getattr(v_lhs, compat)(v_rhs):
+                                if not getattr(v_lhs, compat_str)(v_rhs):
                                     concat_over.add(k)
                                     equals[k] = False
                                     # computed variables are not to be re-computed
@@ -418,6 +456,20 @@ def process_subset_opt(opt, subset):
                 pass
             else:
                 raise ValueError(f"unexpected value for {subset}: {opt}")
+
+            if (
+                isinstance(opt, CombineKwargDefault)
+                and opt != "minimal"
+                and original != concat_over
+            ):
+                warnings.warn(
+                    opt.warning_message(
+                        "This is likely to lead to different results when multiple datasets "
+                        "have matching variables with overlapping values.",
+                    ),
+                    category=FutureWarning,
+                    stacklevel=2,
+                )
         else:
             valid_vars = tuple(getattr(datasets[0], subset))
             invalid_vars = [k for k in opt if k not in valid_vars]
@@ -479,14 +531,14 @@ def _parse_datasets(
 
 def _dataset_concat(
     datasets: Iterable[T_Dataset],
     dim: str | T_Variable | T_DataArray | pd.Index,
-    data_vars: T_DataVars,
-    coords: str | list[str],
-    compat: CompatOptions,
+    data_vars: T_DataVars | CombineKwargDefault,
+    coords: str | list[str] | CombineKwargDefault,
+    compat: CompatOptions | CombineKwargDefault,
     positions: Iterable[Iterable[int]] | None,
-    fill_value: Any = dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "override",
-    create_index_for_new_dim: bool = True,
+    fill_value: Any,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
+    create_index_for_new_dim: bool,
 ) -> T_Dataset:
     """
     Concatenate a sequence of datasets along a new or existing dimension
@@ -501,6 +553,13 @@ def _dataset_concat(
             "The elements in the input list need to be either all 'Dataset's or all 'DataArray's"
         )
 
+    if not isinstance(compat, CombineKwargDefault) and compat not in set(
+        _VALID_COMPAT
+    ) - {"minimal"}:
+        raise ValueError(
+            f"compat={compat!r} invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'"
+        )
+
     if isinstance(dim, DataArray):
         dim_var = dim.variable
     elif isinstance(dim, Variable):
@@ -718,14 +777,14 @@ def get_indexes(name):
 def _dataarray_concat(
     arrays: Iterable[T_DataArray],
     dim: str | T_Variable | T_DataArray | pd.Index,
-    data_vars: T_DataVars,
-    coords: str | list[str],
-    compat: CompatOptions,
+    data_vars: T_DataVars | CombineKwargDefault,
+    coords: str | list[str] | CombineKwargDefault,
+    compat: CompatOptions | CombineKwargDefault,
     positions: Iterable[Iterable[int]] | None,
-    fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "override",
-    create_index_for_new_dim: bool = True,
+    fill_value: object,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
+    create_index_for_new_dim: bool,
 ) -> T_DataArray:
     from xarray.core.dataarray import DataArray
 
@@ -736,7 +795,10 @@ def _dataarray_concat(
             "The elements in the input list need to be either all 'Dataset's or all 'DataArray's"
         )
 
-    if data_vars != "all":
+    if not isinstance(data_vars, CombineKwargDefault) and data_vars not in [
+        "all",
+        "minimal",
+    ]:
         raise ValueError(
             "data_vars is not a valid argument when concatenating DataArray objects"
         )
@@ -754,11 +816,11 @@ def _dataarray_concat(
 
     ds = _dataset_concat(
         datasets,
-        dim,
-        data_vars,
-        coords,
-        compat,
-        positions,
+        dim=dim,
+        data_vars="all",
+        coords=coords,
+        compat=compat,
+        positions=positions,
         fill_value=fill_value,
         join=join,
         combine_attrs=combine_attrs,
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 449f502c43a..af37b1bb3f2 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -132,7 +132,13 @@
 from xarray.namedarray.parallelcompat import get_chunked_array_type, guess_chunkmanager
 from xarray.namedarray.pycompat import array_type, is_chunked_array, to_numpy
 from xarray.plot.accessor import DatasetPlotAccessor
-from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+    _deprecate_positional_args,
+    deprecate_dims,
+)
 
 if TYPE_CHECKING:
     from dask.dataframe import DataFrame as DaskDataFrame
@@ -413,6 +419,7 @@ def merge_data_and_coords(data_vars: DataVars, coords) -> _MergeResult:
         [data_vars, coords],
         compat="broadcast_equals",
         join="outer",
+        combine_attrs="override",
         explicit_coords=tuple(coords),
         indexes=coords.xindexes,
         priority_arg=1,
@@ -5506,7 +5513,14 @@ def stack_dataarray(da):
 
         # concatenate the arrays
         stackable_vars = [stack_dataarray(da) for da in self.data_vars.values()]
-        data_array = concat(stackable_vars, dim=new_dim)
+        data_array = concat(
+            stackable_vars,
+            dim=new_dim,
+            data_vars="all",
+            coords="different",
+            compat="equals",
+            join="outer",
+        )
 
         if name is not None:
             data_array.name = name
@@ -5750,8 +5764,8 @@ def merge(
         self,
         other: CoercibleMapping | DataArray,
         overwrite_vars: Hashable | Iterable[Hashable] = frozenset(),
-        compat: CompatOptions = "no_conflicts",
-        join: JoinOptions = "outer",
+        compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+        join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
         fill_value: Any = xrdtypes.NA,
         combine_attrs: CombineAttrsOptions = "override",
     ) -> Self:
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index b28ba390a9f..9a1827c4eb8 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -1526,7 +1526,14 @@ def _combine(self, applied, shortcut=False):
         if shortcut:
             combined = self._concat_shortcut(applied, dim, positions)
         else:
-            combined = concat(applied, dim)
+            combined = concat(
+                applied,
+                dim,
+                data_vars="all",
+                coords="different",
+                compat="equals",
+                join="outer",
+            )
             combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size)
 
         if isinstance(combined, type(self._obj)):
@@ -1686,7 +1693,14 @@ def _combine(self, applied):
         """Recombine the applied objects like the original."""
         applied_example, applied = peek_at(applied)
         dim, positions = self._infer_concat_args(applied_example)
-        combined = concat(applied, dim)
+        combined = concat(
+            applied,
+            dim,
+            data_vars="all",
+            coords="different",
+            compat="equals",
+            join="outer",
+        )
         combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size)
         # assign coord when the applied function does not return that coord
         if dim not in applied_example.dims:
diff --git a/xarray/core/merge.py b/xarray/core/merge.py
index 6426f741750..8c14582982b 100644
--- a/xarray/core/merge.py
+++ b/xarray/core/merge.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import warnings
 from collections import defaultdict
 from collections.abc import Hashable, Iterable, Mapping, Sequence, Set
 from typing import TYPE_CHECKING, Any, NamedTuple, Union
@@ -17,6 +18,11 @@
 )
 from xarray.core.utils import Frozen, compat_dict_union, dict_equiv, equivalent
 from xarray.core.variable import Variable, as_variable, calculate_dimensions
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.coordinates import Coordinates
@@ -83,7 +89,7 @@ class MergeError(ValueError):
 def unique_variable(
     name: Hashable,
     variables: list[Variable],
-    compat: CompatOptions = "broadcast_equals",
+    compat: CompatOptions | CombineKwargDefault = "broadcast_equals",
     equals: bool | None = None,
 ) -> Variable:
     """Return the unique variable from a list of variables or raise MergeError.
@@ -126,9 +132,12 @@ def unique_variable(
         combine_method = "fillna"
 
     if equals is None:
+        compat_str = (
+            compat._value if isinstance(compat, CombineKwargDefault) else compat
+        )
         # first check without comparing values i.e. no computes
         for var in variables[1:]:
-            equals = getattr(out, compat)(var, equiv=lazy_array_equiv)
+            equals = getattr(out, compat_str)(var, equiv=lazy_array_equiv)
             if equals is not True:
                 break
 
@@ -136,7 +145,7 @@ def unique_variable(
             # now compare values with minimum number of computes
             out = out.compute()
             for var in variables[1:]:
-                equals = getattr(out, compat)(var)
+                equals = getattr(out, compat_str)(var)
                 if not equals:
                     break
 
@@ -154,7 +163,7 @@ def unique_variable(
 
 
 def _assert_compat_valid(compat):
-    if compat not in _VALID_COMPAT:
+    if not isinstance(compat, CombineKwargDefault) and compat not in _VALID_COMPAT:
         raise ValueError(f"compat={compat!r} invalid: must be {set(_VALID_COMPAT)}")
 
@@ -196,7 +205,7 @@ def _assert_prioritized_valid(
 def merge_collected(
     grouped: dict[Any, list[MergeElement]],
     prioritized: Mapping[Any, MergeElement] | None = None,
-    compat: CompatOptions = "minimal",
+    compat: CompatOptions | CombineKwargDefault = "minimal",
     combine_attrs: CombineAttrsOptions = "override",
     equals: dict[Any, bool] | None = None,
 ) -> tuple[dict[Hashable, Variable], dict[Hashable, Index]]:
@@ -290,6 +299,21 @@ def merge_collected(
                     merged_vars[name] = unique_variable(
                         name, variables, compat, equals.get(name, None)
                     )
+                    # This is very likely to result in false positives, but there is no way
+                    # to tell if the output will change without computing.
+                    if (
+                        isinstance(compat, CombineKwargDefault)
+                        and compat == "no_conflicts"
+                        and len(variables) > 1
+                    ):
+                        warnings.warn(
+                            compat.warning_message(
+                                "This is likely to lead to different results when "
+                                "combining overlapping variables with the same name.",
+                            ),
+                            category=FutureWarning,
+                            stacklevel=2,
+                        )
                 except MergeError:
                     if compat != "minimal":
                         # we need more than "minimal" compatibility (for which
@@ -626,8 +650,8 @@ class _MergeResult(NamedTuple):
 
 def merge_core(
     objects: Iterable[CoercibleMapping],
-    compat: CompatOptions = "broadcast_equals",
-    join: JoinOptions = "outer",
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
     combine_attrs: CombineAttrsOptions = "override",
     priority_arg: int | None = None,
     explicit_coords: Iterable[Hashable] | None = None,
@@ -690,7 +714,11 @@ def merge_core(
     coerced = coerce_pandas_values(objects)
     aligned = deep_align(
-        coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
+        coerced,
+        join=join,
+        copy=False,
+        indexes=indexes,
+        fill_value=fill_value,
     )
 
     for pos, obj in skip_align_objs:
@@ -699,7 +727,10 @@ def merge_core(
     collected = collect_variables_and_indexes(aligned, indexes=indexes)
     prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
     variables, out_indexes = merge_collected(
-        collected, prioritized, compat=compat, combine_attrs=combine_attrs
+        collected,
+        prioritized,
+        compat=compat,
+        combine_attrs=combine_attrs,
     )
 
     dims = calculate_dimensions(variables)
@@ -730,8 +761,8 @@ def merge_core(
 
 def merge(
     objects: Iterable[DataArray | CoercibleMapping],
-    compat: CompatOptions = "no_conflicts",
-    join: JoinOptions = "outer",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     fill_value: object = dtypes.NA,
     combine_attrs: CombineAttrsOptions = "override",
 ) -> Dataset:
@@ -975,8 +1006,8 @@ def merge(
 
     merge_result = merge_core(
         dict_like_objects,
-        compat,
-        join,
+        compat=compat,
+        join=join,
         combine_attrs=combine_attrs,
         fill_value=fill_value,
     )
@@ -987,8 +1018,8 @@ def dataset_merge_method(
     dataset: Dataset,
     other: CoercibleMapping,
     overwrite_vars: Hashable | Iterable[Hashable],
-    compat: CompatOptions,
-    join: JoinOptions,
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
     fill_value: Any,
     combine_attrs: CombineAttrsOptions,
 ) -> _MergeResult:
@@ -1021,8 +1052,8 @@ def dataset_merge_method(
 
     return merge_core(
         objs,
-        compat,
-        join,
+        compat=compat,
+        join=join,
         priority_arg=priority_arg,
         fill_value=fill_value,
         combine_attrs=combine_attrs,
@@ -1054,6 +1085,8 @@ def dataset_update_method(dataset: Dataset, other: CoercibleMapping) -> _MergeRe
 
     return merge_core(
         [dataset, other],
+        compat="broadcast_equals",
+        join="outer",
         priority_arg=1,
         indexes=dataset.xindexes,
         combine_attrs="override",
diff --git a/xarray/core/options.py b/xarray/core/options.py
index 2d69e4b6584..df4bd94d074 100644
--- a/xarray/core/options.py
+++ b/xarray/core/options.py
@@ -29,6 +29,7 @@
     "keep_attrs",
     "warn_for_unclosed_files",
     "use_bottleneck",
+    "use_new_combine_kwarg_defaults",
    "use_numbagg",
     "use_opt_einsum",
     "use_flox",
@@ -57,6 +58,7 @@ class T_Options(TypedDict):
     warn_for_unclosed_files: bool
     use_bottleneck: bool
     use_flox: bool
+    use_new_combine_kwarg_defaults: bool
     use_numbagg: bool
     use_opt_einsum: bool
 
@@ -84,6 +86,7 @@ class T_Options(TypedDict):
     "warn_for_unclosed_files": False,
     "use_bottleneck": True,
     "use_flox": True,
+    "use_new_combine_kwarg_defaults": False,
     "use_numbagg": True,
     "use_opt_einsum": True,
 }
@@ -113,6 +116,7 @@ def _positive_integer(value: Any) -> bool:
     "file_cache_maxsize": _positive_integer,
     "keep_attrs": lambda choice: choice in [True, False, "default"],
     "use_bottleneck": lambda value: isinstance(value, bool),
+    "use_new_combine_kwarg_defaults": lambda value: isinstance(value, bool),
     "use_numbagg": lambda value: isinstance(value, bool),
     "use_opt_einsum": lambda value: isinstance(value, bool),
     "use_flox": lambda value: isinstance(value, bool),
@@ -250,6 +254,15 @@ class set_options:
     use_flox : bool, default: True
         Whether to use ``numpy_groupies`` and ``flox`` to
         accelerate groupby and resampling reductions.
+    use_new_combine_kwarg_defaults : bool, default False
+        Whether to use new kwarg default values for combine functions:
+        :py:func:`~xarray.concat`, :py:func:`~xarray.merge`,
+        :py:func:`~xarray.open_mfdataset`. New values are:
+
+        * ``data_vars``: "minimal"
+        * ``coords``: "minimal"
+        * ``compat``: "override"
+        * ``join``: "exact"
     use_numbagg : bool, default: True
         Whether to use ``numbagg`` to accelerate reductions.
         Takes precedence over ``use_bottleneck`` when both are True.
diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py
index 6d6a6672470..d70a3b8b516 100644
--- a/xarray/core/parallel.py
+++ b/xarray/core/parallel.py
@@ -351,7 +351,9 @@ def _wrapper(
         result = func(*converted_args, **kwargs)
 
         merged_coordinates = merge(
-            [arg.coords for arg in args if isinstance(arg, Dataset | DataArray)]
+            [arg.coords for arg in args if isinstance(arg, Dataset | DataArray)],
+            join="outer",
+            compat="no_conflicts",
         ).coords
 
         # check all dims are present
@@ -439,7 +441,9 @@ def _wrapper(
     # rechunk any numpy variables appropriately
     xarray_objs = tuple(arg.chunk(arg.chunksizes) for arg in xarray_objs)
 
-    merged_coordinates = merge([arg.coords for arg in aligned]).coords
+    merged_coordinates = merge(
+        [arg.coords for arg in aligned], join="outer", compat="no_conflicts"
+    ).coords
 
     _, npargs = unzip(
         sorted(
@@ -472,7 +476,9 @@ def _wrapper(
     )
 
     coordinates = merge(
-        (preserved_coords, template.coords.to_dataset()[new_coord_vars])
+        (preserved_coords, template.coords.to_dataset()[new_coord_vars]),
+        join="outer",
+        compat="no_conflicts",
     ).coords
     output_chunks: Mapping[Hashable, tuple[int, ...]] = {
         dim: input_chunks[dim] for dim in template.dims if dim in input_chunks
diff --git a/xarray/plot/dataarray_plot.py b/xarray/plot/dataarray_plot.py
index cca9fe4f561..9663303276e 100644
--- a/xarray/plot/dataarray_plot.py
+++ b/xarray/plot/dataarray_plot.py
@@ -196,7 +196,14 @@ def _prepare_plot1d_data(
             dim = coords_to_plot.get(v, None)
             if (dim is not None) and (dim in darray.dims):
                 darray_nan = np.nan * darray.isel({dim: -1})
-                darray = concat([darray, darray_nan], dim=dim)
+                darray = concat(
+                    [darray, darray_nan],
+                    dim=dim,
+                    data_vars="all",
+                    coords="different",
+                    compat="equals",
+                    join="outer",
+                )
                 dims_T.append(coords_to_plot[v])
 
     # Lines should never connect to the same coordinate when stacked,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 83d5afa6a09..e95f710c43c 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -14,7 +14,7 @@
 import uuid
 import warnings
 from collections.abc import Generator, Iterator, Mapping
-from contextlib import ExitStack
+from contextlib import ExitStack, nullcontext
 from io import BytesIO
 from os import listdir
 from pathlib import Path
@@ -4511,13 +4511,14 @@ def setup_files_and_datasets(self, fuzz=0):
             # to test join='exact'
             ds1["x"] = ds1.x + fuzz
 
-        with create_tmp_file() as tmpfile1:
-            with create_tmp_file() as tmpfile2:
-                # save data to the temporary files
-                ds1.to_netcdf(tmpfile1)
-                ds2.to_netcdf(tmpfile2)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with create_tmp_file() as tmpfile1:
+                with create_tmp_file() as tmpfile2:
+                    # save data to the temporary files
+                    ds1.to_netcdf(tmpfile1)
+                    ds2.to_netcdf(tmpfile2)
 
-                yield [tmpfile1, tmpfile2], [ds1, ds2]
+                    yield [tmpfile1, tmpfile2], [ds1, ds2]
 
     def gen_datasets_with_common_coord_and_time(self):
         # create coordinate data
@@ -4554,11 +4555,19 @@ def test_open_mfdataset_does_same_as_concat(
             if combine == "by_coords":
                 files.reverse()
             with open_mfdataset(
-                files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join
+                files,
+                data_vars=opt,
+                combine=combine,
+                concat_dim=concat_dim,
+                join=join,
+                compat="no_conflicts",
             ) as ds:
-                ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join)
+                ds_expect = xr.concat(
+                    [ds1, ds2], data_vars=opt, dim="t", join=join, compat="equals"
+                )
                 assert_identical(ds, ds_expect)
 
+    @pytest.mark.parametrize("use_new_combine_kwarg_defaults", [True, False])
     @pytest.mark.parametrize(
         ["combine_attrs", "attrs", "expected", "expect_error"],
         (
@@ -4586,7 +4595,12 @@ def test_open_mfdataset_does_same_as_concat(
         ),
     )
     def test_open_mfdataset_dataset_combine_attrs(
-        self, combine_attrs, attrs, expected, expect_error
+        self,
+        use_new_combine_kwarg_defaults,
+        combine_attrs,
+        attrs,
+        expected,
+        expect_error,
     ):
         with self.setup_files_and_datasets() as (files, [ds1, ds2]):
             # Give the files an inconsistent attribute
@@ -4596,22 +4610,24 @@ def test_open_mfdataset_dataset_combine_attrs(
                 ds.close()
                 ds.to_netcdf(f)
 
-            if expect_error:
-                with pytest.raises(xr.MergeError):
-                    xr.open_mfdataset(
-                        files,
-                        combine="nested",
-                        concat_dim="t",
-                        combine_attrs=combine_attrs,
-                    )
-            else:
-                with xr.open_mfdataset(
-                    files,
-                    combine="nested",
-                    concat_dim="t",
-                    combine_attrs=combine_attrs,
-                ) as ds:
-                    assert ds.attrs == expected
+            with set_options(
+                use_new_combine_kwarg_defaults=use_new_combine_kwarg_defaults
+            ):
+                warning = (
+                    pytest.warns(FutureWarning)
+                    if not use_new_combine_kwarg_defaults
+                    else nullcontext()
+                )
+                error = pytest.raises(xr.MergeError) if expect_error else nullcontext()
+                with warning:
+                    with error:
+                        with xr.open_mfdataset(
+                            files,
+                            combine="nested",
+                            concat_dim="t",
+                            combine_attrs=combine_attrs,
+                        ) as ds:
+                            assert ds.attrs == expected
 
     def test_open_mfdataset_dataset_attr_by_coords(self) -> None:
         """
@@ -4640,30 +4656,65 @@ def test_open_mfdataset_dataarray_attr_by_coords(self) -> None:
                 ds.close()
                 ds.to_netcdf(f)
 
-            with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds:
+            with xr.open_mfdataset(
+                files, data_vars="minimal", combine="nested", concat_dim="t"
+            ) as ds:
                 assert ds["v1"].test_dataarray_attr == 0
 
     @pytest.mark.parametrize(
         "combine, concat_dim", [("nested", "t"), ("by_coords", None)]
     )
-    @pytest.mark.parametrize("opt", ["all", "minimal", "different"])
+    @pytest.mark.parametrize(
+        "kwargs",
+        [
+            {"data_vars": "all"},
+            {"data_vars": "minimal"},
+            {
+                "data_vars": "all",
+                "coords": "different",
+                "compat": "no_conflicts",
+            },  # old defaults
+            {
+                "data_vars": "minimal",
+                "coords": "minimal",
+                "compat": "override",
+            },  # new defaults
+            {"data_vars": "different", "compat": "no_conflicts"},
+            {},
+        ],
+    )
     def test_open_mfdataset_exact_join_raises_error(
-        self, combine, concat_dim, opt
+        self, combine, concat_dim, kwargs
     ) -> None:
-        with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]):
+        with self.setup_files_and_datasets(fuzz=0.1) as (files, _):
             if combine == "by_coords":
                 files.reverse()
             with pytest.raises(
-                ValueError, match=r"cannot align objects.*join.*exact.*"
+                ValueError, match="cannot align objects with join='exact'"
             ):
                 open_mfdataset(
                     files,
-                    data_vars=opt,
+                    **kwargs,
                     combine=combine,
                     concat_dim=concat_dim,
                     join="exact",
                 )
 
+    def test_open_mfdataset_defaults_with_exact_join_warns_as_well_as_raising(
+        self,
+    ) -> None:
+        with self.setup_files_and_datasets(fuzz=0.1) as (files, _):
+            with set_options(use_new_combine_kwarg_defaults=False):
+                files.reverse()
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from data_vars='all' to data_vars='minimal'",
+                ):
+                    with pytest.raises(
+                        ValueError, match="cannot align objects with join='exact'"
+                    ):
+                        open_mfdataset(files, combine="by_coords", join="exact")
+
     def test_common_coord_when_datavars_all(self) -> None:
         opt: Final = "all"
 
@@ -4711,6 +4762,50 @@ def test_invalid_data_vars_value_should_fail(self) -> None:
             with open_mfdataset(files, coords="minimum", combine="by_coords"):
                 pass
 
+    @pytest.mark.parametrize(
+        "combine, concat_dim", [("nested", "t"), ("by_coords", None)]
+    )
+    @pytest.mark.parametrize(
+        "kwargs", [{"data_vars": "different"}, {"coords": "different"}]
+    )
+    def test_open_mfdataset_warns_when_kwargs_set_to_different(
+        self, combine, concat_dim, kwargs
+    ) -> None:
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            if combine == "by_coords":
+                files.reverse()
+            with pytest.raises(
+                ValueError, match="Previously the default was compat='no_conflicts'"
+            ):
+                open_mfdataset(files, combine=combine, concat_dim=concat_dim, **kwargs)
+            with pytest.raises(
+                ValueError, match="Previously the default was compat='equals'"
+            ):
+                xr.concat([ds1, ds2], dim="t", **kwargs)
+
+            with set_options(use_new_combine_kwarg_defaults=False):
+                if "data_vars" not in kwargs:
+                    expectation = pytest.warns(
+                        FutureWarning,
+                        match="will change from data_vars='all'",
+                    )
+                else:
+                    expectation = nullcontext()
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from compat='equals'",
+                ):
+                    with expectation:
+                        ds_expect = xr.concat([ds1, ds2], dim="t", **kwargs)
+                with pytest.warns(
+                    FutureWarning, match="will change from compat='no_conflicts'"
+                ):
+                    with expectation:
+                        with open_mfdataset(
+                            files, combine=combine, concat_dim=concat_dim, **kwargs
+                        ) as ds:
+                            assert_identical(ds, ds_expect)
+
 
 @requires_dask
 @requires_scipy
@@ -4966,11 +5061,58 @@ def test_encoding_mfdataset(self) -> None:
                 ds2.t.encoding["units"] = "days since 2000-01-01"
                 ds1.to_netcdf(tmp1)
                 ds2.to_netcdf(tmp2)
-                with open_mfdataset([tmp1, tmp2], combine="nested") as actual:
+                with open_mfdataset(
+                    [tmp1, tmp2], combine="nested", compat="no_conflicts", join="outer"
+                ) as actual:
                     assert actual.t.encoding["units"] == original.t.encoding["units"]
                     assert actual.t.encoding["units"] == ds1.t.encoding["units"]
                     assert actual.t.encoding["units"] != ds2.t.encoding["units"]
 
+    def test_encoding_mfdataset_new_defaults(self) -> None:
+        original = Dataset(
+            {
+                "foo": ("t", np.random.randn(10)),
+                "t": ("t", pd.date_range(start="2010-01-01", periods=10, freq="1D")),
+            }
+        )
+        original.t.encoding["units"] = "days since 2010-01-01"
+
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                ds1 = original.isel(t=slice(5))
+                ds2 = original.isel(t=slice(5, 10))
+                ds1.t.encoding["units"] = "days since 2010-01-01"
+                ds2.t.encoding["units"] = "days since 2000-01-01"
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+
+                with set_options(use_new_combine_kwarg_defaults=False):
+                    with pytest.warns(
+                        FutureWarning,
+                        match="will change from join='outer' to join='exact'",
+                    ):
+                        with pytest.warns(
+                            FutureWarning,
+                            match="will change from compat='no_conflicts' to compat='override'",
+                        ):
+                            with open_mfdataset([tmp1, tmp2], combine="nested") as old:
+                                assert (
+                                    old.t.encoding["units"]
+                                    == original.t.encoding["units"]
+                                )
+                                assert (
+                                    old.t.encoding["units"] == ds1.t.encoding["units"]
+                                )
+                                assert (
+                                    old.t.encoding["units"] != ds2.t.encoding["units"]
+                                )
+
+                with set_options(use_new_combine_kwarg_defaults=True):
+                    with pytest.raises(
+                        ValueError, match="Error might be related to new default"
+                    ):
+                        open_mfdataset([tmp1, tmp2], combine="nested")
+
     def test_preprocess_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp:
@@ -5053,25 +5195,21 @@ def test_open_and_do_math(self) -> None:
                 actual = 1.0 * ds
                 assert_allclose(original, actual, decode_bytes=False)
 
-    def test_open_mfdataset_concat_dim_none(self) -> None:
-        with create_tmp_file() as tmp1:
-            with create_tmp_file() as tmp2:
-                data = Dataset({"x": 0})
-                data.to_netcdf(tmp1)
-                Dataset({"x": np.nan}).to_netcdf(tmp2)
-                with open_mfdataset(
-                    [tmp1, tmp2], concat_dim=None, combine="nested"
-                ) as actual:
-                    assert_identical(data, actual)
-
-    def test_open_mfdataset_concat_dim_default_none(self) -> None:
-        with create_tmp_file() as tmp1:
-            with create_tmp_file() as tmp2:
-                data = Dataset({"x": 0})
-                data.to_netcdf(tmp1)
-                Dataset({"x": np.nan}).to_netcdf(tmp2)
-                with open_mfdataset([tmp1, tmp2], combine="nested") as actual:
-                    assert_identical(data, actual)
+    @pytest.mark.parametrize(
+        "kwargs",
+        [pytest.param({"concat_dim": None}, id="none"), pytest.param({}, id="default")],
+    )
+    def test_open_mfdataset_concat_dim(self, kwargs) -> None:
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with create_tmp_file() as tmp1:
+                with create_tmp_file() as tmp2:
+                    data = Dataset({"x": 0})
+                    data.to_netcdf(tmp1)
+                    Dataset({"x": np.nan}).to_netcdf(tmp2)
+                    with open_mfdataset(
+                        [tmp1, tmp2], **kwargs, combine="nested"
+                    ) as actual:
+                        assert_identical(data, actual)
 
     def test_open_dataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
@@ -5098,7 +5236,9 @@ def test_open_single_dataset(self) -> None:
         )
         with create_tmp_file() as tmp:
             original.to_netcdf(tmp)
-            with open_mfdataset([tmp], concat_dim=dim, combine="nested") as actual:
+            with open_mfdataset(
+                [tmp], concat_dim=dim, data_vars="all", combine="nested"
+            ) as actual:
                 assert_identical(expected, actual)
 
     def test_open_multi_dataset(self) -> None:
@@ -5122,7 +5262,7 @@ def test_open_multi_dataset(self) -> None:
             original.to_netcdf(tmp1)
             original.to_netcdf(tmp2)
             with open_mfdataset(
-                [tmp1, tmp2], concat_dim=dim, combine="nested"
+                [tmp1, tmp2], concat_dim=dim, data_vars="all", combine="nested"
             ) as actual:
                 assert_identical(expected, actual)
 
@@ -6579,19 +6719,20 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]):
 @requires_h5netcdf
 @requires_fsspec
 def test_h5netcdf_storage_options() -> None:
-    with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
-        ds1 = create_test_data()
-        ds1.to_netcdf(f1, engine="h5netcdf")
+    with set_options(use_new_combine_kwarg_defaults=True):
+        with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
+            ds1 = create_test_data()
+            ds1.to_netcdf(f1, engine="h5netcdf")
 
-        ds2 = create_test_data()
-        ds2.to_netcdf(f2, engine="h5netcdf")
+            ds2 = create_test_data()
+            ds2.to_netcdf(f2, engine="h5netcdf")
 
-        files = [f"file://{f}" for f in [f1, f2]]
-        ds = xr.open_mfdataset(
-            files,
-            engine="h5netcdf",
-            concat_dim="time",
-            combine="nested",
-            storage_options={"skip_instance_cache": False},
-        )
-        assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
+            files = [f"file://{f}" for f in [f1, f2]]
+            ds = xr.open_mfdataset(
+                files,
+                engine="h5netcdf",
+                concat_dim="time",
+                combine="nested",
+                storage_options={"skip_instance_cache": False},
+            )
+            assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index cc20ab414ee..c1d61a6f424 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -13,6 +13,7 @@
     combine_nested,
     concat,
     merge,
+    set_options,
 )
 from xarray.core import dtypes
 from xarray.core.combine import (
@@ -290,9 +291,12 @@ def test_concat_once(self, create_combined_ids, concat_dim):
             data_vars="all",
             coords="different",
             compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
         )
 
-        expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
+        expected_ds = concat([ds(0), ds(1)], data_vars="all", dim=concat_dim)
         assert_combined_tile_ids_equal(result, {(): expected_ds})
 
     def test_concat_only_first_dim(self, create_combined_ids):
@@ -304,6 +308,9 @@ def test_concat_only_first_dim(self, create_combined_ids):
             data_vars="all",
             coords="different",
             compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
         )
 
         ds = create_test_data
@@ -319,13 +326,24 @@ def test_concat_only_first_dim(self, create_combined_ids):
     def test_concat_twice(self, create_combined_ids, concat_dim):
         shape = (2, 3)
         combined_ids = create_combined_ids(shape)
-        result = _combine_nd(combined_ids, concat_dims=["dim1", concat_dim])
+        result = _combine_nd(
+            combined_ids,
+            concat_dims=["dim1", concat_dim],
+            data_vars="all",
+            coords="different",
+            compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
+        )
 
         ds = create_test_data
         partway1 = concat([ds(0), ds(3)], dim="dim1")
         partway2 = concat([ds(1), ds(4)], dim="dim1")
         partway3 = concat([ds(2), ds(5)], dim="dim1")
-        expected = concat([partway1, partway2, partway3], dim=concat_dim)
+        expected = concat(
+            [partway1, partway2, partway3], data_vars="all", dim=concat_dim
+        )
 
         assert_equal(result, expected)
 
@@ -417,7 +435,7 @@ def test_nested_concat_along_new_dim(self):
             Dataset({"a": ("x", [20]), "x": [0]}),
         ]
         expected = Dataset({"a": (("t", "x"), [[10], [20]]), "x": [0]})
-        actual = combine_nested(objs, concat_dim="t")
+        actual = combine_nested(objs, data_vars="all", concat_dim="t")
         assert_identical(expected, actual)
 
         # Same but with a DataArray as new dim, see GH #1988 and #2647
@@ -425,42 +443,51 @@ def test_nested_concat_along_new_dim(self):
         expected = Dataset(
             {"a": (("baz", "x"), [[10], [20]]), "x": [0], "baz": [100, 150]}
         )
-        actual = combine_nested(objs, concat_dim=dim)
+        actual = combine_nested(objs, data_vars="all", concat_dim=dim)
         assert_identical(expected, actual)
 
-    def test_nested_merge(self):
+    def test_nested_merge_with_self(self):
         data = Dataset({"x": 0})
-        actual = combine_nested([data, data, data], concat_dim=None)
+        actual = combine_nested(
+            [data, data, data], compat="no_conflicts", concat_dim=None
+        )
         assert_identical(data, actual)
 
+    def test_nested_merge_with_overlapping_values(self):
         ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]})
         ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]})
         expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]})
-        actual = combine_nested([ds1, ds2], concat_dim=None)
+        actual = combine_nested(
+            [ds1, ds2], join="outer", compat="no_conflicts", concat_dim=None
+        )
         assert_identical(expected, actual)
-        actual = combine_nested([ds1, ds2], concat_dim=[None])
+        actual = combine_nested(
+            [ds1, ds2], join="outer", compat="no_conflicts", concat_dim=[None]
+        )
         assert_identical(expected, actual)
 
+    def test_nested_merge_with_nan(self):
         tmp1 = Dataset({"x": 0})
         tmp2 = Dataset({"x": np.nan})
-        actual = combine_nested([tmp1, tmp2], concat_dim=None)
+        actual = combine_nested([tmp1, tmp2], compat="no_conflicts", concat_dim=None)
         assert_identical(tmp1, actual)
-        actual = combine_nested([tmp1, tmp2], concat_dim=[None])
+        actual = combine_nested([tmp1, tmp2], compat="no_conflicts", concat_dim=[None])
         assert_identical(tmp1, actual)
 
-        # Single object, with a concat_dim explicitly provided
+    def test_nested_merge_with_concat_dim_explicitly_provided(self):
         # Test the issue reported in GH #1988
         objs = [Dataset({"x": 0, "y": 1})]
         dim = DataArray([100], name="baz", dims="baz")
-        actual = combine_nested(objs, concat_dim=[dim])
+        actual = combine_nested(objs, concat_dim=[dim], data_vars="all")
         expected = Dataset({"x": ("baz", [0]), "y": ("baz", [1])}, {"baz": [100]})
         assert_identical(expected, actual)
 
+    def test_nested_merge_with_non_scalars(self):
         # Just making sure that auto_combine is doing what is
         # expected for non-scalar values, too.
         objs = [Dataset({"x": ("z", [0, 1]), "y": ("z", [1, 2])})]
         dim = DataArray([100], name="baz", dims="baz")
-        actual = combine_nested(objs, concat_dim=[dim])
+        actual = combine_nested(objs, concat_dim=[dim], data_vars="all")
         expected = Dataset(
             {"x": (("baz", "z"), [[0, 1]]), "y": (("baz", "z"), [[1, 2]])},
             {"baz": [100]},
@@ -510,10 +537,15 @@ def test_auto_combine_2d(self):
         partway1 = concat([ds(0), ds(3)], dim="dim1")
         partway2 = concat([ds(1), ds(4)], dim="dim1")
         partway3 = concat([ds(2), ds(5)], dim="dim1")
-        expected = concat([partway1, partway2, partway3], dim="dim2")
+        expected = concat([partway1, partway2, partway3], data_vars="all", dim="dim2")
 
         datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]]
-        result = combine_nested(datasets, concat_dim=["dim1", "dim2"])
+        result = combine_nested(
+            datasets,
+            data_vars="all",
+            compat="no_conflicts",
+            concat_dim=["dim1", "dim2"],
+        )
         assert_equal(result, expected)
 
     def test_auto_combine_2d_combine_attrs_kwarg(self):
@@ -522,7 +554,7 @@ def test_auto_combine_2d_combine_attrs_kwarg(self):
         partway1 = concat([ds(0), ds(3)], dim="dim1")
         partway2 = concat([ds(1), ds(4)], dim="dim1")
         partway3 = concat([ds(2), ds(5)], dim="dim1")
-        expected = concat([partway1, partway2, partway3], dim="dim2")
+        expected = concat([partway1, partway2, partway3], data_vars="all", dim="dim2")
 
         expected_dict = {}
         expected_dict["drop"] = expected.copy(deep=True)
@@ -553,12 +585,20 @@ def test_auto_combine_2d_combine_attrs_kwarg(self):
 
         with pytest.raises(ValueError, match=r"combine_attrs='identical'"):
             result = combine_nested(
-                datasets, concat_dim=["dim1", "dim2"], combine_attrs="identical"
+                datasets,
+                concat_dim=["dim1", "dim2"],
+                data_vars="all",
+                compat="no_conflicts",
+                combine_attrs="identical",
             )
 
         for combine_attrs in expected_dict:
             result = combine_nested(
-                datasets, concat_dim=["dim1", "dim2"], combine_attrs=combine_attrs
+                datasets,
+                concat_dim=["dim1", "dim2"],
+                data_vars="all",
+                compat="no_conflicts",
+                combine_attrs=combine_attrs,
             )
             assert_identical(result, expected_dict[combine_attrs])
 
@@ -572,7 +612,7 @@ def test_combine_nested_missing_data_new_dim(self):
         expected = Dataset(
             {"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]}
         )
-        actual = combine_nested(datasets, concat_dim="t")
+        actual = combine_nested(datasets, data_vars="all", join="outer", concat_dim="t")
         assert_identical(expected, actual)
 
     def test_invalid_hypercube_input(self):
@@ -650,7 +690,13 @@ def test_combine_nested_fill_value(self, fill_value):
             },
             {"x": [0, 1, 2]},
         )
-        actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
+        actual = combine_nested(
+            datasets,
+            concat_dim="t",
+            data_vars="all",
+            join="outer",
+            fill_value=fill_value,
+        )
         assert_identical(expected, actual)
 
     def test_combine_nested_unnamed_data_arrays(self):
@@ -710,26 +756,30 @@ def test_combine_by_coords(self):
         expected = Dataset({"x": [0, 1, 2]})
         assert_identical(expected, actual)
 
+    def test_combine_by_coords_handles_non_sorted_variables(self):
         # ensure auto_combine handles non-sorted variables
         objs = [
             Dataset({"x": ("a", [0]), "y": ("a", [0]), "a": [0]}),
             Dataset({"x": ("a", [1]), "y": ("a", [1]), "a": [1]}),
         ]
-        actual = combine_by_coords(objs)
+        actual = combine_by_coords(objs, join="outer")
         expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1]), "a": [0, 1]})
         assert_identical(expected, actual)
 
+    def test_combine_by_coords_multiple_variables(self):
         objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})]
-        actual = combine_by_coords(objs)
+        actual = combine_by_coords(objs, join="outer")
         expected = Dataset({"x": [0, 1], "y": [0, 1]})
         assert_equal(actual, expected)
 
+    def test_combine_by_coords_for_scalar_variables(self):
         objs = [Dataset({"x": 0}), Dataset({"x": 1})]
         with pytest.raises(
             ValueError, match=r"Could not find any dimension coordinates"
         ):
             combine_by_coords(objs)
 
+    def test_combine_by_coords_requires_coord_or_index(self):
         objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
         with pytest.raises(
             ValueError,
@@ -945,7 +995,9 @@ def test_combine_by_coords_combine_attrs_variables(
             with pytest.raises(MergeError, match="combine_attrs"):
                 combine_by_coords([data1, data2], combine_attrs=combine_attrs)
         else:
-            actual = combine_by_coords([data1, data2], combine_attrs=combine_attrs)
+            actual = combine_by_coords(
+                [data1, data2], data_vars="all", combine_attrs=combine_attrs
+            )
             expected = Dataset(
                 {
                     "x": ("a", [0, 1], expected_attrs),
@@ -959,7 +1011,7 @@ def test_combine_by_coords_combine_attrs_variables(
     def test_infer_order_from_coords(self):
         data = create_test_data()
         objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
-        actual = combine_by_coords(objs)
+        actual = combine_by_coords(objs, data_vars="all", compat="no_conflicts")
         expected = data
         assert expected.broadcast_equals(actual)
 
@@ -997,7 +1049,7 @@ def test_combine_by_coords_previously_failed(self):
             Dataset({"a": ("x", [1]), "x": [1]}),
         ]
         expected = Dataset({"a": ("x", [0, 1]), "b": ("x", [0, np.nan])}, {"x": [0, 1]})
-        actual = combine_by_coords(datasets)
+        actual = combine_by_coords(datasets, join="outer")
         assert_identical(expected, actual)
 
     def test_combine_by_coords_still_fails(self):
@@ -1014,7 +1066,7 @@ def test_combine_by_coords_no_concat(self):
         assert_identical(expected, actual)
 
         objs = [Dataset({"x": 0, "y": 1}), Dataset({"y": np.nan, "z": 2})]
-        actual = combine_by_coords(objs)
+        actual = combine_by_coords(objs, compat="no_conflicts")
         expected = Dataset({"x": 0, "y": 1, "z": 2})
         assert_identical(expected, actual)
 
@@ -1032,7 +1084,7 @@ def test_combine_by_coords_incomplete_hypercube(self):
         x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
         x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
         x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
-        actual = combine_by_coords([x1, x2, x3])
+        actual = combine_by_coords([x1, x2, x3], join="outer")
         expected = Dataset(
             {"a": (("y", "x"), [[1, 1], [1, np.nan]])},
             coords={"y": [0, 1], "x": [0, 1]},
@@ -1040,8 +1092,10 @@ def test_combine_by_coords_incomplete_hypercube(self):
         assert_identical(expected, actual)
 
         # test that this fails if fill_value is None
-        with pytest.raises(ValueError):
-            combine_by_coords([x1, x2, x3], fill_value=None)
+        with pytest.raises(
+            ValueError, match="supplied objects do not form a hypercube"
+        ):
+            combine_by_coords([x1, x2, x3], join="outer", fill_value=None)
 
     def test_combine_by_coords_override_order(self) -> None:
         # regression test for https://github.com/pydata/xarray/issues/8828
@@ -1111,7 +1165,7 @@ def test_combine_by_coords_all_named_dataarrays(self):
         named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
         named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
 
-        actual = combine_by_coords([named_da1, named_da2])
+        actual = combine_by_coords([named_da1, named_da2], join="outer")
         expected = Dataset(
             {
                 "a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
@@ -1124,11 +1178,146 @@ def test_combine_by_coords_all_dataarrays_with_the_same_name(self):
         named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
         named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
 
-        actual = combine_by_coords([named_da1, named_da2])
-        expected = merge([named_da1, named_da2])
+        actual = combine_by_coords(
+            [named_da1, named_da2], compat="no_conflicts", join="outer"
+        )
+        expected = merge([named_da1, named_da2], compat="no_conflicts", join="outer")
         assert_identical(expected, actual)
 
 
+class TestNewDefaults:
+    def test_concat_along_existing_dim(self):
+        concat_dim = "dim1"
+        ds = create_test_data
+        with set_options(use_new_combine_kwarg_defaults=False):
+            old = concat([ds(0), ds(1)], dim=concat_dim)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            new = concat([ds(0), ds(1)], dim=concat_dim)
+
+        assert_identical(old, new)
+
+    def test_concat_along_new_dim(self):
+        concat_dim = "new_dim"
+        ds = create_test_data
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning,
+                match="will change from data_vars='all' to data_vars='minimal'",
+            ):
+                old = concat([ds(0), ds(1)], dim=concat_dim)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            new = concat([ds(0), ds(1)], dim=concat_dim)
+
+        with pytest.raises(AssertionError):
+            assert_identical(old, new)
+
+    def test_nested_merge_with_overlapping_values(self):
+        ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]})
+        ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]})
+        expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]})
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning, match="will change from join='outer' to join='exact'"
+            ):
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from compat='no_conflicts' to compat='override'",
+                ):
+                    old = combine_nested([ds1, ds2], concat_dim=None)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with pytest.raises(ValueError, match="might be related to new default"):
+                combine_nested([ds1, ds2], concat_dim=None)
+
+        assert_identical(old, expected)
+
+    def test_nested_merge_with_nan_order_matters(self):
+        ds1 = Dataset({"x": 0})
+        ds2 = Dataset({"x": np.nan})
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning,
+                match="will change from compat='no_conflicts' to compat='override'",
+            ):
+                old = combine_nested([ds1, ds2], concat_dim=None)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            new = combine_nested([ds1, ds2], concat_dim=None)
+
+        assert_identical(ds1, old)
+        assert_identical(old, new)
+
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning,
+                match="will change from compat='no_conflicts' to compat='override'",
+            ):
+                old = combine_nested([ds2, ds1], concat_dim=None)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            new = combine_nested([ds2, ds1], concat_dim=None)
+
+        assert_identical(ds1, old)
+        with pytest.raises(AssertionError):
+            assert_identical(old, new)
+
+    def test_nested_merge_with_concat_dim_explicitly_provided(self):
+        # Test the issue reported in GH #1988
+        objs = [Dataset({"x": 0, "y": 1})]
+        dim = DataArray([100], name="baz", dims="baz")
+        expected = Dataset({"x": ("baz", [0]), "y": ("baz", [1])}, {"baz": [100]})
+
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning,
+                match="will change from data_vars='all' to data_vars='minimal'",
+            ):
+                old = combine_nested(objs, concat_dim=dim)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            new = combine_nested(objs, concat_dim=dim)
+
+        assert_identical(expected, old)
+        with pytest.raises(AssertionError):
+            assert_identical(old, new)
+
+    def test_combine_nested_missing_data_new_dim(self):
+        # Your data includes "time" and "station" dimensions, and each year's
+        # data has a different set of stations.
+        datasets = [
+            Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
+            Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
+        ]
+        expected = Dataset(
+            {"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]}
+        )
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning, match="will change from join='outer' to join='exact'"
+            ):
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from data_vars='all' to data_vars='minimal'",
+                ):
+                    old = combine_nested(datasets, concat_dim="t")
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with pytest.raises(ValueError, match="might be related to new default"):
+                combine_nested(datasets, concat_dim="t")
+
+        assert_identical(expected, old)
+
+    def test_combine_by_coords_multiple_variables(self):
+        objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})]
+        expected = Dataset({"x": [0, 1], "y": [0, 1]})
+
+        with set_options(use_new_combine_kwarg_defaults=False):
+            with pytest.warns(
+                FutureWarning, match="will change from join='outer' to join='exact'"
+            ):
+                old = combine_by_coords(objs)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with pytest.raises(ValueError, match="might be related to new default"):
+                combine_by_coords(objs)
+
+        assert_identical(old, expected)
+
+
 @requires_cftime
 def test_combine_by_coords_distant_cftime_dates():
     # Regression test for https://github.com/pydata/xarray/issues/3535
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index b970781fe28..34ad36a1e12 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -446,7 +446,11 @@ def test_concat_loads_variables(self):
         assert kernel_call_count == 0
         out = xr.concat(
-            [ds1, ds2, ds3], dim="n", data_vars="different", coords="different"
+            [ds1, ds2, ds3],
+            dim="n",
+ data_vars="different", + coords="different", + compat="equals", ) # each kernel is computed exactly once assert kernel_call_count == 6 @@ -488,7 +492,11 @@ def test_concat_loads_variables(self): # stop computing variables as it would not have any benefit ds4 = Dataset(data_vars={"d": ("x", [2.0])}, coords={"c": ("x", [2.0])}) out = xr.concat( - [ds1, ds2, ds4, ds3], dim="n", data_vars="different", coords="different" + [ds1, ds2, ds4, ds3], + dim="n", + data_vars="different", + coords="different", + compat="equals", ) # the variables of ds1 and ds2 were computed, but those of ds3 didn't assert kernel_call_count == 22 @@ -509,7 +517,11 @@ def test_concat_loads_variables(self): # now check that concat() is correctly using dask name equality to skip loads out = xr.concat( - [ds1, ds1, ds1], dim="n", data_vars="different", coords="different" + [ds1, ds1, ds1], + dim="n", + data_vars="different", + coords="different", + compat="equals", ) assert kernel_call_count == 24 # variables are not loaded in the output @@ -1375,7 +1387,9 @@ def test_map_blocks_ds_transformations(func, map_ds): def test_map_blocks_da_ds_with_template(obj): func = lambda x: x.isel(x=[1]) # a simple .isel(x=[1, 5, 9]) puts all those in a single chunk. - template = xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x") + template = xr.concat( + [obj.isel(x=[i]) for i in [1, 5, 9]], data_vars="minimal", dim="x" + ) with raise_if_dask_computes(): actual = xr.map_blocks(func, obj, template=template) assert_identical(actual, template) @@ -1448,7 +1462,9 @@ def test_map_blocks_errors_bad_template(obj): xr.map_blocks( lambda a: a.isel(x=[1]).assign_coords(x=[120]), # assign bad index values obj, - template=xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x"), + template=xr.concat( + [obj.isel(x=[i]) for i in [1, 5, 9]], data_vars="minimal", dim="x" + ), ).compute() diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 75d6d919e19..66546283d4b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1412,12 +1412,25 @@ def test_selection_multiindex_from_level(self) -> None: # GH: 3512 da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) - data = xr.concat([da, db], dim="x").set_index(xy=["x", "y"]) + data = xr.concat( + [da, db], dim="x", coords="different", compat="equals" + ).set_index(xy=["x", "y"]) assert data.dims == ("xy",) actual = data.sel(y="a") expected = data.isel(xy=[0, 1]).unstack("xy").squeeze("y") assert_equal(actual, expected) + def test_concat_with_default_coords_warns(self) -> None: + da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) + db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) + + with pytest.warns(FutureWarning): + original = xr.concat([da, db], dim="x") + with set_options(use_new_combine_kwarg_defaults=True): + new = xr.concat([da, db], dim="x") + + assert original.y.shape != new.y.shape + def test_virtual_default_coords(self) -> None: array = DataArray(np.zeros((5,)), dims="x") expected = DataArray(range(5), dims="x", name="x") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7be2d13f9dd..28f932c8716 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6085,7 +6085,7 @@ def test_dataset_math_auto_align(self) -> None: assert_equal(actual, expected) actual = ds + ds[["bar"]] - expected = (2 * ds[["bar"]]).merge(ds.coords) + expected = (2 * ds[["bar"]]).merge(ds.coords, 
compat="override") assert_identical(expected, actual) assert_identical(ds + Dataset(), ds.coords.to_dataset()) @@ -6521,12 +6521,12 @@ def test_combine_first(self) -> None: coords={"x": ["a", "b", "c"]}, ) assert_equal(actual, expected) - assert_equal(actual, xr.merge([dsx0, dsx1])) + assert_equal(actual, xr.merge([dsx0, dsx1], join="outer")) # works just like xr.merge([self, other]) dsy2 = DataArray([2, 2, 2], [("x", ["b", "c", "d"])]).to_dataset(name="dsy2") actual = dsx0.combine_first(dsy2) - expected = xr.merge([dsy2, dsx0]) + expected = xr.merge([dsy2, dsx0], join="outer") assert_equal(actual, expected) def test_sortby(self) -> None: diff --git a/xarray/tests/test_duck_array_wrapping.py b/xarray/tests/test_duck_array_wrapping.py index 59928dce370..b0c9d40a8cc 100644 --- a/xarray/tests/test_duck_array_wrapping.py +++ b/xarray/tests/test_duck_array_wrapping.py @@ -155,7 +155,7 @@ def test_concat(self): assert isinstance(result.data, self.Array) def test_merge(self): - result = xr.merge([self.x1, self.x2], compat="override") + result = xr.merge([self.x1, self.x2], compat="override", join="outer") assert isinstance(result.foo.data, self.Array) def test_where(self): diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index d42f86f5ea6..be5ec0b28af 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -2402,6 +2402,7 @@ def test_resample_min_count(self) -> None: for i in range(3) ], dim=actual["time"], + data_vars="all", ) assert_allclose(expected, actual) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 52935e9714e..7d346994d6b 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -6,6 +6,7 @@ import xarray as xr from xarray.core import dtypes, merge from xarray.core.merge import MergeError +from xarray.core.options import set_options from xarray.testing import assert_equal, assert_identical from xarray.tests.test_dataset import create_test_data @@ -36,15 +37,17 @@ def test_merge_arrays(self): expected = data[["var1", "var2"]] assert_identical(actual, expected) - def test_merge_datasets(self): - data = create_test_data(add_attrs=False, use_extension_array=True) + @pytest.mark.parametrize("use_new_combine_kwarg_defaults", [True, False]) + def test_merge_datasets(self, use_new_combine_kwarg_defaults): + with set_options(use_new_combine_kwarg_defaults=use_new_combine_kwarg_defaults): + data = create_test_data(add_attrs=False, use_extension_array=True) - actual = xr.merge([data[["var1"]], data[["var2"]]]) - expected = data[["var1", "var2"]] - assert_identical(actual, expected) + actual = xr.merge([data[["var1"]], data[["var2"]]]) + expected = data[["var1", "var2"]] + assert_identical(actual, expected) - actual = xr.merge([data, data]) - assert_identical(actual, data) + actual = xr.merge([data, data], compat="no_conflicts") + assert_identical(actual, data) def test_merge_dataarray_unnamed(self): data = xr.DataArray([1, 2], dims="x") @@ -191,9 +194,13 @@ def test_merge_arrays_attrs_variables( if expect_exception: with pytest.raises(MergeError, match="combine_attrs"): - actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + actual = xr.merge( + [data1, data2], compat="no_conflicts", combine_attrs=combine_attrs + ) else: - actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + actual = xr.merge( + [data1, data2], compat="no_conflicts", combine_attrs=combine_attrs + ) expected = xr.Dataset( {"var1": ("dim1", [], expected_attrs)}, coords={"dim1": ("dim1", [], expected_attrs)}, @@ -266,8 
+273,12 @@ def test_merge_no_conflicts_single_var(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) expected = xr.Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) - assert expected.identical(xr.merge([ds1, ds2], compat="no_conflicts")) - assert expected.identical(xr.merge([ds2, ds1], compat="no_conflicts")) + assert expected.identical( + xr.merge([ds1, ds2], compat="no_conflicts", join="outer") + ) + assert expected.identical( + xr.merge([ds2, ds1], compat="no_conflicts", join="outer") + ) assert ds1.identical(xr.merge([ds1, ds2], compat="no_conflicts", join="left")) assert ds2.identical(xr.merge([ds1, ds2], compat="no_conflicts", join="right")) expected = xr.Dataset({"a": ("x", [2]), "x": [1]}) @@ -277,11 +288,11 @@ def test_merge_no_conflicts_single_var(self): with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("x", [99, 3]), "x": [1, 2]}) - xr.merge([ds1, ds3], compat="no_conflicts") + xr.merge([ds1, ds3], compat="no_conflicts", join="outer") with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("y", [2, 3]), "y": [1, 2]}) - xr.merge([ds1, ds3], compat="no_conflicts") + xr.merge([ds1, ds3], compat="no_conflicts", join="outer") def test_merge_no_conflicts_multi_var(self): data = create_test_data(add_attrs=False) @@ -303,17 +314,19 @@ def test_merge_no_conflicts_multi_var(self): def test_merge_no_conflicts_preserve_attrs(self): data = xr.Dataset({"x": ([], 0, {"foo": "bar"})}) - actual = xr.merge([data, data], combine_attrs="no_conflicts") + actual = xr.merge( + [data, data], compat="no_conflicts", combine_attrs="no_conflicts" + ) assert_identical(data, actual) def test_merge_no_conflicts_broadcast(self): datasets = [xr.Dataset({"x": ("y", [0])}), xr.Dataset({"x": np.nan})] - actual = xr.merge(datasets) + actual = xr.merge(datasets, compat="no_conflicts") expected = xr.Dataset({"x": ("y", [0])}) assert_identical(expected, actual) datasets = [xr.Dataset({"x": ("y", [np.nan])}), xr.Dataset({"x": 0})] - actual = xr.merge(datasets) + actual = xr.merge(datasets, compat="no_conflicts") assert_identical(expected, actual) @@ -329,27 +342,27 @@ def test_merge(self): actual = ds2.merge(ds1) assert_identical(expected, actual) - actual = data.merge(data) + actual = data.merge(data, compat="no_conflicts") assert_identical(data, actual) - actual = data.reset_coords(drop=True).merge(data) + actual = data.reset_coords(drop=True).merge(data, compat="no_conflicts") assert_identical(data, actual) - actual = data.merge(data.reset_coords(drop=True)) + actual = data.merge(data.reset_coords(drop=True), compat="no_conflicts") assert_identical(data, actual) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="conflicting values for variable"): ds1.merge(ds2.rename({"var3": "var1"})) with pytest.raises(ValueError, match=r"should be coordinates or not"): - data.reset_coords().merge(data) + data.reset_coords().merge(data, compat="no_conflicts") with pytest.raises(ValueError, match=r"should be coordinates or not"): - data.merge(data.reset_coords()) + data.merge(data.reset_coords(), compat="no_conflicts") def test_merge_broadcast_equals(self): ds1 = xr.Dataset({"x": 0}) ds2 = xr.Dataset({"x": ("y", [0, 0])}) - actual = ds1.merge(ds2) + actual = ds1.merge(ds2, compat="no_conflicts") assert_identical(ds2, actual) - actual = ds2.merge(ds1) + actual = ds2.merge(ds1, compat="override") assert_identical(ds2, actual) actual = ds1.copy() @@ -358,7 +371,7 @@ def test_merge_broadcast_equals(self): ds1 = xr.Dataset({"x": np.nan}) ds2 = 
xr.Dataset({"x": ("y", [np.nan, np.nan])}) - actual = ds1.merge(ds2) + actual = ds1.merge(ds2, compat="no_conflicts") assert_identical(ds2, actual) def test_merge_compat(self): @@ -398,8 +411,8 @@ def test_merge_auto_align(self): expected = xr.Dataset( {"a": ("x", [1, 2, np.nan]), "b": ("x", [np.nan, 3, 4])}, {"x": [0, 1, 2]} ) - assert expected.identical(ds1.merge(ds2)) - assert expected.identical(ds2.merge(ds1)) + assert expected.identical(ds1.merge(ds2, join="outer")) + assert expected.identical(ds2.merge(ds1, join="outer")) expected = expected.isel(x=slice(2)) assert expected.identical(ds1.merge(ds2, join="left")) @@ -427,17 +440,19 @@ def test_merge_fill_value(self, fill_value): {"a": ("x", [1, 2, fill_value_a]), "b": ("x", [fill_value_b, 3, 4])}, {"x": [0, 1, 2]}, ) - assert expected.identical(ds1.merge(ds2, fill_value=fill_value)) - assert expected.identical(ds2.merge(ds1, fill_value=fill_value)) - assert expected.identical(xr.merge([ds1, ds2], fill_value=fill_value)) + assert expected.identical(ds1.merge(ds2, join="outer", fill_value=fill_value)) + assert expected.identical(ds2.merge(ds1, join="outer", fill_value=fill_value)) + assert expected.identical( + xr.merge([ds1, ds2], join="outer", fill_value=fill_value) + ) def test_merge_no_conflicts(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) expected = xr.Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) - assert expected.identical(ds1.merge(ds2, compat="no_conflicts")) - assert expected.identical(ds2.merge(ds1, compat="no_conflicts")) + assert expected.identical(ds1.merge(ds2, compat="no_conflicts", join="outer")) + assert expected.identical(ds2.merge(ds1, compat="no_conflicts", join="outer")) assert ds1.identical(ds1.merge(ds2, compat="no_conflicts", join="left")) @@ -448,11 +463,11 @@ def test_merge_no_conflicts(self): with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("x", [99, 3]), "x": [1, 2]}) - ds1.merge(ds3, compat="no_conflicts") + ds1.merge(ds3, compat="no_conflicts", join="outer") with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("y", [2, 3]), "y": [1, 2]}) - ds1.merge(ds3, compat="no_conflicts") + ds1.merge(ds3, compat="no_conflicts", join="outer") def test_merge_dataarray(self): ds = xr.Dataset({"a": 0}) @@ -490,3 +505,80 @@ def test_merge_combine_attrs( actual = ds1.merge(ds2, combine_attrs=combine_attrs) expected = xr.Dataset(attrs=expected_attrs) assert_identical(actual, expected) + + +class TestNewDefaults: + def test_merge_datasets_false_warning(self): + data = create_test_data(add_attrs=False, use_extension_array=True) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = xr.merge([data, data]) + + with set_options(use_new_combine_kwarg_defaults=True): + new = xr.merge([data, data]) + + assert_identical(old, new) + + def test_merge(self): + data = create_test_data() + ds1 = data[["var1"]] + ds2 = data[["var3"]] + expected = data[["var1", "var3"]] + with set_options(use_new_combine_kwarg_defaults=True): + actual = ds1.merge(ds2) + assert_identical(expected, actual) + + actual = ds2.merge(ds1) + assert_identical(expected, actual) + + actual = data.merge(data) + assert_identical(data, actual) + + ds1.merge(ds2.rename({"var3": "var1"})) + + with pytest.raises(ValueError, match=r"should be coordinates or not"): + data.reset_coords().merge(data) + with pytest.raises(ValueError, match=r"should be coordinates or 
not"): + data.merge(data.reset_coords()) + + def test_merge_broadcast_equals(self): + ds1 = xr.Dataset({"x": 0}) + ds2 = xr.Dataset({"x": ("y", [0, 0])}) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = ds1.merge(ds2) + + with set_options(use_new_combine_kwarg_defaults=True): + new = ds1.merge(ds2) + + assert_identical(ds2, old) + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_merge_auto_align(self): + ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) + ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) + expected = xr.Dataset( + {"a": ("x", [1, 2, np.nan]), "b": ("x", [np.nan, 3, 4])}, {"x": [0, 1, 2]} + ) + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + assert expected.identical(ds1.merge(ds2)) + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + assert expected.identical(ds2.merge(ds1)) + + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + expected.identical(ds2.merge(ds1)) diff --git a/xarray/util/deprecation_helpers.py b/xarray/util/deprecation_helpers.py index 1064082872d..ddae2cdf9f3 100644 --- a/xarray/util/deprecation_helpers.py +++ b/xarray/util/deprecation_helpers.py @@ -35,9 +35,10 @@ import warnings from collections.abc import Callable from functools import wraps -from typing import TypeVar +from typing import Any, TypeVar -from xarray.core.utils import emit_user_level_warning +from xarray.core.options import OPTIONS +from xarray.core.utils import ReprObject, emit_user_level_warning T = TypeVar("T", bound=Callable) @@ -145,3 +146,66 @@ def wrapper(*args, **kwargs): # We're quite confident we're just returning `T` from this function, so it's fine to ignore typing # within the function. return wrapper # type: ignore[return-value] + + +class CombineKwargDefault(ReprObject): + """Object that handles deprecation cycle for kwarg default values.""" + + _old: str + _new: str + _name: str + + def __init__(self, *, name: str, old: str, new: str): + self._name = name + self._old = old + self._new = new + + def __eq__(self, other: ReprObject | Any) -> bool: + # TODO: What type can other be? ArrayLike? + return ( + self._value == other._value + if isinstance(other, ReprObject) + else self._value == other + ) + + @property + def _value(self): + return self._new if OPTIONS["use_new_combine_kwarg_defaults"] else self._old + + def __hash__(self) -> int: + return hash(self._value) + + def warning_message(self, message: str, recommend_set_options: bool = True): + if recommend_set_options: + recommendation = ( + " To opt in to new defaults and get rid of these warnings now " + "use `set_options(use_new_combine_kwarg_defaults=True) or " + f"set {self._name} explicitly." + ) + else: + recommendation = ( + f" The recommendation is to set {self._name} explicitly for this case." + ) + + return ( + f"In a future version of xarray the default value for {self._name} will " + + f"change from {self._name}={self._old!r} to {self._name}={self._new!r}. " + + message + + recommendation + ) + + def error_message(self): + return ( + f" Error might be related to new default ({self._name}={self._new!r}). " + f"Previously the default was {self._name}={self._old!r}. 
" + f"The recommendation is to set {self._name} explicitly for this case." + ) + + +_DATA_VARS_DEFAULT = CombineKwargDefault(name="data_vars", old="all", new="minimal") +_COORDS_DEFAULT = CombineKwargDefault(name="coords", old="different", new="minimal") +_COMPAT_CONCAT_DEFAULT = CombineKwargDefault( + name="compat", old="equals", new="override" +) +_COMPAT_DEFAULT = CombineKwargDefault(name="compat", old="no_conflicts", new="override") +_JOIN_DEFAULT = CombineKwargDefault(name="join", old="outer", new="exact")