From aa880014e808cd4a6689ef4a6c17c20c84c6e7d4 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 14 Feb 2025 10:04:51 -0500 Subject: [PATCH 1/3] Set up new default values for kwargs on `open_mfdataset` --- xarray/backends/api.py | 140 ++++++++++++++++++++++++++++++++++++----- xarray/core/options.py | 6 ++ 2 files changed, 132 insertions(+), 14 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 950a5d16273..7404f7a3cb7 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -44,6 +44,7 @@ from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk from xarray.core.datatree import DataTree from xarray.core.indexes import Index +from xarray.core.options import OPTIONS from xarray.core.treenode import group_subtrees from xarray.core.types import NetcdfWriteModes, ZarrWriteModes from xarray.core.utils import is_remote_uri @@ -147,6 +148,100 @@ def _get_default_engine(path: str, allow_remote: bool = False) -> T_NetcdfEngine return _get_default_engine_netcdf() +def _get_default_open_mfdataset_kwargs( + datasets: list[Dataset], + *, + concat_dim: str | None = None, + **kwargs, +): + """This is a short-term helper used for changing the defaults. + + with + a deprecation cycle and warnings if behavior changes + """ + input_kwargs = {k: v for k, v in kwargs.items() if v is not None} + + old_defaults = dict( + data_vars="all", coords="different", compat="no_conflicts", join="outer" + ) + new_defaults = dict( + data_vars="minimal", coords="minimal", compat="override", join="exact" + ) + + # if user has opted in to new defaults, do not try to provide proactive warnings + if OPTIONS["use_new_open_mfdataset_kwargs"]: + return {**new_defaults, **input_kwargs} + + if "data_vars" not in input_kwargs: + # To figure out if any concat_dims overlap we follow these steps: + # + # 1) determine the concat_dims if undefined + # 2) see if any of the datasets overlap with each other along all + # concat_dims. 
+ if isinstance(concat_dim, str | DataArray) or concat_dim is None: + concat_dims = [concat_dim] + else: + concat_dims = concat_dim + + if len(concat_dims) == 1 and concat_dims[0] is None: + concat_dims = [] + # All datasets have same variables because they've been grouped as such + ds0 = datasets[0] + for dim in ds0.dims: + # Check if dim is a coordinate dimension + if dim in ds0: + # Need to read coordinate values to do ordering + indexes = [ + ds._indexes[dim] for ds in datasets if dim in ds._indexes + ] + # TODO (benbovy, flexible indexes): support flexible indexes? + indexes = [index.to_pandas_index() for index in indexes] + + # If dimension coordinate values are same on every dataset then + # should be leaving this dimension alone (it's just a "bystander") + if not all(index.equals(indexes[0]) for index in indexes[1:]): + concat_dims.append(dim) + + concat_dim_ranges = {} + for ds in datasets: + overlaps_at_index = [] + for concat_dim in concat_dims: + # Using a DataArray as concat_dim will always result in a different behavior + if isinstance(concat_dim, DataArray): + raise ValueError("Warn about new default for data_vars") + # dimensions without coordinate arrays are fine + if concat_dim is None or concat_dim not in ds: + continue + concat_dim_min = ds[concat_dim].min() + concat_dim_max = ds[concat_dim].max() + for i, (min, max) in enumerate(concat_dim_ranges.get(concat_dim, [])): + if concat_dim_min <= max and concat_dim_max >= min: + overlaps_at_index.append(i) + concat_dim_ranges[concat_dim] = [ + *concat_dim_ranges.get(concat_dim, []), + (concat_dim_min, concat_dim_max), + ] + # check if this dataset overlaps with one other dataset on all concat_dims + if ( + len(overlaps_at_index) == len(concat_dims) + and len(set(overlaps_at_index)) == 1 + ): + raise ValueError("Warn about new default for data_vars") + + if "compat" not in input_kwargs and input_kwargs.get("data_vars") == "different": + # make sure people set compat explicitly for this case + raise 
ValueError( + "Warn about new default for compat - given specified data_vars value" + ) + + if "join" not in input_kwargs: + # TODO: Figure out how conditions that xarray/tests/test_backends.py::TestDask::test_encoding_mfdataset + # is failing on OR if it should be failing at all + pass + + return {**old_defaults, **input_kwargs} + + def _validate_dataset_names(dataset: Dataset) -> None: """DataArray.name and Dataset keys must be a string or None""" @@ -1402,14 +1497,14 @@ def open_mfdataset( | Sequence[Index] | None ) = None, - compat: CompatOptions = "no_conflicts", + compat: CompatOptions | None = None, preprocess: Callable[[Dataset], Dataset] | None = None, engine: T_Engine | None = None, - data_vars: Literal["all", "minimal", "different"] | list[str] = "all", - coords="different", + data_vars: Literal["all", "minimal", "different"] | list[str] | None = None, + coords: Literal["all", "minimal", "different"] | None = None, combine: Literal["by_coords", "nested"] = "by_coords", parallel: bool = False, - join: JoinOptions = "outer", + join: JoinOptions | None = None, attrs_file: str | os.PathLike | None = None, combine_attrs: CombineAttrsOptions = "override", **kwargs, @@ -1642,17 +1737,24 @@ def open_mfdataset( # Combine all datasets, closing them in case of a ValueError try: + # for any data_vars, coords, compat, or join that is None, set to + # default - with warning if appropriate + kwargs_with_defaults = _get_default_open_mfdataset_kwargs( + datasets, + concat_dim=concat_dim, + data_vars=data_vars, + coords=coords, + compat=compat, + join=join, + ) if combine == "nested": # Combined nested list by successive concat and merge operations # along each dimension, using structure given by "ids" combined = _nested_combine( datasets, concat_dims=concat_dim, - compat=compat, - data_vars=data_vars, - coords=coords, + **kwargs_with_defaults, ids=ids, - join=join, combine_attrs=combine_attrs, ) elif combine == "by_coords": @@ -1660,20 +1762,30 @@ def open_mfdataset( # 
previously combined = combine_by_coords( datasets, - compat=compat, - data_vars=data_vars, - coords=coords, - join=join, + **kwargs_with_defaults, combine_attrs=combine_attrs, ) else: raise ValueError( f"{combine} is an invalid option for the keyword argument ``combine``" ) - except ValueError: + except ValueError as e: for ds in datasets: ds.close() - raise + # When using new defaults, add a warning if the failure appears to be related + # to the new kwarg values. + for kwarg, new, old in ( + (data_vars, "data_vars='minimal'", "data_vars='all'"), + (coords, "coords='minimal'", "coords='different'"), + (compat, "compat='override'", "compat='no_conflicts'"), + (join, "join='exact'", "join='outer'"), + ): + if kwarg is None and new in str(e): + raise ValueError( + f"Failure might be related to using new default ({new}) " + f"in `open_mfdataset`. Previously the default was {old}" + ) from e + raise e combined.set_close(partial(_multi_file_closer, closers)) diff --git a/xarray/core/options.py b/xarray/core/options.py index 2d69e4b6584..01331f9f9df 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -29,6 +29,7 @@ "keep_attrs", "warn_for_unclosed_files", "use_bottleneck", + "use_new_open_mfdataset_kwargs", "use_numbagg", "use_opt_einsum", "use_flox", @@ -57,6 +58,7 @@ class T_Options(TypedDict): warn_for_unclosed_files: bool use_bottleneck: bool use_flox: bool + use_new_open_mfdataset_kwargs: bool use_numbagg: bool use_opt_einsum: bool @@ -84,6 +86,7 @@ class T_Options(TypedDict): "warn_for_unclosed_files": False, "use_bottleneck": True, "use_flox": True, + "use_new_open_mfdataset_kwargs": False, "use_numbagg": True, "use_opt_einsum": True, } @@ -113,6 +116,7 @@ def _positive_integer(value: Any) -> bool: "file_cache_maxsize": _positive_integer, "keep_attrs": lambda choice: choice in [True, False, "default"], "use_bottleneck": lambda value: isinstance(value, bool), + "use_new_open_mfdataset_kwargs": lambda value: isinstance(value, bool), "use_numbagg": 
lambda value: isinstance(value, bool), "use_opt_einsum": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), @@ -250,6 +254,8 @@ class set_options: use_flox : bool, default: True Whether to use ``numpy_groupies`` and `flox`` to accelerate groupby and resampling reductions. + use_new_open_mfdataset_kwargs : bool, default: False + Whether to use new default kwarg values for open_mfdataset. use_numbagg : bool, default: True Whether to use ``numbagg`` to accelerate reductions. Takes precedence over ``use_bottleneck`` when both are True. From a02a61c7f637ae1bffa6b6e22fbd41e3fb5456d3 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 14 Feb 2025 11:42:49 -0500 Subject: [PATCH 2/3] Switch to warnings --- xarray/backends/api.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7404f7a3cb7..1378fabfcbc 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import warnings from collections.abc import ( Callable, Hashable, @@ -208,7 +209,16 @@ def _get_default_open_mfdataset_kwargs( for concat_dim in concat_dims: # Using a DataArray as concat_dim will always result in a different behavior if isinstance(concat_dim, DataArray): - raise ValueError("Warn about new default for data_vars") + warnings.warn( + "In a future version of xarray default value for data_vars will change from " + "data_vars='all' to data_vars='minimal'. This is likely to lead to " + "different results when using a `DataArray` as the concat_dim. 
" + "To opt in to new defaults and get rid of these warnings now " + "use `set_option(use_new_open_mfdataset_kwargs=True) or set data_vars explicitly.", + category=FutureWarning, + stacklevel=2, + ) + break # dimensions without coordinate arrays are fine if concat_dim is None or concat_dim not in ds: continue @@ -226,12 +236,26 @@ def _get_default_open_mfdataset_kwargs( len(overlaps_at_index) == len(concat_dims) and len(set(overlaps_at_index)) == 1 ): - raise ValueError("Warn about new default for data_vars") + warnings.warn( + "In a future version of xarray default value for data_vars will change from " + "data_vars='all' to data_vars='minimal'. This is likely to lead to " + "different results when multiple datasets have overlapping concat_dim values. " + "To opt in to new defaults and get rid of these warnings now " + "use `set_option(use_new_open_mfdataset_kwargs=True) or set data_vars explicitly.", + category=FutureWarning, + stacklevel=2, + ) + break if "compat" not in input_kwargs and input_kwargs.get("data_vars") == "different": # make sure people set compat explicitly for this case - raise ValueError( - "Warn about new default for compat - given specified data_vars value" + warnings.warn( + "In a future version of xarray default value for compat will change from " + "compat='no_conflicts' to compat='override'. This is likely to lead to " + "failures when used with data_vars='different'. The recommendation is to " + "set compat='no_conflicts' explicitly for this case.", + category=FutureWarning, + stacklevel=2, ) if "join" not in input_kwargs: @@ -1781,10 +1805,12 @@ def open_mfdataset( (join, "join='exact'", "join='outer'"), ): if kwarg is None and new in str(e): - raise ValueError( - f"Failure might be related to using new default ({new}) " - f"in `open_mfdataset`. Previously the default was {old}" - ) from e + warnings.warn( + f"Failure might be related to new default ({new}) " + f"in `open_mfdataset`. 
Previously the default was {old}", + category=UserWarning, + stacklevel=2, + ) raise e combined.set_close(partial(_multi_file_closer, closers)) From 5af3d0a81b1a6a31a45b802ac9b2b18ca7196379 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 14 Feb 2025 11:52:49 -0500 Subject: [PATCH 3/3] Fix docstring --- xarray/backends/api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1378fabfcbc..b6b982b51f9 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -157,8 +157,9 @@ def _get_default_open_mfdataset_kwargs( ): """This is a short-term helper used for changing the defaults. - with - a deprecation cycle and warnings if behavior changes + This function supports a deprecation cycle that tries to throw + warnings if the outcome of `open_mfdataset` would be different with + the new default kwargs compared to the old ones. """ input_kwargs = {k: v for k, v in kwargs.items() if v is not None}