Set up new default values for kwargs on open_mfdataset #10051

Closed · wants to merge 3 commits

Changes from all commits
167 changes: 153 additions & 14 deletions xarray/backends/api.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import os
import warnings
from collections.abc import (
Callable,
Hashable,
@@ -44,6 +45,7 @@
from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
from xarray.core.datatree import DataTree
from xarray.core.indexes import Index
from xarray.core.options import OPTIONS
from xarray.core.treenode import group_subtrees
from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
from xarray.core.utils import is_remote_uri
@@ -147,6 +149,124 @@ def _get_default_engine(path: str, allow_remote: bool = False) -> T_NetcdfEngine
return _get_default_engine_netcdf()


def _get_default_open_mfdataset_kwargs(
datasets: list[Dataset],
*,
concat_dim: str | None = None,
**kwargs,
):
"""This is a short-term helper used for changing the defaults.

This function supports a deprecation cycle that tries to throw
warnings if the outcome of `open_mfdataset` would be different with
the new default kwargs compared to the old ones.
"""
input_kwargs = {k: v for k, v in kwargs.items() if v is not None}

old_defaults = dict(
data_vars="all", coords="different", compat="no_conflicts", join="outer"
)
new_defaults = dict(
data_vars="minimal", coords="minimal", compat="override", join="exact"
)

# if user has opted in to new defaults, do not try to provide proactive warnings
if OPTIONS["use_new_open_mfdataset_kwargs"]:
return {**new_defaults, **input_kwargs}

if "data_vars" not in input_kwargs:
# To figure out if any concat_dims overlap we follow these steps:
#
# 1) determine the concat_dims if undefined
# 2) see if any of the datasets overlap with each other along all
# concat_dims.
if isinstance(concat_dim, str | DataArray) or concat_dim is None:
concat_dims = [concat_dim]
else:
concat_dims = concat_dim

if len(concat_dims) == 1 and concat_dims[0] is None:
concat_dims = []
            # All datasets have the same variables because they've been grouped as such
ds0 = datasets[0]
for dim in ds0.dims:
# Check if dim is a coordinate dimension
if dim in ds0:
# Need to read coordinate values to do ordering
indexes = [
ds._indexes[dim] for ds in datasets if dim in ds._indexes
]
# TODO (benbovy, flexible indexes): support flexible indexes?
indexes = [index.to_pandas_index() for index in indexes]

                    # If the dimension coordinate values are the same on every
                    # dataset then we should leave this dimension alone (it's
                    # just a "bystander")
if not all(index.equals(indexes[0]) for index in indexes[1:]):
concat_dims.append(dim)

concat_dim_ranges = {}
for ds in datasets:
overlaps_at_index = []
for concat_dim in concat_dims:
                # Using a DataArray as concat_dim always results in different behavior
                if isinstance(concat_dim, DataArray):
                    warnings.warn(
                        "In a future version of xarray the default value for data_vars will change from "
                        "data_vars='all' to data_vars='minimal'. This is likely to lead to "
                        "different results when using a `DataArray` as the concat_dim. "
                        "To opt in to the new defaults and silence these warnings now, "
                        "use `set_options(use_new_open_mfdataset_kwargs=True)` or set data_vars explicitly.",
                        category=FutureWarning,
                        stacklevel=2,
                    )
break
# dimensions without coordinate arrays are fine
if concat_dim is None or concat_dim not in ds:
continue
concat_dim_min = ds[concat_dim].min()
concat_dim_max = ds[concat_dim].max()
                for i, (prev_min, prev_max) in enumerate(
                    concat_dim_ranges.get(concat_dim, [])
                ):
                    if concat_dim_min <= prev_max and concat_dim_max >= prev_min:
overlaps_at_index.append(i)
concat_dim_ranges[concat_dim] = [
*concat_dim_ranges.get(concat_dim, []),
(concat_dim_min, concat_dim_max),
]
# check if this dataset overlaps with one other dataset on all concat_dims
if (
len(overlaps_at_index) == len(concat_dims)
and len(set(overlaps_at_index)) == 1
):
                warnings.warn(
                    "In a future version of xarray the default value for data_vars will change from "
                    "data_vars='all' to data_vars='minimal'. This is likely to lead to "
                    "different results when multiple datasets have overlapping concat_dim values. "
                    "To opt in to the new defaults and silence these warnings now, "
                    "use `set_options(use_new_open_mfdataset_kwargs=True)` or set data_vars explicitly.",
                    category=FutureWarning,
                    stacklevel=2,
                )
break

if "compat" not in input_kwargs and input_kwargs.get("data_vars") == "different":
# make sure people set compat explicitly for this case
warnings.warn(
"In a future version of xarray default value for compat will change from "
"compat='no_conflicts' to compat='override'. This is likely to lead to "
"failures when used with data_vars='different'. The recommendation is to "
"set compat='no_conflicts' explicitly for this case.",
category=FutureWarning,
stacklevel=2,
)

if "join" not in input_kwargs:
        # TODO: Figure out what conditions make xarray/tests/test_backends.py::TestDask::test_encoding_mfdataset
Contributor Author:
I am having a hard time figuring out how to catch the conditions that make xarray/tests/test_backends.py::TestDask::test_encoding_mfdataset fail with the new join kwarg. It seems to have to do with the concat_dim in each dataset having a different encoding, even though the values of the concat_dim in each dataset do not overlap. So I'm wondering whether the align index check is too strict.
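A minimal sketch of that situation (the values and encodings here are hypothetical, not taken from the test itself):

```python
import numpy as np
import xarray as xr

# Two datasets whose concat_dim ("time") values are disjoint...
ds1 = xr.Dataset({"v": ("time", np.zeros(3))}, coords={"time": [0, 1, 2]})
ds2 = xr.Dataset({"v": ("time", np.ones(3))}, coords={"time": [3, 4, 5]})
# ...but whose coordinate encodings differ, as after reading files that
# were written with different "units" attributes.
ds1.time.encoding["units"] = "days since 2000-01-01"
ds2.time.encoding["units"] = "hours since 2000-01-01"

# Plain concatenation along "time" succeeds; the open question above is
# why the stricter join="exact" default makes the encoding test fail even
# though the indexes are disjoint rather than conflicting.
combined = xr.concat([ds1, ds2], dim="time")
```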

        # fail, or whether it should be failing at all
pass

return {**old_defaults, **input_kwargs}
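To make the precedence rule on the last line concrete, here is a hypothetical call (the toy datasets are made up, and the helper is private API, so calling it directly like this is only for illustration):

```python
import numpy as np
import xarray as xr
from xarray.backends.api import _get_default_open_mfdataset_kwargs  # private helper

# Two already-open datasets with disjoint "time" values, so no overlap
# warning fires under the old defaults.
datasets = [
    xr.Dataset({"v": ("time", np.zeros(2))}, coords={"time": [0, 1]}),
    xr.Dataset({"v": ("time", np.zeros(2))}, coords={"time": [2, 3]}),
]
kwargs = _get_default_open_mfdataset_kwargs(
    datasets,
    concat_dim="time",
    data_vars=None,    # None -> falls back to the old default "all"
    coords="minimal",  # explicit (non-None) user value wins
    compat=None,       # None -> falls back to "no_conflicts"
    join=None,         # None -> falls back to "outer"
)
# kwargs == {"data_vars": "all", "coords": "minimal",
#            "compat": "no_conflicts", "join": "outer"}
```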


def _validate_dataset_names(dataset: Dataset) -> None:
"""DataArray.name and Dataset keys must be a string or None"""

@@ -1402,14 +1522,14 @@ def open_mfdataset(
| Sequence[Index]
| None
) = None,
    compat: CompatOptions | None = None,
preprocess: Callable[[Dataset], Dataset] | None = None,
engine: T_Engine | None = None,
    data_vars: Literal["all", "minimal", "different"] | list[str] | None = None,
    coords: Literal["all", "minimal", "different"] | None = None,
combine: Literal["by_coords", "nested"] = "by_coords",
parallel: bool = False,
    join: JoinOptions | None = None,
attrs_file: str | os.PathLike | None = None,
combine_attrs: CombineAttrsOptions = "override",
**kwargs,
@@ -1642,38 +1762,57 @@

# Combine all datasets, closing them in case of a ValueError
try:
        # for any data_vars, coords, compat, or join that is None, set to the
        # default - with a warning if appropriate
kwargs_with_defaults = _get_default_open_mfdataset_kwargs(
datasets,
concat_dim=concat_dim,
data_vars=data_vars,
coords=coords,
compat=compat,
join=join,
)
if combine == "nested":
# Combined nested list by successive concat and merge operations
# along each dimension, using structure given by "ids"
combined = _nested_combine(
datasets,
concat_dims=concat_dim,
                **kwargs_with_defaults,
                ids=ids,
combine_attrs=combine_attrs,
)
elif combine == "by_coords":
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_by_coords(
datasets,
                **kwargs_with_defaults,
combine_attrs=combine_attrs,
)
else:
raise ValueError(
f"{combine} is an invalid option for the keyword argument ``combine``"
)
    except ValueError as e:
for ds in datasets:
ds.close()
# When using new defaults, add a warning if the failure appears to be related
# to the new kwarg values.
for kwarg, new, old in (
            (data_vars, "data_vars='minimal'", "data_vars='all'"),
(coords, "coords='minimal'", "coords='different'"),
(compat, "compat='override'", "compat='no_conflicts'"),
(join, "join='exact'", "join='outer'"),
):
if kwarg is None and new in str(e):
warnings.warn(
f"Failure might be related to new default ({new}) "
f"in `open_mfdataset`. Previously the default was {old}",
category=UserWarning,
stacklevel=2,
)
Contributor Author:
These warnings are meant to provide extra context on failures when you set use_new_open_mfdataset_kwargs=True, and for a while after xarray switches to the new defaults.
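For context, opting in looks like this (a minimal sketch; the file pattern is hypothetical):

```python
import xarray as xr

# Opt in to the new defaults globally; a ValueError raised inside
# open_mfdataset will then be preceded by a UserWarning naming the
# changed default that likely caused it.
xr.set_options(use_new_open_mfdataset_kwargs=True)
combined = xr.open_mfdataset("data/*.nc")  # hypothetical file pattern
```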

raise e

combined.set_close(partial(_multi_file_closer, closers))

6 changes: 6 additions & 0 deletions xarray/core/options.py
@@ -29,6 +29,7 @@
"keep_attrs",
"warn_for_unclosed_files",
"use_bottleneck",
"use_new_open_mfdataset_kwargs",
"use_numbagg",
"use_opt_einsum",
"use_flox",
@@ -57,6 +58,7 @@ class T_Options(TypedDict):
warn_for_unclosed_files: bool
use_bottleneck: bool
use_flox: bool
use_new_open_mfdataset_kwargs: bool
use_numbagg: bool
use_opt_einsum: bool

@@ -84,6 +86,7 @@ class T_Options(TypedDict):
"warn_for_unclosed_files": False,
"use_bottleneck": True,
"use_flox": True,
"use_new_open_mfdataset_kwargs": False,
"use_numbagg": True,
"use_opt_einsum": True,
}
@@ -113,6 +116,7 @@ def _positive_integer(value: Any) -> bool:
"file_cache_maxsize": _positive_integer,
"keep_attrs": lambda choice: choice in [True, False, "default"],
"use_bottleneck": lambda value: isinstance(value, bool),
"use_new_open_mfdataset_kwargs": lambda value: isinstance(value, bool),
"use_numbagg": lambda value: isinstance(value, bool),
"use_opt_einsum": lambda value: isinstance(value, bool),
"use_flox": lambda value: isinstance(value, bool),
@@ -250,6 +254,8 @@ class set_options:
use_flox : bool, default: True
        Whether to use ``numpy_groupies`` and ``flox`` to
        accelerate groupby and resampling reductions.
    use_new_open_mfdataset_kwargs : bool, default: False
        Whether to use the new default kwarg values for ``open_mfdataset``.
use_numbagg : bool, default: True
Whether to use ``numbagg`` to accelerate reductions.
Takes precedence over ``use_bottleneck`` when both are True.