Merge branch 'main' into refactor-unsigned-masked-cf
dcherian authored Aug 15, 2024
2 parents aecc9fa + 3c19231 commit 01a4742
Showing 86 changed files with 601 additions and 369 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci-additional.yaml
@@ -139,15 +139,15 @@ jobs:
fail_ci_if_error: false

mypy39:
name: Mypy 3.9
name: Mypy 3.10
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
defaults:
run:
shell: bash -l {0}
env:
CONDA_ENV_FILE: ci/requirements/environment.yml
PYTHON_VERSION: "3.9"
PYTHON_VERSION: "3.10"

steps:
- uses: actions/checkout@v4
@@ -254,7 +254,7 @@ jobs:
fail_ci_if_error: false

pyright39:
name: Pyright 3.9
name: Pyright 3.10
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
if: |
@@ -267,7 +267,7 @@ jobs:
shell: bash -l {0}
env:
CONDA_ENV_FILE: ci/requirements/environment.yml
PYTHON_VERSION: "3.9"
PYTHON_VERSION: "3.10"

steps:
- uses: actions/checkout@v4
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
@@ -47,15 +47,15 @@ jobs:
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
# Bookend python versions
python-version: ["3.9", "3.12"]
python-version: ["3.10", "3.12"]
env: [""]
include:
# Minimum python version:
- env: "bare-minimum"
python-version: "3.9"
python-version: "3.10"
os: ubuntu-latest
- env: "min-all-deps"
python-version: "3.9"
python-version: "3.10"
os: ubuntu-latest
# Latest python version:
- env: "all-but-dask"
2 changes: 1 addition & 1 deletion .github/workflows/hypothesis.yaml
@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
if: |
github.repository == 'pydata/xarray'
&& (github.event_name == 'push' || github.event_name == 'pull_request')
&& (github.event_name == 'push' || github.event_name == 'pull_request' || github.event_name == 'schedule')
outputs:
triggered: ${{ steps.detect-trigger.outputs.trigger-found }}
steps:
2 changes: 1 addition & 1 deletion ci/requirements/bare-minimum.yml
@@ -3,7 +3,7 @@ channels:
- conda-forge
- nodefaults
dependencies:
- python=3.9
- python=3.10
- coveralls
- pip
- pytest
2 changes: 1 addition & 1 deletion ci/requirements/min-all-deps.yml
@@ -7,7 +7,7 @@ dependencies:
# Run ci/min_deps_check.py to verify that this file respects the policy.
# When upgrading python, numpy, or pandas, must also change
# doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py.
- python=3.9
- python=3.10
- array-api-strict=1.0 # dependency for testing the array api compat
- boto3=1.26
- bottleneck=1.3
2 changes: 1 addition & 1 deletion doc/getting-started-guide/installing.rst
@@ -6,7 +6,7 @@ Installation
Required dependencies
---------------------

- Python (3.9 or later)
- Python (3.10 or later)
- `numpy <https://www.numpy.org/>`__ (1.23 or later)
- `packaging <https://packaging.pypa.io/en/latest/#>`__ (23.1 or later)
- `pandas <https://pandas.pydata.org/>`__ (2.0 or later)
2 changes: 1 addition & 1 deletion doc/user-guide/computation.rst
@@ -482,7 +482,7 @@ every 2 points along ``x`` dimension,
da.coarsen(time=7, x=2).mean()
:py:meth:`~xarray.DataArray.coarsen` raises an ``ValueError`` if the data
:py:meth:`~xarray.DataArray.coarsen` raises a ``ValueError`` if the data
length is not a multiple of the corresponding window size.
You can choose ``boundary='trim'`` or ``boundary='pad'`` options for trimming
the excess entries or padding ``nan`` to insufficient entries,
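
The ``coarsen`` change above touches the sentence describing window-size mismatches. As a minimal sketch of the documented behaviour (variable names are illustrative):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(10.0), dims="x")

    # 10 is not a multiple of 3, so the default boundary="exact" raises ValueError:
    # da.coarsen(x=3).mean()

    # "trim" drops the leftover entries; "pad" fills the last window with NaN instead.
    trimmed = da.coarsen(x=3, boundary="trim").mean()  # 3 windows
    padded = da.coarsen(x=3, boundary="pad").mean()    # 4 windows, NaN-padded tail
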
2 changes: 1 addition & 1 deletion doc/user-guide/groupby.rst
@@ -276,7 +276,7 @@ is identical to
ds.groupby(x=UniqueGrouper())
; and
and

.. code-block:: python
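
For context on the grouper syntax in the lines above, a short sketch, assuming ``UniqueGrouper`` is importable from ``xarray.groupers`` as in recent releases:

    import xarray as xr
    from xarray.groupers import UniqueGrouper

    ds = xr.Dataset({"v": ("x", [1, 2, 3, 4])}, coords={"x": ["a", "a", "b", "b"]})

    # Grouping by a coordinate name and passing an explicit UniqueGrouper are equivalent:
    by_name = ds.groupby("x").mean()
    by_grouper = ds.groupby(x=UniqueGrouper()).mean()
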
5 changes: 5 additions & 0 deletions doc/whats-new.rst
@@ -26,6 +26,7 @@ New Features

Breaking changes
~~~~~~~~~~~~~~~~
- Support for ``python 3.9`` has been dropped (:pull:`8937`)


Deprecations
@@ -35,6 +36,8 @@ Deprecations
Bug fixes
~~~~~~~~~

- Fix bug with rechunking to a frequency when some periods contain no data (:issue:`9360`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Fix bug causing `DataTree.from_dict` to be sensitive to insertion order (:issue:`9276`, :pull:`9292`).
By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix resampling error with monthly, quarterly, or yearly frequencies with
@@ -87,6 +90,8 @@ New Features
to return an object without ``attrs``. A ``deep`` parameter controls whether
variables' ``attrs`` are also dropped.
By `Maximilian Roos <https://github.com/max-sixty>`_. (:pull:`8288`)
By `Eni Awowale <https://github.com/eni-awowale>`_.
- Add `open_groups` method for unaligned datasets (:issue:`9137`, :pull:`9243`)

Breaking changes
~~~~~~~~~~~~~~~~
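
One bug-fix entry above concerns rechunking to a frequency when some periods contain no data. A rough usage sketch of that feature, assuming the ``TimeResampler`` grouper from ``xarray.groupers`` and a dask-backed dataset (import path and names are assumptions, not part of this commit):

    import numpy as np
    import pandas as pd
    import xarray as xr
    from xarray.groupers import TimeResampler  # assumed import path

    time = pd.date_range("2001-01-01", periods=365, freq="D")
    ds = xr.Dataset({"v": ("time", np.random.rand(365))}, coords={"time": time})

    # Chunk so that each chunk spans one calendar month (requires dask);
    # the fix above covers the case where some periods contain no data.
    chunked = ds.chunk(time=TimeResampler("MS"))
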
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -9,7 +9,6 @@ classifiers = [
"Intended Audience :: Science/Research",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
@@ -20,7 +19,7 @@ dynamic = ["version"]
license = {text = "Apache-2.0"}
name = "xarray"
readme = "README.md"
requires-python = ">=3.9"
requires-python = ">=3.10"

dependencies = [
"numpy>=1.23",
@@ -242,7 +241,7 @@ extend-exclude = [
"doc",
"_typed_ops.pyi",
]
target-version = "py39"
target-version = "py310"

[tool.ruff.lint]
# E402: module level import not at top of file
@@ -255,6 +254,7 @@ ignore = [
"E402",
"E501",
"E731",
"UP007"
]
select = [
"F", # Pyflakes
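
Raising ``requires-python`` to 3.10 and ruff's ``target-version`` to ``py310`` is what lets the ``isinstance`` calls in the backend hunks below switch from tuples to PEP 604 unions. A quick illustration of the equivalent runtime behaviour on Python 3.10+ (names are illustrative):

    import os

    # isinstance(path, (str, os.PathLike)) can be written with a union type:
    def is_pathlike(path: object) -> bool:
        return isinstance(path, str | os.PathLike)

    print(is_pathlike("data.nc"), is_pathlike(42))  # True False
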
53 changes: 48 additions & 5 deletions xarray/backends/api.py
@@ -1,14 +1,20 @@
from __future__ import annotations

import os
from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence
from collections.abc import (
Callable,
Hashable,
Iterable,
Mapping,
MutableMapping,
Sequence,
)
from functools import partial
from io import BytesIO
from numbers import Number
from typing import (
TYPE_CHECKING,
Any,
Callable,
Final,
Literal,
Union,
@@ -358,7 +364,7 @@ def _dataset_from_backend_dataset(
from_array_kwargs,
**extra_tokens,
):
if not isinstance(chunks, (int, dict)) and chunks not in {None, "auto"}:
if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
raise ValueError(
f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
)
@@ -385,7 +391,7 @@ def _dataset_from_backend_dataset(
if "source" not in ds.encoding:
path = getattr(filename_or_obj, "path", filename_or_obj)

if isinstance(path, (str, os.PathLike)):
if isinstance(path, str | os.PathLike):
ds.encoding["source"] = _normalize_path(path)

return ds
@@ -837,6 +843,43 @@ def open_datatree(
return backend.open_datatree(filename_or_obj, **kwargs)


def open_groups(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
engine: T_Engine = None,
**kwargs,
) -> dict[str, Dataset]:
"""
Open and decode a file or file-like object, creating a dictionary containing one xarray Dataset for each group in the file.
Useful for an HDF file ("netcdf4" or "h5netcdf") containing many groups that are not alignable with their parents
and cannot be opened directly with ``open_datatree``. It is encouraged to use this function to inspect your data,
then make the necessary changes to make the structure coercible to a `DataTree` object before calling `DataTree.from_dict()` and proceeding with your analysis.
Parameters
----------
filename_or_obj : str, Path, file-like, or DataStore
Strings and Path objects are interpreted as a path to a netCDF file.
engine : str, optional
Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf"}`.
**kwargs : dict
Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
Returns
-------
dict[str, xarray.Dataset]
See Also
--------
open_datatree()
DataTree.from_dict()
"""
if engine is None:
engine = plugins.guess_engine(filename_or_obj)

backend = plugins.get_backend(engine)

return backend.open_groups_as_dict(filename_or_obj, **kwargs)


def open_mfdataset(
paths: str | NestedSequence[str | os.PathLike],
chunks: T_Chunks | None = None,
@@ -1042,7 +1085,7 @@ def open_mfdataset(
raise OSError("no files to open")

if combine == "nested":
if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
if isinstance(concat_dim, str | DataArray) or concat_dim is None:
concat_dim = [concat_dim] # type: ignore[assignment]

# This creates a flat list which is easier to iterate over, whilst
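
To make the new ``open_groups`` entry point concrete, a short usage sketch; the file name is hypothetical, and the imports follow the module paths used in this diff:

    from xarray.backends.api import open_groups
    from xarray.core.datatree import DataTree

    # Hypothetical netCDF/HDF5 file whose groups do not align with their parents.
    groups = open_groups("observations.nc", engine="h5netcdf")  # dict[str, Dataset]

    # Inspect each group before deciding how to reconcile dimensions and coordinates.
    for path, ds in groups.items():
        print(path, list(ds.data_vars))

    # Once the groups are made consistent, assemble them into a tree.
    tree = DataTree.from_dict(groups)
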
17 changes: 17 additions & 0 deletions xarray/backends/common.py
@@ -132,6 +132,7 @@ def _iter_nc_groups(root, parent="/"):
from xarray.core.treenode import NodePath

parent = NodePath(parent)
yield str(parent)
for path, group in root.groups.items():
gpath = parent / path
yield str(gpath)
@@ -535,6 +536,22 @@ def open_datatree(

raise NotImplementedError()

def open_groups_as_dict(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
**kwargs: Any,
) -> dict[str, Dataset]:
"""
Opens a dictionary mapping from group names to Datasets.
Called by :py:func:`~xarray.open_groups`.
This function exists to provide a universal way to open all groups in a file,
before applying any additional consistency checks or requirements necessary
to create a `DataTree` object (typically done using :py:meth:`~xarray.DataTree.from_dict`).
"""

raise NotImplementedError()


# mapping of engine name to (module name, BackendEntrypoint Class)
BACKEND_ENTRYPOINTS: dict[str, tuple[str | None, type[BackendEntrypoint]]] = {}
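
The new ``open_groups_as_dict`` hook above is the method third-party backends would override. A minimal, hypothetical sketch of a backend providing it (not part of this commit):

    from xarray import Dataset
    from xarray.backends.common import BackendEntrypoint


    class SingleGroupBackend(BackendEntrypoint):
        """Hypothetical backend whose files only ever contain a root group."""

        def open_dataset(self, filename_or_obj, *, drop_variables=None, **kwargs) -> Dataset:
            # A real backend would decode filename_or_obj here.
            return Dataset()

        def open_groups_as_dict(self, filename_or_obj, **kwargs) -> dict[str, Dataset]:
            # Keys are group paths; "/" is the root, matching the extra
            # `yield str(parent)` added to _iter_nc_groups above.
            return {"/": self.open_dataset(filename_or_obj, **kwargs)}
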
54 changes: 39 additions & 15 deletions xarray/backends/h5netcdf_.py
@@ -109,7 +109,7 @@ class H5NetCDFStore(WritableCFDataStore):
def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False):
import h5netcdf

if isinstance(manager, (h5netcdf.File, h5netcdf.Group)):
if isinstance(manager, h5netcdf.File | h5netcdf.Group):
if group is None:
root, group = find_root_and_group(manager)
else:
@@ -374,7 +374,7 @@ def guess_can_open(
if magic_number is not None:
return magic_number.startswith(b"\211HDF\r\n\032\n")

if isinstance(filename_or_obj, (str, os.PathLike)):
if isinstance(filename_or_obj, str | os.PathLike):
_, ext = os.path.splitext(filename_or_obj)
return ext in {".nc", ".nc4", ".cdf"}

@@ -448,9 +448,36 @@ def open_datatree(
driver_kwds=None,
**kwargs,
) -> DataTree:
from xarray.backends.api import open_dataset
from xarray.backends.common import _iter_nc_groups

from xarray.core.datatree import DataTree

groups_dict = self.open_groups_as_dict(filename_or_obj, **kwargs)

return DataTree.from_dict(groups_dict)

def open_groups_as_dict(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
mask_and_scale=True,
decode_times=True,
concat_characters=True,
decode_coords=True,
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
group: str | Iterable[str] | Callable | None = None,
lock=None,
invalid_netcdf=None,
phony_dims=None,
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
**kwargs,
) -> dict[str, Dataset]:

from xarray.backends.common import _iter_nc_groups
from xarray.core.treenode import NodePath
from xarray.core.utils import close_on_error

@@ -466,19 +493,19 @@ def open_datatree(
driver=driver,
driver_kwds=driver_kwds,
)
# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
else:
parent = NodePath("/")

manager = store._manager
ds = open_dataset(store, **kwargs)
tree_root = DataTree.from_dict({str(parent): ds})
groups_dict = {}
for path_group in _iter_nc_groups(store.ds, parent=parent):
group_store = H5NetCDFStore(manager, group=path_group, **kwargs)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(group_store):
ds = store_entrypoint.open_dataset(
group_ds = store_entrypoint.open_dataset(
group_store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
@@ -488,14 +515,11 @@ def open_datatree(
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)
new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
tree_root._set_item(
path_group,
new_node,
allow_overwrite=False,
new_nodes_along_path=True,
)
return tree_root

group_name = str(NodePath(path_group))
groups_dict[group_name] = group_ds

return groups_dict


BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint)
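
A note on the path handling used in the hunks above: ``NodePath`` behaves like ``pathlib.PurePosixPath``, so group paths join and stringify as sketched here (a sketch, assuming that behaviour):

    from xarray.core.treenode import NodePath

    # How the backend builds the parent path when a `group` argument is given:
    parent = NodePath("/") / NodePath("observations")
    print(str(parent))                       # "/observations"

    # And how each discovered group becomes a key in groups_dict:
    print(str(NodePath(parent / "daily")))   # "/observations/daily"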