Commit

Merge branch 'main' into eschalk/align-datatree-to_dict-to-existing
etienneschalk committed Aug 18, 2024
2 parents e94f491 + da9e7ec commit bd44669
Showing 124 changed files with 1,641 additions and 1,588 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/benchmarks.yml
@@ -5,6 +5,9 @@ on:
types: [opened, reopened, synchronize, labeled]
workflow_dispatch:

+ env:
+   PR_HEAD_LABEL: ${{ github.event.pull_request.head.label }}
+
jobs:
benchmark:
if: ${{ contains( github.event.pull_request.labels.*.name, 'run-benchmark') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }}
@@ -49,7 +52,7 @@ jobs:
# ID this runner
asv machine --yes
echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})"
echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})"
echo "Contender: ${GITHUB_SHA} ($PR_HEAD_LABEL)"
# Run benchmarks for current commit against base
ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR"
asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \
8 changes: 4 additions & 4 deletions .github/workflows/ci-additional.yaml
@@ -139,15 +139,15 @@ jobs:
fail_ci_if_error: false

mypy39:
- name: Mypy 3.9
+ name: Mypy 3.10
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
defaults:
run:
shell: bash -l {0}
env:
CONDA_ENV_FILE: ci/requirements/environment.yml
PYTHON_VERSION: "3.9"
PYTHON_VERSION: "3.10"

steps:
- uses: actions/checkout@v4
@@ -254,7 +254,7 @@ jobs:
fail_ci_if_error: false

pyright39:
- name: Pyright 3.9
+ name: Pyright 3.10
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
if: |
@@ -267,7 +267,7 @@
shell: bash -l {0}
env:
CONDA_ENV_FILE: ci/requirements/environment.yml
PYTHON_VERSION: "3.9"
PYTHON_VERSION: "3.10"

steps:
- uses: actions/checkout@v4
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
@@ -47,15 +47,15 @@ jobs:
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
# Bookend python versions
python-version: ["3.9", "3.12"]
python-version: ["3.10", "3.12"]
env: [""]
include:
# Minimum python version:
- env: "bare-minimum"
python-version: "3.9"
python-version: "3.10"
os: ubuntu-latest
- env: "min-all-deps"
python-version: "3.9"
python-version: "3.10"
os: ubuntu-latest
# Latest python version:
- env: "all-but-dask"
2 changes: 1 addition & 1 deletion .github/workflows/hypothesis.yaml
@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
if: |
github.repository == 'pydata/xarray'
&& (github.event_name == 'push' || github.event_name == 'pull_request')
&& (github.event_name == 'push' || github.event_name == 'pull_request' || github.event_name == 'schedule')
outputs:
triggered: ${{ steps.detect-trigger.outputs.trigger-found }}
steps:
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
- id: mixed-line-ending
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: 'v0.4.7'
+ rev: 'v0.5.0'
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
@@ -30,7 +30,7 @@ repos:
additional_dependencies: ["black==24.4.2"]
- id: blackdoc-autoupdate-black
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.10.0
+ rev: v1.10.1
hooks:
- id: mypy
# Copied from setup.cfg
11 changes: 5 additions & 6 deletions HOW_TO_RELEASE.md
@@ -23,14 +23,13 @@ upstream https://github.com/pydata/xarray (push)
```sh
git fetch upstream --tags
```
- This will return a list of all the contributors since the last release:
+ Then run
```sh
- git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | perl -pe 's/\n/$1, /'
- ```
- This will return the total number of contributors:
- ```sh
- git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | wc -l
+ python ci/release_contributors.py
```
+ (needs `gitpython` and `toolz` / `cytoolz`)

and copy the output.
3. Write a release summary: ~50 words describing the high level features. This
will be used in the release emails, tweets, GitHub release notes, etc.
4. Look over whats-new.rst and the docs. Make sure "What's New" is complete
Expand Down
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/coding.py
@@ -0,0 +1,18 @@
import numpy as np

import xarray as xr

from . import parameterized


@parameterized(["calendar"], [("standard", "noleap")])
class EncodeCFDatetime:
def setup(self, calendar):
self.units = "days since 2000-01-01"
self.dtype = np.dtype("int64")
self.times = xr.date_range(
"2000", freq="D", periods=10000, calendar=calendar
).values

def time_encode_cf_datetime(self, calendar):
xr.coding.times.encode_cf_datetime(self.times, self.units, calendar, self.dtype)
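For context, here is a hedged sketch of what this new benchmark exercises. Note that ``encode_cf_datetime`` lives under ``xr.coding.times``, which is private API, so the exact signature may vary between versions:

```python
import numpy as np
import xarray as xr

# Encode four daily timestamps as integer offsets from the reference date
# in `units`; the printed values are what one would expect, not a captured run.
times = xr.date_range("2000", freq="D", periods=4, calendar="standard").values
num, units, calendar = xr.coding.times.encode_cf_datetime(
    times, "days since 2000-01-01", "standard", np.dtype("int64")
)
print(num, units, calendar)  # e.g. [0 1 2 3] days since 2000-01-01 standard
```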
49 changes: 49 additions & 0 deletions ci/release_contributors.py
@@ -0,0 +1,49 @@
import re
import textwrap

import git
from tlz.itertoolz import last, unique

co_author_re = re.compile(r"Co-authored-by: (?P<name>[^<]+?) <(?P<email>.+)>")


def main():
repo = git.Repo(".")

most_recent_release = last(repo.tags)

# extract information from commits
contributors = {}
for commit in repo.iter_commits(f"{most_recent_release.name}.."):
matches = co_author_re.findall(commit.message)
if matches:
contributors.update({email: name for name, email in matches})
contributors[commit.author.email] = commit.author.name

# deduplicate and ignore
# TODO: extract ignores from .github/release.yml
ignored = ["dependabot", "pre-commit-ci"]
unique_contributors = unique(
contributor
for contributor in contributors.values()
if contributor.removesuffix("[bot]") not in ignored
)

sorted_ = sorted(unique_contributors)
if len(sorted_) > 1:
names = f"{', '.join(sorted_[:-1])} and {sorted_[-1]}"
else:
names = "".join(sorted_)

statement = textwrap.dedent(
f"""\
Thanks to the {len(sorted_)} contributors to this release:
{names}
""".rstrip()
)

print(statement)


if __name__ == "__main__":
main()
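As a quick illustration of the ``Co-authored-by`` regex the script relies on (the commit message below is made up):

```python
import re

co_author_re = re.compile(r"Co-authored-by: (?P<name>[^<]+?) <(?P<email>.+)>")

# findall returns one (name, email) tuple per trailer in the message
msg = "Merge pull request\n\nCo-authored-by: Jane Doe <jane@example.com>"
print(co_author_re.findall(msg))  # [('Jane Doe', 'jane@example.com')]
```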
2 changes: 1 addition & 1 deletion ci/requirements/bare-minimum.yml
@@ -3,7 +3,7 @@ channels:
- conda-forge
- nodefaults
dependencies:
- - python=3.9
+ - python=3.10
- coveralls
- pip
- pytest
1 change: 1 addition & 0 deletions ci/requirements/doc.yml
@@ -8,6 +8,7 @@ dependencies:
- bottleneck
- cartopy
- cfgrib
+ - kerchunk
- dask-core>=2022.1
- dask-expr
- hypothesis>=6.75.8
2 changes: 1 addition & 1 deletion ci/requirements/environment.yml
@@ -26,7 +26,7 @@ dependencies:
- numba
- numbagg
- numexpr
- - numpy
+ - numpy>=2
- opt_einsum
- packaging
- pandas
2 changes: 1 addition & 1 deletion ci/requirements/min-all-deps.yml
@@ -7,7 +7,7 @@ dependencies:
# Run ci/min_deps_check.py to verify that this file respects the policy.
# When upgrading python, numpy, or pandas, must also change
# doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py.
- - python=3.9
+ - python=3.10
- array-api-strict=1.0 # dependency for testing the array api compat
- boto3=1.26
- bottleneck=1.3
6 changes: 3 additions & 3 deletions doc/api-hidden.rst
@@ -694,6 +694,6 @@
coding.times.CFTimedeltaCoder
coding.times.CFDatetimeCoder

- core.groupers.Grouper
- core.groupers.Resampler
- core.groupers.EncodedGroups
+ groupers.Grouper
+ groupers.Resampler
+ groupers.EncodedGroups
2 changes: 1 addition & 1 deletion doc/api.rst
@@ -806,7 +806,7 @@ DataArray
Grouper Objects
---------------

- .. currentmodule:: xarray.core
+ .. currentmodule:: xarray

.. autosummary::
:toctree: generated/
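A one-line sketch of the public import path this docs change reflects (assuming an xarray version where ``xarray.groupers`` is public, as this commit's docs imply):

```python
# Grouper objects are now importable from the public namespace
from xarray.groupers import Grouper, Resampler, UniqueGrouper, TimeResampler
```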
30 changes: 30 additions & 0 deletions doc/combined.json
@@ -0,0 +1,30 @@
{
"version": 1,
"refs": {
".zgroup": "{\"zarr_format\":2}",
"foo/.zarray": "{\"chunks\":[4,5],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":null,\"order\":\"C\",\"shape\":[4,5],\"zarr_format\":2}",
"foo/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\",\"y\"],\"coordinates\":\"z\"}",
"foo/0.0": [
"saved_on_disk.h5",
8192,
160
],
"x/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
"x/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}",
"x/0": [
"saved_on_disk.h5",
8352,
32
],
"y/.zarray": "{\"chunks\":[5],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[5],\"zarr_format\":2}",
"y/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"y\"],\"calendar\":\"proleptic_gregorian\",\"units\":\"days since 2000-01-01 00:00:00\"}",
"y/0": [
"saved_on_disk.h5",
8384,
40
],
"z/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"|O\",\"fill_value\":null,\"filters\":[{\"allow_nan\":true,\"check_circular\":true,\"encoding\":\"utf-8\",\"ensure_ascii\":true,\"id\":\"json2\",\"indent\":null,\"separators\":[\",\",\":\"],\"skipkeys\":false,\"sort_keys\":true,\"strict\":true}],\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
"z/0": "[\"a\",\"b\",\"c\",\"d\",\"|O\",[4]]",
"z/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}"
}
}
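This file is a kerchunk-style reference set (matching the new ``kerchunk`` doc dependency above): each key maps a zarr chunk to a byte range inside ``saved_on_disk.h5``. As a hedged sketch, such a file is typically opened through fsspec's reference filesystem; the path is illustrative:

```python
import xarray as xr

# Read the HDF5 byte ranges listed in combined.json via the zarr engine;
# assumes fsspec is installed and the referenced .h5 file is present.
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {"fo": "doc/combined.json"},
    },
)
```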
7 changes: 4 additions & 3 deletions doc/conf.py
@@ -94,10 +94,11 @@
extlinks = {
"issue": ("https://github.com/pydata/xarray/issues/%s", "GH%s"),
"pull": ("https://github.com/pydata/xarray/pull/%s", "PR%s"),
"discussion": ("https://github.com/pydata/xarray/discussions/%s", "D%s"),
}

# sphinx-copybutton configurations
copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: "
copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.{3,}: | {5,8}: "
copybutton_prompt_is_regexp = True

# nbsphinx configurations
@@ -158,8 +159,8 @@
"Variable": "~xarray.Variable",
"DatasetGroupBy": "~xarray.core.groupby.DatasetGroupBy",
"DataArrayGroupBy": "~xarray.core.groupby.DataArrayGroupBy",
"Grouper": "~xarray.core.groupers.Grouper",
"Resampler": "~xarray.core.groupers.Resampler",
"Grouper": "~xarray.groupers.Grouper",
"Resampler": "~xarray.groupers.Resampler",
# objects without namespace: numpy
"ndarray": "~numpy.ndarray",
"MaskedArray": "~numpy.ma.MaskedArray",
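A quick sanity check of the tightened copybutton regex: ``\.{3,}:`` accepts three or more dots, so longer IPython continuation prompts now match too. The sample line is made up:

```python
import re

old = re.compile(r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: ")
new = re.compile(r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.{3,}: | {5,8}: ")

line = "    ....: y = x + 1"  # IPython continuation with four dots
print(bool(old.match(line)), bool(new.match(line)))  # False True
```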
1 change: 1 addition & 0 deletions doc/ecosystem.rst
@@ -17,6 +17,7 @@ Geosciences
- `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
- `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
+ - `grib2io <https://github.com/NOAA-MDL/grib2io>`_: Utility to work with GRIB2 files including an xarray backend, DASK support for parallel reading in open_mfdataset, lazy loading of data, editing of GRIB2 attributes and GRIB2IO DataArray attrs, and spatial interpolation and reprojection of GRIB2 messages and GRIB2IO Datasets/DataArrays for both grid to grid and grid to stations.
- `gsw-xarray <https://github.com/DocOtak/gsw-xarray>`_: a wrapper around `gsw <https://teos-10.github.io/GSW-Python>`_ that adds CF compliant attributes when possible, units, name.
- `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meteorology data
- `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
6 changes: 3 additions & 3 deletions doc/getting-started-guide/installing.rst
@@ -6,10 +6,10 @@ Installation
Required dependencies
---------------------

- - Python (3.9 or later)
+ - Python (3.10 or later)
- `numpy <https://www.numpy.org/>`__ (1.23 or later)
- - `packaging <https://packaging.pypa.io/en/latest/#>`__ (22 or later)
- - `pandas <https://pandas.pydata.org/>`__ (1.5 or later)
+ - `packaging <https://packaging.pypa.io/en/latest/#>`__ (23.1 or later)
+ - `pandas <https://pandas.pydata.org/>`__ (2.0 or later)

.. _optional-dependencies:

3 changes: 2 additions & 1 deletion doc/internals/chunked-arrays.rst
@@ -91,7 +91,8 @@ Once the chunkmanager subclass has been registered, xarray objects wrapping the
The latter two methods ultimately call the chunkmanager's implementation of ``.from_array``, to which they pass the ``from_array_kwargs`` dict.
The ``chunked_array_type`` kwarg selects which registered chunkmanager subclass to dispatch to. It defaults to ``'dask'``
if Dask is installed, otherwise it defaults to whichever chunkmanager is registered if only one is registered.
- If multiple chunkmanagers are registered it will raise an error by default.
+ If multiple chunkmanagers are registered, the ``chunk_manager`` configuration option (which can be set using :py:func:`set_options`)
+ will be used to determine which chunkmanager to use, defaulting to ``'dask'``.

Parallel processing without chunks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
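A short sketch of the behaviour the new text describes; the option name comes from the hunk above, the dataset is made up, and dask must be installed:

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", list(range(10)))})

# Explicitly select the chunkmanager to use when several are registered;
# 'dask' is the documented default.
with xr.set_options(chunk_manager="dask"):
    chunked = ds.chunk(x=5)

print(chunked.chunksizes)  # e.g. Frozen({'x': (5, 5)})
```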
2 changes: 1 addition & 1 deletion doc/internals/how-to-add-new-backend.rst
@@ -4,7 +4,7 @@ How to add a new backend
------------------------

Adding a new backend for read support to Xarray does not require
- to integrate any code in Xarray; all you need to do is:
+ one to integrate any code in Xarray; all you need to do is:

- Create a class that inherits from Xarray :py:class:`~xarray.backends.BackendEntrypoint`
and implements the method ``open_dataset`` see :ref:`RST backend_entrypoint`
2 changes: 1 addition & 1 deletion doc/user-guide/computation.rst
@@ -482,7 +482,7 @@ every 2 points along ``x`` dimension,
da.coarsen(time=7, x=2).mean()
- :py:meth:`~xarray.DataArray.coarsen` raises an ``ValueError`` if the data
+ :py:meth:`~xarray.DataArray.coarsen` raises a ``ValueError`` if the data
length is not a multiple of the corresponding window size.
You can choose ``boundary='trim'`` or ``boundary='pad'`` options for trimming
the excess entries or padding ``nan`` to insufficient entries,
16 changes: 16 additions & 0 deletions doc/user-guide/dask.rst
@@ -296,6 +296,12 @@ loaded into Dask or not:
Automatic parallelization with ``apply_ufunc`` and ``map_blocks``
-----------------------------------------------------------------

+ .. tip::
+
+     Some problems can become embarrassingly parallel and thus easy to parallelize
+     automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``.
+     See :py:meth:`Dataset.chunk` for more.

Almost all of xarray's built-in operations work on Dask arrays. If you want to
use a function that isn't wrapped by xarray, and have it applied in parallel on
each block of your xarray object, you have three options:
@@ -551,6 +557,16 @@ larger chunksizes.

Check out the `dask documentation on chunks <https://docs.dask.org/en/latest/array-chunks.html>`_.

+ .. tip::
+
+     Many time domain problems become amenable to an embarrassingly parallel or blockwise solution
+     (e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or
+     :py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension.
+     Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so.
+     For example ``ds.chunk(time=TimeResampler("MS"))`` will set the chunks so that a month of
+     data is contained in one chunk. The resulting chunk sizes need not be uniform and will
+     depend on the frequency of the data and the calendar.


Optimization Tips
-----------------
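To ground the new tips, a minimal sketch of frequency-based rechunking, using made-up daily data (requires dask and a version providing ``xarray.groupers.TimeResampler``):

```python
import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

time = pd.date_range("2000-01-01", periods=365, freq="D")
ds = xr.Dataset({"t2m": ("time", np.random.rand(365))}, coords={"time": time})

# One calendar month of daily data per chunk; sizes follow the calendar.
chunked = ds.chunk(time=TimeResampler("MS"))
print(chunked.chunksizes["time"][:4])  # e.g. (31, 29, 31, 30); 2000 is a leap year
```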
2 changes: 1 addition & 1 deletion doc/user-guide/duckarrays.rst
@@ -215,7 +215,7 @@ Whilst the features above allow many numpy-like array libraries to be used pretty
makes sense to use an interfacing package to make certain tasks easier.

For example the `pint-xarray package <https://pint-xarray.readthedocs.io>`_ offers a custom ``.pint`` accessor (see :ref:`internals.accessors`) which provides
- convenient access to information stored within the wrapped array (e.g. ``.units`` and ``.magnitude``), and makes makes
+ convenient access to information stored within the wrapped array (e.g. ``.units`` and ``.magnitude``), and makes
creating wrapped pint arrays (and especially xarray-wrapping-pint-wrapping-dask arrays) simpler for the user.

We maintain a list of libraries extending ``xarray`` to make working with particular wrapped duck arrays
2 changes: 1 addition & 1 deletion doc/user-guide/groupby.rst
@@ -276,7 +276,7 @@ is identical to
ds.groupby(x=UniqueGrouper())
- ; and
+ and

.. code-block:: python
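For reference, a tiny sketch of the equivalence this hunk documents, with made-up data:

```python
import xarray as xr
from xarray.groupers import UniqueGrouper

ds = xr.Dataset({"a": ("x", [1.0, 2.0, 3.0, 4.0])}, coords={"x": [0, 0, 1, 1]})

# The string and grouper-object spellings produce identical groupings:
print(ds.groupby("x").mean())
print(ds.groupby(x=UniqueGrouper()).mean())
```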