
Merge branch 'main' into vectorization_of_encode_cftime
dcherian authored Aug 2, 2024
2 parents 06af81a + c7cb9f7 commit 05428fd
Showing 47 changed files with 742 additions and 705 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
      - id: mixed-line-ending
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: 'v0.4.7'
    rev: 'v0.5.0'
    hooks:
      - id: ruff
        args: ["--fix", "--show-fixes"]
@@ -30,7 +30,7 @@
        additional_dependencies: ["black==24.4.2"]
      - id: blackdoc-autoupdate-black
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.10.0
    rev: v1.10.1
    hooks:
      - id: mypy
        # Copied from setup.cfg
11 changes: 5 additions & 6 deletions HOW_TO_RELEASE.md
@@ -23,14 +23,13 @@ upstream https://github.com/pydata/xarray (push)
```sh
git fetch upstream --tags
```
This will return a list of all the contributors since the last release:
Then run
```sh
git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | perl -pe 's/\n/$1, /'
```
This will return the total number of contributors:
```sh
git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | wc -l
python ci/release_contributors.py
```
(needs `gitpython` and `toolz` / `cytoolz`)

and copy the output.
3. Write a release summary: ~50 words describing the high level features. This
will be used in the release emails, tweets, GitHub release notes, etc.
4. Look over whats-new.rst and the docs. Make sure "What's New" is complete
49 changes: 49 additions & 0 deletions ci/release_contributors.py
@@ -0,0 +1,49 @@
import re
import textwrap

import git
from tlz.itertoolz import last, unique

co_author_re = re.compile(r"Co-authored-by: (?P<name>[^<]+?) <(?P<email>.+)>")


def main():
    repo = git.Repo(".")

    most_recent_release = last(repo.tags)

    # extract information from commits
    contributors = {}
    for commit in repo.iter_commits(f"{most_recent_release.name}.."):
        matches = co_author_re.findall(commit.message)
        if matches:
            contributors.update({email: name for name, email in matches})
        contributors[commit.author.email] = commit.author.name

    # deduplicate and ignore
    # TODO: extract ignores from .github/release.yml
    ignored = ["dependabot", "pre-commit-ci"]
    unique_contributors = unique(
        contributor
        for contributor in contributors.values()
        if contributor.removesuffix("[bot]") not in ignored
    )

    sorted_ = sorted(unique_contributors)
    if len(sorted_) > 1:
        names = f"{', '.join(sorted_[:-1])} and {sorted_[-1]}"
    else:
        names = "".join(sorted_)

    statement = textwrap.dedent(
        f"""\
        Thanks to the {len(sorted_)} contributors to this release:
        {names}
        """.rstrip()
    )

    print(statement)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions ci/requirements/doc.yml
@@ -8,6 +8,7 @@ dependencies:
  - bottleneck
  - cartopy
  - cfgrib
  - kerchunk
  - dask-core>=2022.1
  - dask-expr
  - hypothesis>=6.75.8
6 changes: 3 additions & 3 deletions doc/api-hidden.rst
@@ -694,6 +694,6 @@
coding.times.CFTimedeltaCoder
coding.times.CFDatetimeCoder

core.groupers.Grouper
core.groupers.Resampler
core.groupers.EncodedGroups
groupers.Grouper
groupers.Resampler
groupers.EncodedGroups
2 changes: 1 addition & 1 deletion doc/api.rst
@@ -806,7 +806,7 @@ DataArray
Grouper Objects
---------------

.. currentmodule:: xarray.core
.. currentmodule:: xarray

.. autosummary::
:toctree: generated/
30 changes: 30 additions & 0 deletions doc/combined.json
@@ -0,0 +1,30 @@
{
  "version": 1,
  "refs": {
    ".zgroup": "{\"zarr_format\":2}",
    "foo/.zarray": "{\"chunks\":[4,5],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":null,\"order\":\"C\",\"shape\":[4,5],\"zarr_format\":2}",
    "foo/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\",\"y\"],\"coordinates\":\"z\"}",
    "foo/0.0": [
      "saved_on_disk.h5",
      8192,
      160
    ],
    "x/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
    "x/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}",
    "x/0": [
      "saved_on_disk.h5",
      8352,
      32
    ],
    "y/.zarray": "{\"chunks\":[5],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[5],\"zarr_format\":2}",
    "y/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"y\"],\"calendar\":\"proleptic_gregorian\",\"units\":\"days since 2000-01-01 00:00:00\"}",
    "y/0": [
      "saved_on_disk.h5",
      8384,
      40
    ],
    "z/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"|O\",\"fill_value\":null,\"filters\":[{\"allow_nan\":true,\"check_circular\":true,\"encoding\":\"utf-8\",\"ensure_ascii\":true,\"id\":\"json2\",\"indent\":null,\"separators\":[\",\",\":\"],\"skipkeys\":false,\"sort_keys\":true,\"strict\":true}],\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
    "z/0": "[\"a\",\"b\",\"c\",\"d\",\"|O\",[4]]",
    "z/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}"
  }
}
7 changes: 4 additions & 3 deletions doc/conf.py
@@ -94,10 +94,11 @@
extlinks = {
    "issue": ("https://github.com/pydata/xarray/issues/%s", "GH%s"),
    "pull": ("https://github.com/pydata/xarray/pull/%s", "PR%s"),
    "discussion": ("https://github.com/pydata/xarray/discussions/%s", "D%s"),
}
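Not part of the diff, just an illustrative aside: Sphinx ``extlinks`` entries are plain ``%``-templates, so the new ``discussion`` role expands as sketched below (the discussion number is hypothetical):

```python
# illustrative only: Sphinx expands :discussion:`8002` using these two templates
url_template, caption_template = "https://github.com/pydata/xarray/discussions/%s", "D%s"
print(url_template % "8002")      # https://github.com/pydata/xarray/discussions/8002
print(caption_template % "8002")  # D8002
```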

# sphinx-copybutton configurations
copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: "
copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.{3,}: | {5,8}: "
copybutton_prompt_is_regexp = True

# nbsphinx configurations
@@ -158,8 +159,8 @@
    "Variable": "~xarray.Variable",
    "DatasetGroupBy": "~xarray.core.groupby.DatasetGroupBy",
    "DataArrayGroupBy": "~xarray.core.groupby.DataArrayGroupBy",
    "Grouper": "~xarray.core.groupers.Grouper",
    "Resampler": "~xarray.core.groupers.Resampler",
    "Grouper": "~xarray.groupers.Grouper",
    "Resampler": "~xarray.groupers.Resampler",
    # objects without namespace: numpy
    "ndarray": "~numpy.ndarray",
    "MaskedArray": "~numpy.ma.MaskedArray",
1 change: 1 addition & 0 deletions doc/ecosystem.rst
@@ -17,6 +17,7 @@ Geosciences
- `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
- `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
- `grib2io <https://github.com/NOAA-MDL/grib2io>`_: Utilities for working with GRIB2 files, including an xarray backend, Dask support for parallel reading in ``open_mfdataset``, lazy loading of data, editing of GRIB2 attributes and GRIB2IO DataArray attrs, and spatial interpolation and reprojection of GRIB2 messages and GRIB2IO Datasets/DataArrays for both grid-to-grid and grid-to-stations.
- `gsw-xarray <https://github.com/DocOtak/gsw-xarray>`_: a wrapper around `gsw <https://teos-10.github.io/GSW-Python>`_ that adds CF compliant attributes when possible, units, name.
- `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meteorology data
- `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
4 changes: 2 additions & 2 deletions doc/getting-started-guide/installing.rst
@@ -8,8 +8,8 @@ Required dependencies

- Python (3.9 or later)
- `numpy <https://www.numpy.org/>`__ (1.23 or later)
- `packaging <https://packaging.pypa.io/en/latest/#>`__ (22 or later)
- `pandas <https://pandas.pydata.org/>`__ (1.5 or later)
- `packaging <https://packaging.pypa.io/en/latest/#>`__ (23.1 or later)
- `pandas <https://pandas.pydata.org/>`__ (2.0 or later)
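As a quick aside (not part of this diff), one way to check which versions of these dependencies are installed in your environment is :py:func:`xarray.show_versions`:

```python
import xarray as xr

# prints xarray's version plus the versions of required and optional
# dependencies (numpy, packaging, pandas, ...)
xr.show_versions()
```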

.. _optional-dependencies:

2 changes: 1 addition & 1 deletion doc/internals/how-to-add-new-backend.rst
@@ -4,7 +4,7 @@ How to add a new backend
------------------------

Adding a new backend for read support to Xarray does not require
to integrate any code in Xarray; all you need to do is:
one to integrate any code in Xarray; all you need to do is:

- Create a class that inherits from Xarray :py:class:`~xarray.backends.BackendEntrypoint`
and implements the method ``open_dataset``; see :ref:`RST backend_entrypoint` (a minimal sketch follows below)
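To make this concrete, here is a minimal sketch of such a class (not part of the diff; the format-specific parsing is stubbed out and all names are hypothetical):

```python
import numpy as np
import xarray as xr
from xarray.backends import BackendEntrypoint


class MyBackendEntrypoint(BackendEntrypoint):
    """Sketch of a minimal read-only backend."""

    def open_dataset(self, filename_or_obj, *, drop_variables=None):
        # replace this stub with real parsing of your file format
        data = np.arange(6.0).reshape(2, 3)
        ds = xr.Dataset({"var": (("x", "y"), data)})
        if drop_variables:
            ds = ds.drop_vars(drop_variables)
        return ds

    def guess_can_open(self, filename_or_obj):
        # lets xr.open_dataset() auto-select this engine for ".my" files
        return str(filename_or_obj).endswith(".my")
```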
16 changes: 16 additions & 0 deletions doc/user-guide/dask.rst
@@ -296,6 +296,12 @@ loaded into Dask or not:
Automatic parallelization with ``apply_ufunc`` and ``map_blocks``
-----------------------------------------------------------------

.. tip::

    Some problems can become embarrassingly parallel and thus easy to parallelize
    automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``.
    See :py:meth:`Dataset.chunk` for more.
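To make the tip concrete, a minimal sketch with synthetic data (not part of the diff; assumes dask is installed and uses :py:class:`xarray.groupers.TimeResampler`):

```python
import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

# two years of synthetic daily data
time = pd.date_range("2000-01-01", periods=730, freq="D")
ds = xr.Dataset({"t2m": ("time", np.random.randn(time.size))}, coords={"time": time})

# one chunk per calendar year; sizes follow the calendar (2000 is a leap year)
chunked = ds.chunk(time=TimeResampler("YE"))
print(chunked.chunksizes)  # {'time': (366, 364)}
```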

Almost all of xarray's built-in operations work on Dask arrays. If you want to
use a function that isn't wrapped by xarray, and have it applied in parallel on
each block of your xarray object, you have three options:
@@ -551,6 +557,16 @@ larger chunksizes.

Check out the `dask documentation on chunks <https://docs.dask.org/en/latest/array-chunks.html>`_.

.. tip::

    Many time domain problems become amenable to an embarrassingly parallel or blockwise solution
    (e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or
    :py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension.
    Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so.
    For example, ``ds.chunk(time=TimeResampler("MS"))`` will set the chunks so that a month of
    data is contained in one chunk. The resulting chunk sizes need not be uniform, depending on
    the frequency of the data and the calendar.
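A short sketch of that blockwise pattern with :py:func:`xarray.map_blocks` over monthly chunks (synthetic data, assuming dask is installed; not part of the diff):

```python
import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

time = pd.date_range("2000-01-01", periods=365, freq="D")
ds = xr.Dataset({"t2m": ("time", np.random.randn(time.size))}, coords={"time": time})

# each chunk now holds exactly one month of data ("MS" = month start)
chunked = ds.chunk(time=TimeResampler("MS"))


def standardize(block):
    # runs once per block, i.e. once per month
    return (block - block.mean()) / block.std()


result = xr.map_blocks(standardize, chunked)
result.compute()
```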


Optimization Tips
-----------------
128 changes: 128 additions & 0 deletions doc/user-guide/io.rst
@@ -19,6 +19,81 @@ format (recommended).
np.random.seed(123456)
You can `read different types of files <https://docs.xarray.dev/en/stable/user-guide/io.html>`_
in `xr.open_dataset` by specifying the engine to be used:

.. ipython:: python
    :okexcept:
    :suppress:

    import xarray as xr

    xr.open_dataset("my_file.grib", engine="cfgrib")
The "engine" provides a set of instructions that tells xarray how
to read the data and pack them into a `dataset` (or `dataarray`).
These instructions are stored in an underlying "backend".

Xarray comes with several backends that cover many common data formats.
Many more backends are available via external libraries, or you can `write your own <https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html>`_.
This diagram aims to help you determine - based on the format of the file you'd like to read -
which type of backend you're using and how to use it.

Text and boxes are clickable for more information.
Following the diagram is detailed information on many popular backends.
You can learn more about using and developing backends in the
`Xarray tutorial JupyterBook <https://tutorial.xarray.dev/advanced/backends/backends.html>`_.

.. mermaid::
:alt: Flowchart illustrating how to choose the right backend engine to read your data

flowchart LR
built-in-eng["""Is your data stored in one of these formats?
- netCDF4 (<code>netcdf4</code>)
- netCDF3 (<code>scipy</code>)
- Zarr (<code>zarr</code>)
- DODS/OPeNDAP (<code>pydap</code>)
- HDF5 (<code>h5netcdf</code>)
"""]

built-in("""You're in luck! Xarray bundles a backend for this format.
Open data using <code>xr.open_dataset()</code>. We recommend
always setting the engine you want to use.""")

installed-eng["""One of these formats?
- <a href='https://github.com/ecmwf/cfgrib'>GRIB (<code>cfgrib</code>)
- <a href='https://tiledb-inc.github.io/TileDB-CF-Py/documentation/index.html'>TileDB (<code>tiledb</code>)
- <a href='https://corteva.github.io/rioxarray/stable/getting_started/getting_started.html#rioxarray'>GeoTIFF, JPEG-2000, ESRI-hdf (<code>rioxarray</code>, via GDAL)
- <a href='https://www.bopen.eu/xarray-sentinel-open-source-library/'>Sentinel-1 SAFE (<code>xarray-sentinel</code>)
"""]

installed("""Install the package indicated in parentheses to your
Python environment. Restart the kernel and use
<code>xr.open_dataset(files, engine='rioxarray')</code>.""")

other("""Ask around to see if someone in your data community
has created an Xarray backend for your data type.
If not, you may need to create your own or consider
exporting your data to a more common format.""")

built-in-eng -->|Yes| built-in
built-in-eng -->|No| installed-eng

installed-eng -->|Yes| installed
installed-eng -->|No| other

click built-in-eng "https://docs.xarray.dev/en/stable/getting-started-guide/faq.html#how-do-i-open-format-x-file-as-an-xarray-dataset"
click other "https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html"

classDef quesNodefmt fill:#9DEEF4,stroke:#206C89,text-align:left
class built-in-eng,installed-eng quesNodefmt

classDef ansNodefmt fill:#FFAA05,stroke:#E37F17,text-align:left,white-space:nowrap
class built-in,installed,other ansNodefmt

linkStyle default font-size:20pt,color:#206C89
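As a quick illustration of the flowchart's first branch (file and store names are hypothetical):

```python
import xarray as xr

# xarray bundles these engines; naming the engine explicitly is recommended
ds_nc = xr.open_dataset("data.nc", engine="netcdf4")
ds_zarr = xr.open_dataset("store.zarr", engine="zarr")
```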


.. _io.netcdf:

netCDF
@@ -985,6 +1060,59 @@ reads. Because this fall-back option is so much slower, xarray issues a
instead of falling back to try reading non-consolidated metadata.


.. _io.kerchunk:

Kerchunk
--------

`Kerchunk <https://fsspec.github.io/kerchunk/index.html>`_ is a Python library
that allows you to access chunked and compressed data formats (such as NetCDF3, NetCDF4, HDF5, GRIB2, TIFF & FITS),
many of which are primary formats for major data archives, by viewing the
whole archive as an ephemeral `Zarr`_ dataset that allows parallel, chunk-specific access.

Instead of creating a new copy of the dataset in the Zarr spec/format or
downloading the files locally, Kerchunk reads through the data archive, extracts the
byte range and compression information of each chunk, and saves it as a ``reference``.
These references are then saved as ``json`` files or ``parquet`` (more efficient)
for later use. You can view some of these stored in the ``references``
directory `here <https://github.com/pydata/xarray-data>`_.


.. note::

    These references follow this `specification <https://fsspec.github.io/kerchunk/spec.html>`_.
    Packages like `kerchunk`_ and `virtualizarr <https://github.com/zarr-developers/VirtualiZarr>`_
    help in creating and reading these references.


Reading these data archives becomes really easy with ``kerchunk`` in combination
with ``xarray``, especially when these archives are large in size. A single combined
reference can refer to thousands of the original data files present in these archives.
You can view the whole dataset from this `combined reference` using the above packages.

The following example shows opening a combined reference generated from a ``.hdf`` file stored locally.

.. ipython:: python

    storage_options = {
        "target_protocol": "file",
    }

    # add the `remote_protocol` key in `storage_options` if you're accessing a file remotely
    ds1 = xr.open_dataset(
        "./combined.json",
        engine="kerchunk",
        storage_options=storage_options,
    )
    ds1
.. note::

    You can refer to the `project pythia kerchunk cookbook <https://projectpythia.org/kerchunk-cookbook/README.html>`_
    and the `pangeo guide on kerchunk <https://guide.cloudnativegeo.org/kerchunk/intro.html>`_ for more information.
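For completeness, a sketch of how such a reference file might be generated with ``kerchunk`` (assumes ``kerchunk`` is installed, ``saved_on_disk.h5`` is a local netCDF4/HDF5 file, and the inline threshold value is arbitrary):

```python
import json

import xarray as xr
from kerchunk.hdf import SingleHdf5ToZarr

# scan the HDF5 file and record byte ranges / compression info per chunk
refs = SingleHdf5ToZarr("saved_on_disk.h5", inline_threshold=100).translate()

with open("combined.json", "w") as f:
    json.dump(refs, f)

# the reference file can then be opened as shown above
ds = xr.open_dataset(
    "combined.json",
    engine="kerchunk",
    storage_options={"target_protocol": "file"},
)
```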


.. _io.iris:

Iris