Merge branch 'main' into zarr-region-chunks
max-sixty authored Nov 17, 2023
2 parents 4337b27 + b6eaf43 commit 9f19aca
Showing 36 changed files with 546 additions and 412 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci-additional.yaml
@@ -2,10 +2,10 @@ name: CI Additional
on:
push:
branches:
-      - "*"
+      - "main"
pull_request:
branches:
-      - "*"
+      - "main"
workflow_dispatch: # allows you to trigger manually

concurrency:
@@ -41,7 +41,7 @@ jobs:

env:
CONDA_ENV_FILE: ci/requirements/environment.yml
-      PYTHON_VERSION: "3.10"
+      PYTHON_VERSION: "3.11"

steps:
- uses: actions/checkout@v4
@@ -87,7 +87,7 @@ jobs:
shell: bash -l {0}
env:
CONDA_ENV_FILE: ci/requirements/environment.yml
-      PYTHON_VERSION: "3.10"
+      PYTHON_VERSION: "3.11"

steps:
- uses: actions/checkout@v4
@@ -117,7 +117,7 @@ jobs:
python xarray/util/print_versions.py
- name: Install mypy
run: |
-          python -m pip install mypy --force-reinstall
+          python -m pip install "mypy<1.7" --force-reinstall
- name: Run mypy
run: |
@@ -171,7 +171,7 @@ jobs:
python xarray/util/print_versions.py
- name: Install mypy
run: |
-          python -m pip install mypy --force-reinstall
+          python -m pip install "mypy<1.7" --force-reinstall
- name: Run mypy
run: |
@@ -332,7 +332,7 @@ jobs:
with:
environment-name: xarray-tests
create-args: >-
-            python=3.10
+            python=3.11
pyyaml
conda
python-dateutil
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
@@ -2,10 +2,10 @@ name: CI
on:
push:
branches:
-      - "*"
+      - "main"
pull_request:
branches:
-      - "*"
+      - "main"
workflow_dispatch: # allows you to trigger manually

concurrency:
8 changes: 4 additions & 4 deletions .github/workflows/upstream-dev-ci.yaml
@@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
-        python-version: ["3.10"]
+        python-version: ["3.11"]
steps:
- uses: actions/checkout@v4
with:
@@ -110,17 +110,17 @@ jobs:
strategy:
fail-fast: false
matrix:
-        python-version: ["3.10"]
+        python-version: ["3.11"]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for all branches and tags.
- name: Set up conda environment
-        uses: mamba-org/provision-with-micromamba@v15
+        uses: mamba-org/setup-micromamba@v1
with:
environment-file: ci/requirements/environment.yml
environment-name: xarray-tests
-          extra-specs: |
+          create-args: >-
python=${{ matrix.python-version }}
pytest-reportlog
conda
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
@@ -16,11 +16,11 @@ dependencies:
- dask-core=2022.7
- distributed=2022.7
- flox=0.5
-  - h5netcdf=1.0
+  - h5netcdf=1.1
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
-  - h5py=3.6
+  - h5py=3.7
- hdf5=1.12
- hypothesis
- iris=3.2
4 changes: 2 additions & 2 deletions doc/internals/interoperability.rst
@@ -36,9 +36,9 @@ it is entirely possible today to:
- track the physical units of the data through computations (e.g via `pint-xarray <https://pint-xarray.readthedocs.io/en/stable/>`_),
- query the data via custom index logic optimized for specific applications (e.g. an :py:class:`~xarray.Index` object backed by a KDTree structure),
- attach domain-specific logic via accessor methods (e.g. to understand geographic Coordinate Reference System metadata),
-- organize hierarchical groups of xarray data in a :py:class:`~datatree.DataTree` (e.g. to treat heterogenous simulation and observational data together during analysis).
+- organize hierarchical groups of xarray data in a :py:class:`~datatree.DataTree` (e.g. to treat heterogeneous simulation and observational data together during analysis).

-All of these features can be provided simultaneously, using libaries compatible with the rest of the scientific python ecosystem.
+All of these features can be provided simultaneously, using libraries compatible with the rest of the scientific python ecosystem.
In this situation xarray would be essentially a thin wrapper acting as pure-python framework, providing a common interface and
separation of concerns via various domain-agnostic abstractions.

38 changes: 21 additions & 17 deletions doc/user-guide/io.rst
@@ -44,9 +44,9 @@ __ https://www.unidata.ucar.edu/software/netcdf/

.. _netCDF FAQ: https://www.unidata.ucar.edu/software/netcdf/docs/faq.html#What-Is-netCDF

-Reading and writing netCDF files with xarray requires scipy or the
-`netCDF4-Python`__ library to be installed (the latter is required to
-read/write netCDF V4 files and use the compression options described below).
+Reading and writing netCDF files with xarray requires scipy, h5netcdf, or the
+`netCDF4-Python`__ library to be installed. SciPy only supports reading and writing
+of netCDF V3 files.

__ https://github.com/Unidata/netcdf4-python
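
As a hedged sketch of choosing a backend explicitly (the file names here are hypothetical):

.. code:: python

    import xarray as xr

    # scipy can only read and write the classic netCDF-3 format;
    # netCDF4-Python or h5netcdf is needed for netCDF-4 files.
    ds = xr.open_dataset("example.nc", engine="netcdf4")
    ds.to_netcdf("copy.nc", engine="h5netcdf")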

@@ -675,8 +675,8 @@ the same as the one that was saved.

.. note::

-    xarray does not write NCZarr attributes. Therefore, NCZarr data must be
-    opened in read-only mode.
+    xarray does not write `NCZarr <https://docs.unidata.ucar.edu/nug/current/nczarr_head.html>`_ attributes.
+    Therefore, NCZarr data must be opened in read-only mode.

To store variable length strings, convert them to object arrays first with
``dtype=object``.
@@ -696,10 +696,10 @@ It is possible to read and write xarray datasets directly from / to cloud
storage buckets using zarr. This example uses the `gcsfs`_ package to provide
an interface to `Google Cloud Storage`_.

-From v0.16.2: general `fsspec`_ URLs are parsed and the store set up for you
-automatically when reading, such that you can open a dataset in a single
-call. You should include any arguments to the storage backend as the
-key ``storage_options``, part of ``backend_kwargs``.
+General `fsspec`_ URLs, those that begin with ``s3://`` or ``gcs://`` for example,
+are parsed and the store set up for you automatically when reading.
+You should include any arguments to the storage backend as the
+key ``storage_options``, part of ``backend_kwargs``.

.. code:: python
@@ -715,7 +715,7 @@ key ``storage_options``, part of ``backend_kwargs``.
This also works with ``open_mfdataset``, allowing you to pass a list of paths or
a URL to be interpreted as a glob string.
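
For instance, a hedged sketch of such a single-call open (the bucket name and credentials are hypothetical):

.. code:: python

    import xarray as xr

    ds = xr.open_dataset(
        "gcs://my-bucket/my-dataset.zarr",
        engine="zarr",
        backend_kwargs={"storage_options": {"token": "anon"}},
    )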

-For older versions, and for writing, you must explicitly set up a ``MutableMapping``
+For writing, you must explicitly set up a ``MutableMapping``
instance and pass this, as follows:

.. code:: python
@@ -769,10 +769,10 @@ Consolidated Metadata
~~~~~~~~~~~~~~~~~~~~~

Xarray needs to read all of the zarr metadata when it opens a dataset.
-In some storage mediums, such as with cloud object storage (e.g. amazon S3),
+In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_),
this can introduce significant overhead, because two separate HTTP calls to the
object store must be made for each variable in the dataset.
-As of xarray version 0.18, xarray by default uses a feature called
+By default Xarray uses a feature called
*consolidated metadata*, storing all metadata for the entire dataset with a
single key (by default called ``.zmetadata``). This typically drastically speeds
up opening the store. (For more information on this feature, consult the
@@ -796,16 +796,20 @@ reads. Because this fall-back option is so much slower, xarray issues a
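
As a hedged illustration of the trade-off described above (the store URL is hypothetical):

.. code:: python

    import xarray as xr

    # consolidated=True reads all metadata from the single ".zmetadata" key;
    # the default (consolidated=None) tries this first, then falls back to
    # one metadata read per variable, which is much slower over HTTP.
    ds = xr.open_zarr("gcs://my-bucket/my-dataset.zarr", consolidated=True)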

.. _io.zarr.appending:

-Appending to existing Zarr stores
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Modifying existing Zarr stores
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Xarray supports several ways of incrementally writing variables to a Zarr
store. These options are useful for scenarios when it is infeasible or
undesirable to write your entire dataset at once.

+1. Use ``mode='a'`` to add or overwrite entire variables,
+2. Use ``append_dim`` to resize and append to existing variables, and
+3. Use ``region`` to write to limited regions of existing arrays (see the sketch below).
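
A hedged sketch of all three options, assuming a small store created up front:

.. code:: python

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"foo": ("time", np.arange(10))})
    ds.to_zarr("store.zarr", mode="w")

    # 1. mode="a" adds or overwrites whole variables in the existing store.
    ds.assign(bar=ds.foo * 2).to_zarr("store.zarr", mode="a")

    # 2. append_dim resizes "time" and appends the new steps to every variable.
    ds.to_zarr("store.zarr", append_dim="time")

    # 3. region writes into a slice of the existing arrays, in index space.
    ds.isel(time=slice(0, 5)).to_zarr("store.zarr", region={"time": slice(0, 5)})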

.. tip::

-    If you can load all of your data into a single ``Dataset`` using dask, a
+    For ``Dataset`` objects containing dask arrays, a
single call to ``to_zarr()`` will write all of your data in parallel.

.. warning::
@@ -876,8 +880,8 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata
ds.to_zarr(path, compute=False)
Now, a Zarr store with the correct variable shapes and attributes exists that
-can be filled out by subsequent calls to ``to_zarr``. ``region`` can be
-specified as ``"auto"``, which opens the existing store and determines the
+can be filled out by subsequent calls to ``to_zarr``.
+Setting ``region="auto"`` will open the existing store and determine the
correct alignment of the new data with the existing coordinates, or as an
explicit mapping from dimension names to Python ``slice`` objects indicating
where the data should be written (in index space, not label space), e.g.,
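
For instance, as a hedged sketch (``ds_region`` and ``"store.zarr"`` are hypothetical stand-ins):

.. code:: python

    # Explicit mapping, in index space (not label space):
    ds_region.to_zarr("store.zarr", region={"time": slice(100, 200)})

    # Or let xarray infer the alignment from the coordinates:
    ds_region.to_zarr("store.zarr", region="auto")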
14 changes: 7 additions & 7 deletions doc/user-guide/time-series.rst
@@ -89,7 +89,7 @@ items and with the `slice` object:

.. ipython:: python
-    time = pd.date_range("2000-01-01", freq="H", periods=365 * 24)
+    time = pd.date_range("2000-01-01", freq="h", periods=365 * 24)
ds = xr.Dataset({"foo": ("time", np.arange(365 * 24)), "time": time})
ds.sel(time="2000-01")
ds.sel(time=slice("2000-06-01", "2000-06-10"))
@@ -115,7 +115,7 @@ given ``DataArray`` can be quickly computed using a special ``.dt`` accessor.

.. ipython:: python
-    time = pd.date_range("2000-01-01", freq="6H", periods=365 * 4)
+    time = pd.date_range("2000-01-01", freq="6h", periods=365 * 4)
ds = xr.Dataset({"foo": ("time", np.arange(365 * 4)), "time": time})
ds.time.dt.hour
ds.time.dt.dayofweek
@@ -207,7 +207,7 @@ For example, we can downsample our dataset from hourly to 6-hourly:
.. ipython:: python
:okwarning:
-    ds.resample(time="6H")
+    ds.resample(time="6h")
This will create a specialized ``Resample`` object which saves information
necessary for resampling. All of the reduction methods which work with
@@ -216,21 +216,21 @@ necessary for resampling. All of the reduction methods which work with
.. ipython:: python
:okwarning:
-    ds.resample(time="6H").mean()
+    ds.resample(time="6h").mean()
You can also supply an arbitrary reduction function to aggregate over each
resampling group:

.. ipython:: python
-    ds.resample(time="6H").reduce(np.mean)
+    ds.resample(time="6h").reduce(np.mean)
You can also resample on the time dimension while applying reducing along other dimensions at the same time
by specifying the `dim` keyword argument

.. code-block:: python
-    ds.resample(time="6H").mean(dim=["time", "latitude", "longitude"])
+    ds.resample(time="6h").mean(dim=["time", "latitude", "longitude"])
For upsampling, xarray provides six methods: ``asfreq``, ``ffill``, ``bfill``, ``pad``,
``nearest`` and ``interpolate``. ``interpolate`` extends ``scipy.interpolate.interp1d``
@@ -243,7 +243,7 @@ Data that has indices outside of the given ``tolerance`` are set to ``NaN``.

.. ipython:: python
-    ds.resample(time="1H").nearest(tolerance="1H")
+    ds.resample(time="1h").nearest(tolerance="1h")
For more examples of using grouped operations on a time dimension, see
2 changes: 1 addition & 1 deletion doc/user-guide/weather-climate.rst
@@ -239,7 +239,7 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports:

.. ipython:: python
-    da.resample(time="81T", closed="right", label="right", offset="3T").mean()
+    da.resample(time="81min", closed="right", label="right", offset="3min").mean()
.. _nanosecond-precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
.. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601
23 changes: 21 additions & 2 deletions doc/whats-new.rst
@@ -32,14 +32,23 @@ New Features
By `Sam Levang <https://github.com/slevang>`_.
- Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`).
By `Ezequiel Cimadevilla <https://github.com/zequihg50>`_.
- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Breaking changes
~~~~~~~~~~~~~~~~
- drop support for `cdms2 <https://github.com/CDAT/cdms>`_. Please use
`xcdat <https://github.com/xCDAT/xcdat>`_ instead (:pull:`8441`).
-  By `Justus Magin <https://github.com/keewis`_.
+  By `Justus Magin <https://github.com/keewis>`_.
- Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``,
``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or
``"ns"`` instead of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``,
``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the
deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By
`Spencer Clark <https://github.com/spencerkclark>`_.
- Bump minimum tested pint version to ``>=0.22``. By `Deepak Cherian <https://github.com/dcherian>`_.
- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``.
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~
@@ -51,6 +60,14 @@ Deprecations
this was one place in the API where dimension positions were used.
(:pull:`8341`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
- Following pandas, the frequency strings ``"A"``, ``"AS"``, ``"Q"``, ``"M"``,
``"H"``, ``"T"``, ``"S"``, ``"L"``, ``"U"``, and ``"N"`` are deprecated in
favor of ``"Y"``, ``"YS"``, ``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``,
``"ms"``, ``"us"``, and ``"ns"``, respectively. These strings are used, for
example, in :py:func:`date_range`, :py:func:`cftime_range`,
:py:meth:`DataArray.resample`, and :py:meth:`Dataset.resample` among others
(:issue:`8394`, :pull:`8415`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Rename :py:meth:`Dataset.to_array` to :py:meth:`Dataset.to_dataarray` for
consistency with :py:meth:`DataArray.to_dataset` &
:py:func:`open_dataarray` functions. This is a "soft" deprecation — the
@@ -84,6 +101,8 @@ Bug fixes

Documentation
~~~~~~~~~~~~~
- Small updates to documentation on distributed writes: see :ref:`io.zarr.appending`.
  By `Deepak Cherian <https://github.com/dcherian>`_.


Internal Changes
9 changes: 0 additions & 9 deletions xarray/backends/h5netcdf_.py
@@ -271,15 +271,6 @@ def prepare_variable(
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)

fillvalue = attrs.pop("_FillValue", None)
-        if dtype is str and fillvalue is not None:
-            raise NotImplementedError(
-                "h5netcdf does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/h5netcdf/h5netcdf/issues/37). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )

if dtype is str:
dtype = h5py.special_dtype(vlen=str)
10 changes: 0 additions & 10 deletions xarray/backends/netCDF4_.py
@@ -490,16 +490,6 @@ def prepare_variable(

fill_value = attrs.pop("_FillValue", None)

-        if datatype is str and fill_value is not None:
-            raise NotImplementedError(
-                "netCDF4 does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/Unidata/netcdf4-python/issues/730). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )

encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
)
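
Both removed guards are what previously rejected fill values on variable-length strings. With the minimum h5netcdf/h5py versions bumped above, an encoding along these lines should now be accepted (a hedged sketch, not taken from this diff):

.. code:: python

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"names": ("x", np.array(["a", "b", "c"], dtype=object))})

    # A _FillValue on a variable-length string no longer raises
    # NotImplementedError.
    ds.to_netcdf(
        "strings.nc",
        engine="h5netcdf",
        encoding={"names": {"dtype": str, "_FillValue": ""}},
    )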
