Merge branch 'main' into zarr-v3-tests

pydata · Jan 5, 2024 · 4b74c05 · 4b74c05
2 parents ade5c21 + cd6862b
commit 4b74c05
Show file tree

Hide file tree

Showing 17 changed files with 266 additions and 134 deletions.
diff --git a/.github/workflows/benchmarks-last-release.yml b/.github/workflows/benchmarks-last-release.yml
@@ -72,7 +72,7 @@ jobs:
           cp benchmarks/README_CI.md benchmarks.log .asv/results/
         working-directory: ${{ env.ASV_DIR }}
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         if: always()
         with:
           name: asv-benchmark-results-${{ runner.os }}

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -67,7 +67,7 @@ jobs:
           cp benchmarks/README_CI.md benchmarks.log .asv/results/
         working-directory: ${{ env.ASV_DIR }}
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         if: always()
         with:
           name: asv-benchmark-results-${{ runner.os }}

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -129,9 +129,9 @@ jobs:
 
       - name: Upload test results
         if: always()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: Test results for ${{ runner.os }}-${{ matrix.python-version }}
+          name: Test results for ${{ runner.os }}-${{ matrix.python-version }} ${{ matrix.env }}
           path: pytest.xml
 
       - name: Upload code coverage to Codecov
@@ -149,7 +149,7 @@ jobs:
     if: github.repository == 'pydata/xarray'
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml
@@ -41,7 +41,7 @@ jobs:
           else
             echo "✅ Looks good"
           fi
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: releases
           path: dist
@@ -54,7 +54,7 @@ jobs:
         name: Install Python
         with:
           python-version: "3.11"
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
         with:
           name: releases
           path: dist
@@ -82,7 +82,7 @@ jobs:
       id-token: write
 
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
         with:
           name: releases
           path: dist
@@ -106,7 +106,7 @@ jobs:
       id-token: write
 
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
         with:
           name: releases
           path: dist

diff --git a/doc/ecosystem.rst b/doc/ecosystem.rst
@@ -98,7 +98,6 @@ Visualization
 Non-Python projects
 ~~~~~~~~~~~~~~~~~~~
 - `xframe <https://github.com/xtensor-stack/xframe>`_: C++ data structures inspired by xarray.
-- `AxisArrays <https://github.com/JuliaArrays/AxisArrays.jl>`_ and
-  `NamedArrays <https://github.com/davidavdav/NamedArrays.jl>`_: similar data structures for Julia.
+- `AxisArrays <https://github.com/JuliaArrays/AxisArrays.jl>`_, `NamedArrays <https://github.com/davidavdav/NamedArrays.jl>`_ and `YAXArrays.jl <https://github.com/JuliaDataCubes/YAXArrays.jl>`_: similar data structures for Julia.
 
 More projects can be found at the `"xarray" Github topic <https://github.com/topics/xarray>`_.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -64,6 +64,8 @@ Bug fixes
 
 - Reverse index output of bottleneck's rolling move_argmax/move_argmin functions (:issue:`8541`, :pull:`8552`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Vendor `SerializableLock` from dask and use as default lock for netcdf4 backends (:issue:`8442`, :pull:`8571`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
 
 Documentation
@@ -72,7 +74,10 @@ Documentation
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
-
+- The implementation of :py:func:`map_blocks` has changed to minimize graph size and duplication of data.
+  This should be a strict improvement even though the graphs are not always embarassingly parallel any more.
+  Please open an issue if you spot a regression. (:pull:`8412`, :issue:`8409`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - Remove null values before plotting. (:pull:`8535`).
   By `Jimmy Westling <https://github.com/illviljan>`_.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -91,6 +91,7 @@ module = [
   "cf_units.*",
   "cfgrib.*",
   "cftime.*",
+  "cloudpickle.*",
   "cubed.*",
   "cupy.*",
   "dask.types.*",

diff --git a/xarray/backends/locks.py b/xarray/backends/locks.py
@@ -2,15 +2,83 @@
 
 import multiprocessing
 import threading
+import uuid
 import weakref
-from collections.abc import MutableMapping
-from typing import Any
-
-try:
-    from dask.utils import SerializableLock
-except ImportError:
-    # no need to worry about serializing the lock
-    SerializableLock = threading.Lock  # type: ignore
+from collections.abc import Hashable, MutableMapping
+from typing import Any, ClassVar
+from weakref import WeakValueDictionary
+
+
+# SerializableLock is adapted from Dask:
+# https://github.com/dask/dask/blob/74e898f0ec712e8317ba86cc3b9d18b6b9922be0/dask/utils.py#L1160-L1224
+# Used under the terms of Dask's license, see licenses/DASK_LICENSE.
+class SerializableLock:
+    """A Serializable per-process Lock
+
+    This wraps a normal ``threading.Lock`` object and satisfies the same
+    interface.  However, this lock can also be serialized and sent to different
+    processes.  It will not block concurrent operations between processes (for
+    this you should look at ``dask.multiprocessing.Lock`` or ``locket.lock_file``
+    but will consistently deserialize into the same lock.
+
+    So if we make a lock in one process::
+
+        lock = SerializableLock()
+
+    And then send it over to another process multiple times::
+
+        bytes = pickle.dumps(lock)
+        a = pickle.loads(bytes)
+        b = pickle.loads(bytes)
+
+    Then the deserialized objects will operate as though they were the same
+    lock, and collide as appropriate.
+
+    This is useful for consistently protecting resources on a per-process
+    level.
+
+    The creation of locks is itself not threadsafe.
+    """
+
+    _locks: ClassVar[
+        WeakValueDictionary[Hashable, threading.Lock]
+    ] = WeakValueDictionary()
+    token: Hashable
+    lock: threading.Lock
+
+    def __init__(self, token: Hashable | None = None):
+        self.token = token or str(uuid.uuid4())
+        if self.token in SerializableLock._locks:
+            self.lock = SerializableLock._locks[self.token]
+        else:
+            self.lock = threading.Lock()
+            SerializableLock._locks[self.token] = self.lock
+
+    def acquire(self, *args, **kwargs):
+        return self.lock.acquire(*args, **kwargs)
+
+    def release(self, *args, **kwargs):
+        return self.lock.release(*args, **kwargs)
+
+    def __enter__(self):
+        self.lock.__enter__()
+
+    def __exit__(self, *args):
+        self.lock.__exit__(*args)
+
+    def locked(self):
+        return self.lock.locked()
+
+    def __getstate__(self):
+        return self.token
+
+    def __setstate__(self, token):
+        self.__init__(token)
+
+    def __str__(self):
+        return f"<{self.__class__.__name__}: {self.token}>"
+
+    __repr__ = __str__
 
 
 # Locks used by multiple backends.

diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py
@@ -47,12 +47,11 @@ class EncodedStringCoder(VariableCoder):
     def __init__(self, allows_unicode=True):
         self.allows_unicode = allows_unicode
 
-    def encode(self, variable, name=None):
+    def encode(self, variable: Variable, name=None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         contains_unicode = is_unicode_dtype(data.dtype)
         encode_as_char = encoding.get("dtype") == "S1"
-
         if encode_as_char:
             del encoding["dtype"]  # no longer relevant
 
@@ -69,9 +68,12 @@ def encode(self, variable, name=None):
             # TODO: figure out how to handle this in a lazy way with dask
             data = encode_string_array(data, string_encoding)
 
-        return Variable(dims, data, attrs, encoding)
+            return Variable(dims, data, attrs, encoding)
+        else:
+            variable.encoding = encoding
+            return variable
 
-    def decode(self, variable, name=None):
+    def decode(self, variable: Variable, name=None) -> Variable:
         dims, data, attrs, encoding = unpack_for_decoding(variable)
 
         if "_Encoding" in attrs:
@@ -95,13 +97,15 @@ def encode_string_array(string_array, encoding="utf-8"):
     return np.array(encoded, dtype=bytes).reshape(string_array.shape)
 
 
-def ensure_fixed_length_bytes(var):
+def ensure_fixed_length_bytes(var: Variable) -> Variable:
     """Ensure that a variable with vlen bytes is converted to fixed width."""
-    dims, data, attrs, encoding = unpack_for_encoding(var)
-    if check_vlen_dtype(data.dtype) == bytes:
+    if check_vlen_dtype(var.dtype) == bytes:
+        dims, data, attrs, encoding = unpack_for_encoding(var)
         # TODO: figure out how to handle this with dask
         data = np.asarray(data, dtype=np.bytes_)
-    return Variable(dims, data, attrs, encoding)
+        return Variable(dims, data, attrs, encoding)
+    else:
+        return var
 
 
 class CharacterArrayCoder(VariableCoder):

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -414,7 +414,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
 class UnsignedIntegerCoder(VariableCoder):
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         # from netCDF best practices
-        # https://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html
+        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
         #     "_Unsigned = "true" to indicate that
         #      integer data should be treated as unsigned"
         if variable.encoding.get("_Unsigned", "false") == "true":

diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -16,7 +16,7 @@
 )
 from xarray.core.pycompat import is_duck_dask_array
 from xarray.core.utils import emit_user_level_warning
-from xarray.core.variable import IndexVariable, Variable
+from xarray.core.variable import Variable
 
 CF_RELATED_DATA = (
     "bounds",
@@ -97,10 +97,10 @@ def _infer_dtype(array, name=None):
 
 
 def ensure_not_multiindex(var: Variable, name: T_Name = None) -> None:
-    if isinstance(var, IndexVariable) and isinstance(var.to_index(), pd.MultiIndex):
+    if isinstance(var._data, indexing.PandasMultiIndexingAdapter):
         raise NotImplementedError(
             f"variable {name!r} is a MultiIndex, which cannot yet be "
-            "serialized to netCDF files. Instead, either use reset_index() "
+            "serialized. Instead, either use reset_index() "
             "to convert MultiIndex levels into coordinate variables instead "
             "or use https://cf-xarray.readthedocs.io/en/latest/coding.html."
         )
@@ -647,7 +647,9 @@ def cf_decoder(
     return variables, attributes
 
 
-def _encode_coordinates(variables, attributes, non_dim_coord_names):
+def _encode_coordinates(
+    variables: T_Variables, attributes: T_Attrs, non_dim_coord_names
+):
     # calculate global and variable specific coordinates
     non_dim_coord_names = set(non_dim_coord_names)
 
@@ -675,7 +677,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
                 variable_coordinates[k].add(coord_name)
 
             if any(
-                attr_name in v.encoding and coord_name in v.encoding.get(attr_name)
+                coord_name in v.encoding.get(attr_name, tuple())
                 for attr_name in CF_RELATED_DATA
             ):
                 not_technically_coordinates.add(coord_name)
@@ -742,7 +744,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
     return variables, attributes
 
 
-def encode_dataset_coordinates(dataset):
+def encode_dataset_coordinates(dataset: Dataset):
     """Encode coordinates on the given dataset object into variable specific
     and global attributes.
 
@@ -764,7 +766,7 @@ def encode_dataset_coordinates(dataset):
     )
 
 
-def cf_encoder(variables, attributes):
+def cf_encoder(variables: T_Variables, attributes: T_Attrs):
     """
     Encode a set of CF encoded variables and attributes.
     Takes a dicts of variables and attributes and encodes them