Commit

Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas committed Nov 23, 2024
2 parents d377780 + 1bb867d commit 3132f6a
Showing 9 changed files with 210 additions and 161 deletions.
1 change: 1 addition & 0 deletions DATATREE_MIGRATION_GUIDE.md
@@ -45,6 +45,7 @@ A number of other API changes have been made, which should only require minor mo
- The `DataTree.parent` property is now read-only. To assign ancestral relationships directly, you must instead use the `.children` property on the parent node, which remains settable.
- Similarly, the `parent` kwarg has been removed from the `DataTree.__init__` constructor.
- DataTree objects passed to the `children` kwarg in `DataTree.__init__` are now shallow-copied.
- `DataTree.map_over_subtree` has been renamed to `DataTree.map_over_datasets`, and it no longer works like a decorator. Instead, you use it to apply the function and its arguments directly, much as `xarray.apply_ufunc` works (see the sketch after this list).
- `DataTree.as_array` has been replaced by `DataTree.to_dataarray`.
- A number of methods which were not well tested have been (temporarily) disabled. In general, we have tried to keep only functionality that is known to work, with the plan to expand the API surface incrementally after release.
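
Below is a minimal sketch of the renamed method, assuming this release's API; the tree contents and the `scale` function are illustrative only.

```python
import xarray as xr

# A one-node tree; map_over_datasets applies a function to each node's dataset.
tree = xr.DataTree.from_dict({"/": xr.Dataset({"a": ("x", [1, 2, 3])})})

def scale(ds):
    return ds * 2

# Old style (removed): map_over_subtree behaved like a decorator.
# New style: pass the function (and any extra arguments) directly,
# similar to xarray.apply_ufunc.
doubled = tree.map_over_datasets(scale)
```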

38 changes: 20 additions & 18 deletions doc/whats-new.rst
@@ -16,24 +16,13 @@ What's New
.. _whats-new.2024.10.1:

v.2024.10.1 (unreleased)
------------------------


Breaking Changes
~~~~~~~~~~~~~~~~
- The minimum versions of some dependencies were changed
v.2024.11.0 (Nov 22, 2024)
--------------------------

===================== ========= =======
Package                     Old     New
===================== ========= =======
boto3                      1.28    1.29
dask-core                2023.9 2023.11
distributed              2023.9 2023.11
h5netcdf                    1.2     1.3
numbagg                   0.2.1     0.6
typing_extensions           4.7     4.8
===================== ========= =======
This release brings better support for wrapping JAX arrays and Astropy Quantity objects, :py:meth:`DataTree.persist`, algorithmic improvements
to many methods with dask (:py:meth:`Dataset.polyfit`, :py:meth:`Dataset.ffill`, :py:meth:`Dataset.bfill`, rolling reductions), and bug fixes.
Thanks to the 22 contributors to this release:
Benoit Bovy, Deepak Cherian, Dimitri Papadopoulos Orfanos, Holly Mandel, James Bourbeau, Joe Hamman, Justus Magin, Kai Mühlbauer, Lukas Trippe, Mathias Hauser, Maximilian Roos, Michael Niklas, Pascal Bourgault, Patrick Hoefler, Sam Levang, Sarah Charlotte Johnson, Scott Huberty, Stephan Hoyer, Tom Nicholas, Virgile Andreani, joseph nowak and tvo.

New Features
~~~~~~~~~~~~
@@ -64,10 +53,23 @@ New Features
underlying array's backend. Provides better support for certain wrapped array types
like ``jax.numpy.ndarray``; a short sketch follows this list. (:issue:`7848`, :pull:`9776`).
By `Sam Levang <https://github.com/slevang>`_.
- Speed up loading of large zarr stores using dask arrays. (:issue:`8902`)
By `Deepak Cherian <https://github.com/dcherian>`_.
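
A minimal sketch of the wrapped-array support described above, assuming a recent
JAX whose arrays implement the Python array API protocol::

    import jax.numpy as jnp
    import xarray as xr

    # xarray keeps the JAX array as the underlying data rather than
    # coercing it to numpy.
    da = xr.DataArray(jnp.arange(6.0).reshape(2, 3), dims=("x", "y"))
    print(type(da.data))  # a jax.Array, not numpy.ndarray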

Breaking changes
Breaking Changes
~~~~~~~~~~~~~~~~
- The minimum versions of some dependencies were changed

===================== ========= =======
Package                     Old     New
===================== ========= =======
boto3                      1.28    1.29
dask-core                2023.9 2023.11
distributed              2023.9 2023.11
h5netcdf                    1.2     1.3
numbagg                   0.2.1     0.6
typing_extensions           4.7     4.8
===================== ========= =======

Deprecations
~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion xarray/backends/plugins.py
@@ -200,7 +200,7 @@ def get_backend(engine: str | type[BackendEntrypoint]) -> BackendEntrypoint:
engines = list_engines()
if engine not in engines:
raise ValueError(
f"unrecognized engine {engine} must be one of your download engines: {list(engines)}"
f"unrecognized engine '{engine}' must be one of your download engines: {list(engines)}. "
"To install additional dependencies, see:\n"
"https://docs.xarray.dev/en/stable/user-guide/io.html \n"
"https://docs.xarray.dev/en/stable/getting-started-guide/installing.html"
183 changes: 102 additions & 81 deletions xarray/backends/zarr.py
@@ -37,6 +37,7 @@
from xarray.namedarray.utils import module_available

if TYPE_CHECKING:
from zarr import Array as ZarrArray
from zarr import Group as ZarrGroup

from xarray.backends.common import AbstractDataStore
@@ -443,7 +444,7 @@ def extract_zarr_variable_encoding(
shape = shape if shape else variable.shape
encoding = variable.encoding.copy()

safe_to_drop = {"source", "original_shape"}
safe_to_drop = {"source", "original_shape", "preferred_chunks"}
valid_encodings = {
"codecs",
"chunks",
@@ -871,16 +872,27 @@ def store(
else:
zarr = attempt_import("zarr")

existing_keys = tuple(self.zarr_group.array_keys())
if self._mode == "w":
# always overwrite, so we don't care about existing names,
# and consistency of encoding
new_variable_names = set(variables)
existing_keys = {}
existing_variable_names = {}
else:
existing_keys = tuple(self.zarr_group.array_keys())
existing_variable_names = {
vn for vn in variables if _encode_variable_name(vn) in existing_keys
}
new_variable_names = set(variables) - existing_variable_names

if self._mode == "r+":
new_names = [k for k in variables if k not in existing_keys]
if new_names:
raise ValueError(
f"dataset contains non-pre-existing variables {new_names}, "
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
)
if self._mode == "r+" and (
new_names := [k for k in variables if k not in existing_keys]
):
raise ValueError(
f"dataset contains non-pre-existing variables {new_names!r}, "
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
)

if self._append_dim is not None and self._append_dim not in existing_keys:
# For dimensions without coordinate values, we must parse
@@ -895,10 +907,6 @@ def store(
f"dataset dimensions {existing_dims}"
)

existing_variable_names = {
vn for vn in variables if _encode_variable_name(vn) in existing_keys
}
new_variable_names = set(variables) - existing_variable_names
variables_encoded, attributes = self.encode(
{vn: variables[vn] for vn in new_variable_names}, attributes
)
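
A hedged sketch of the mode semantics enforced above; the store path is hypothetical. `mode="r+"` may only modify pre-existing variables, so adding a new one requires `mode="a"`:

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [1, 2, 3])})
ds.to_zarr("store.zarr", mode="w")

ds2 = ds.assign(b=ds["a"] * 2)
# ds2.to_zarr("store.zarr", mode="r+")  # ValueError: non-pre-existing variables
ds2.to_zarr("store.zarr", mode="a")     # allowed: new variables may be added
```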
@@ -920,10 +928,9 @@ def store(
# Modified variables must use the same encoding as the store.
vars_with_encoding = {}
for vn in existing_variable_names:
if self._mode in ["a", "a-", "r+"]:
_validate_datatypes_for_zarr_append(
vn, existing_vars[vn], variables[vn]
)
_validate_datatypes_for_zarr_append(
vn, existing_vars[vn], variables[vn]
)
vars_with_encoding[vn] = variables[vn].copy(deep=False)
vars_with_encoding[vn].encoding = existing_vars[vn].encoding
vars_with_encoding, _ = self.encode(vars_with_encoding, {})
@@ -968,6 +975,69 @@ def store(
def sync(self):
pass

def _open_existing_array(self, *, name) -> ZarrArray:
import zarr

# TODO: if mode="a", consider overriding the existing variable
# metadata. This would need some care to work properly with region
# and append_dim.
if self._write_empty is not None:
# Write to zarr_group.chunk_store instead of zarr_group.store
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
# The open_consolidated() enforces a mode of r or r+
# (and to_zarr with region provided enforces a read mode of r+),
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
# and a 'normal' Store subtype for chunk_store.
# The exact type depends on if a local path was used, or a URL of some sort,
# but the point is that it's not a read-only ConsolidatedMetadataStore.
# It is safe to write chunk data to the chunk_store because no metadata would be changed by
# to_zarr with the region parameter:
# - Because the write mode is enforced to be r+, no new variables can be added to the store
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
# - Existing variables already have their attrs included in the consolidated metadata file.
# - The size of dimensions cannot be expanded; that would require a call using `append_dim`,
#   which is mutually exclusive with `region`
zarr_array = zarr.open(
store=(
self.zarr_group.store if _zarr_v3() else self.zarr_group.chunk_store
),
# TODO: see if zarr should normalize these strings.
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip("/"),
write_empty_chunks=self._write_empty,
)
else:
zarr_array = self.zarr_group[name]

return zarr_array
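
A hedged sketch of the region-write pattern that the long comment above addresses; the path is hypothetical. Metadata is written once up front, after which region writes touch only chunk data:

```python
import xarray as xr

# Chunked so compute=False can defer the chunk writes.
ds = xr.Dataset({"a": ("x", list(range(10)))}).chunk({"x": 5})
ds.to_zarr("regional.zarr", mode="w", compute=False)  # writes metadata only

# Later: fill part of the array. No metadata changes, which is why
# writing through chunk_store is safe in this code path.
ds.isel(x=slice(0, 5)).to_zarr("regional.zarr", region={"x": slice(0, 5)})
```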

def _create_new_array(
self, *, name, shape, dtype, fill_value, encoding, attrs
) -> ZarrArray:
if coding.strings.check_vlen_dtype(dtype) is str:
dtype = str

if self._write_empty is not None:
if (
"write_empty_chunks" in encoding
and encoding["write_empty_chunks"] != self._write_empty
):
raise ValueError(
'Differing "write_empty_chunks" values in encoding and parameters. '
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
)
else:
encoding["write_empty_chunks"] = self._write_empty

zarr_array = self.zarr_group.create(
name,
shape=shape,
dtype=dtype,
fill_value=fill_value,
**encoding,
)
zarr_array = _put_attrs(zarr_array, attrs)
return zarr_array
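
A hedged sketch of the `write_empty_chunks` guard above; the path is hypothetical and the call is expected to raise. Passing one value through `to_zarr(write_empty_chunks=...)` and a conflicting one through `encoding` trips the ValueError:

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [0, 0, 1])})
ds.to_zarr(
    "empty.zarr",
    mode="w",
    write_empty_chunks=False,
    encoding={"a": {"write_empty_chunks": True}},  # conflicts -> ValueError
)
```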

def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
"""
This provides a centralized method to set the variables on the data
Expand All @@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
dimensions.
"""

import zarr

existing_keys = tuple(self.zarr_group.array_keys())
is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3

@@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
else:
del v.encoding["_FillValue"]

zarr_array = None
zarr_shape = None
write_region = self._write_region if self._write_region is not None else {}
write_region = {dim: write_region.get(dim, slice(None)) for dim in dims}

if name in existing_keys:
if self._mode != "w" and name in existing_keys:
# existing variable
# TODO: if mode="a", consider overriding the existing variable
# metadata. This would need some case work properly with region
# and append_dim.
if self._write_empty is not None:
# Write to zarr_group.chunk_store instead of zarr_group.store
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
# The open_consolidated() enforces a mode of r or r+
# (and to_zarr with region provided enforces a read mode of r+),
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
# and a 'normal Store subtype for chunk_store.
# The exact type depends on if a local path was used, or a URL of some sort,
# but the point is that it's not a read-only ConsolidatedMetadataStore.
# It is safe to write chunk data to the chunk_store because no metadata would be changed by
# to_zarr with the region parameter:
# - Because the write mode is enforced to be r+, no new variables can be added to the store
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
# - Existing variables already have their attrs included in the consolidated metadata file.
# - The size of dimensions can not be expanded, that would require a call using `append_dim`
# which is mutually exclusive with `region`
zarr_array = zarr.open(
store=(
self.zarr_group.store
if _zarr_v3()
else self.zarr_group.chunk_store
),
# TODO: see if zarr should normalize these strings.
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip(
"/"
),
write_empty_chunks=self._write_empty,
)
else:
zarr_array = self.zarr_group[name]

zarr_array = self._open_existing_array(name=name)
if self._append_dim is not None and self._append_dim in dims:
# resize existing variable
append_axis = dims.index(self._append_dim)
@@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
shape=zarr_shape,
)

if name not in existing_keys:
if self._mode == "w" or name not in existing_keys:
# new variable
encoded_attrs = {}
encoded_attrs = {k: self.encode_attribute(v) for k, v in attrs.items()}
# the magic for storing the hidden dimension data
if is_zarr_v3_format:
encoding["dimension_names"] = dims
else:
encoded_attrs[DIMENSION_KEY] = dims
for k2, v2 in attrs.items():
encoded_attrs[k2] = self.encode_attribute(v2)

if coding.strings.check_vlen_dtype(dtype) is str:
dtype = str

if self._write_empty is not None:
if (
"write_empty_chunks" in encoding
and encoding["write_empty_chunks"] != self._write_empty
):
raise ValueError(
'Differing "write_empty_chunks" values in encoding and parameters'
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
)
else:
encoding["write_empty_chunks"] = self._write_empty

zarr_array = self.zarr_group.create(
name,
shape=shape,

encoding["exists_ok" if _zarr_v3() else "overwrite"] = (
True if self._mode == "w" else False
)

zarr_array = self._create_new_array(
name=name,
dtype=dtype,
shape=shape,
fill_value=fill_value,
**encoding,
encoding=encoding,
attrs=encoded_attrs,
)
zarr_array = _put_attrs(zarr_array, encoded_attrs)

writer.add(v.data, zarr_array, region)

(diffs for the remaining 5 changed files not shown)
