From 90e44867f7270e7de5e31b8713224039e39d9704 Mon Sep 17 00:00:00 2001
From: Alfonso Ladino <aladinor@unal.edu.co>
Date: Mon, 1 Jul 2024 09:33:15 -0500
Subject: [PATCH] Add open_datatree benchmark (#9158)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* open_datatree performance improvement on NetCDF files

* fixing issue with forward slashes

* fixing issue with pytest

* open datatree in zarr format improvement

* fixing incompatibility in returned object

* passing group parameter to opendatatree method and reducing duplicated code

* passing group parameter to opendatatree method - NetCDF

* Update xarray/backends/netCDF4_.py

renaming variables

Co-authored-by: Tom Nicholas <tom@cworthy.org>

* renaming variables

* renaming variables

* renaming group_store variable

* removing _open_datatree_netcdf function not used anymore in open_datatree implementations

* improving performance of open_datatree method

* renaming 'i' variable within list comprehension in open_store method for zarr datatree

* using the default generator instead of loading zarr groups in memory

* fixing issue with group path to avoid using group[1:] notation. Adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree for h5 files. Finally, separating positional from keyword args

* fixing issue with group path to avoid using group[1:] notation and adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree method for netCDF files

* fixing issue with group path to avoid using group[1:] notation and adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree method for zarr files

* adding 'mode' parameter to open_datatree method

* adding 'mode' parameter to H5NetCDFStore.open method

* adding new entry related to open_datatree performance improvement

* adding new entry related to open_datatree performance improvement

* Getting rid of unnecessary parameters for 'open_datatree' method for netCDF4 and Hdf5 backends

* passing parent argument into _iter_zarr_groups instead of group[1:] for creating group path

* adding benchmark test for opening a deeply nested data tree. This include a new class named 'IONestedDataTree' and another class for benchmarck named 'IOReadDataTreeNetCDF4'

* Update doc/whats-new.rst

---------

Co-authored-by: Tom Nicholas <tom@cworthy.org>
Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
---
 asv_bench/benchmarks/dataset_io.py | 113 ++++++++++++++++++++++++++++-
 xarray/backends/zarr.py            |   2 +-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py
index dcc2de0473b..0956be67dad 100644
--- a/asv_bench/benchmarks/dataset_io.py
+++ b/asv_bench/benchmarks/dataset_io.py
@@ -7,6 +7,8 @@
 import pandas as pd
 
 import xarray as xr
+from xarray.backends.api import open_datatree
+from xarray.core.datatree import DataTree
 
 from . import _skip_slow, parameterized, randint, randn, requires_dask
 
@@ -16,7 +18,6 @@
 except ImportError:
     pass
 
-
 os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 
 _ENGINES = tuple(xr.backends.list_engines().keys() - {"store"})
@@ -469,6 +470,116 @@ def create_delayed_write():
     return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
 
 
+class IONestedDataTree:
+    """
+    A few examples that benchmark reading/writing a heavily nested netCDF datatree with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_datatree(self, nchildren=10):
+        # multiple Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+        self.nchildren = nchildren
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
+        lons = xr.DataArray(
+            np.linspace(0, 360, self.nx),
+            dims=("lon",),
+            attrs={"units": "degrees east", "long_name": "longitude"},
+        )
+        lats = xr.DataArray(
+            np.linspace(-90, 90, self.ny),
+            dims=("lat",),
+            attrs={"units": "degrees north", "long_name": "latitude"},
+        )
+        self.ds["foo"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="foo",
+            attrs={"units": "foo units", "description": "a description"},
+        )
+        self.ds["bar"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="bar",
+            attrs={"units": "bar units", "description": "a description"},
+        )
+        self.ds["baz"] = xr.DataArray(
+            randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+            coords={"lon": lons, "lat": lats},
+            dims=("lon", "lat"),
+            name="baz",
+            attrs={"units": "baz units", "description": "a description"},
+        )
+
+        self.ds.attrs = {"history": "created for xarray benchmarking"}
+
+        self.oinds = {
+            "time": randint(0, self.nt, 120),
+            "lon": randint(0, self.nx, 20),
+            "lat": randint(0, self.ny, 10),
+        }
+        self.vinds = {
+            "time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
+            "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
+            "lat": slice(3, 20),
+        }
+        root = {f"group_{group}": self.ds for group in range(self.nchildren)}
+        nested_tree1 = {
+            f"group_{group}/subgroup_1": xr.Dataset() for group in range(self.nchildren)
+        }
+        nested_tree2 = {
+            f"group_{group}/subgroup_2": xr.DataArray(np.arange(1, 10)).to_dataset(
+                name="a"
+            )
+            for group in range(self.nchildren)
+        }
+        nested_tree3 = {
+            f"group_{group}/subgroup_2/sub-subgroup_1": self.ds
+            for group in range(self.nchildren)
+        }
+        dtree = root | nested_tree1 | nested_tree2 | nested_tree3
+        self.dtree = DataTree.from_dict(dtree)
+
+
+class IOReadDataTreeNetCDF4(IONestedDataTree):
+    def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        requires_dask()
+
+        self.make_datatree()
+        self.format = "NETCDF4"
+        self.filepath = "datatree.nc4.nc"
+        dtree = self.dtree
+        dtree.to_netcdf(filepath=self.filepath)
+
+    def time_load_datatree_netcdf4(self):
+        open_datatree(self.filepath, engine="netcdf4").load()
+
+    def time_open_datatree_netcdf4(self):
+        open_datatree(self.filepath, engine="netcdf4")
+
+
 class IOWriteNetCDFDask:
     timeout = 60
     repeat = 1
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 9796fcbf9e2..85a1a6e214c 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -446,7 +446,7 @@ def open_store(
             stacklevel=stacklevel,
             zarr_version=zarr_version,
         )
-        group_paths = [str(group / node[1:]) for node in _iter_zarr_groups(zarr_group)]
+        group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)]
         return {
             group: cls(
                 zarr_group.get(group),