Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Kvikio backend entrypoint #10

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9deadb7
Add Kvikio backend entrypoint
dcherian Aug 2, 2022
aa2dc91
Add demo notebook
dcherian Aug 2, 2022
7fb4b94
Update kvikio notebook
dcherian Aug 16, 2022
743fe7d
Merge branch 'main' into kvikio-entrypoint
dcherian Aug 17, 2022
5d501e4
Merge branch 'main' into kvikio-entrypoint
andersy005 Aug 17, 2022
facf5f7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 17, 2022
f3f5189
Update cupy_xarray/kvikio.py
dcherian Aug 17, 2022
9c98d19
Merge branch 'main' into kvikio-entrypoint
andersy005 Jan 3, 2023
dd8bc57
Merge branch 'main' into kvikio-entrypoint
dcherian Jan 20, 2023
d2da1e4
Add url, description.
dcherian Jan 21, 2023
b87c3c2
Working
dcherian Aug 18, 2023
87cb74e
Updated notebook
dcherian Aug 22, 2023
d7394ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 22, 2023
1b23fef
Merge remote-tracking branch 'upstream/main' into kvikio-entrypoint
dcherian Nov 3, 2023
ca0cf45
Add tests
dcherian Nov 3, 2023
97260d6
Merge branch 'main' into kvikio-entrypoint
weiji14 Jun 21, 2024
5d27b26
Move kvikio notebook under docs/source
weiji14 Jun 21, 2024
85491d7
Add zarr as a dependency in ci/doc.yml
weiji14 Jun 22, 2024
c470b97
Add entry for KvikioBackendEntrypoint in API docs
weiji14 Jun 22, 2024
95efa18
Fix input argument into CupyZarrArrayWrapper
weiji14 Jun 22, 2024
d684dad
Merge branch 'main' into kvikio-entrypoint
weiji14 Dec 14, 2024
ae2a7f1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 14, 2024
15fbafd
Re-add kvikio backend entrypoint to pyproject.toml
weiji14 Dec 14, 2024
f3df115
Fix C408 and E402
weiji14 Dec 14, 2024
4e1857a
Use get_duck_array instead of get_array
weiji14 Dec 14, 2024
7345b61
Fix SIM108 Use ternary operator
weiji14 Dec 16, 2024
e2b410e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- furo>=2024.8.6
- myst-nb
- xarray
- zarr
- pip:
# relative to this file. Needs to be editable to be accepted.
- --editable ..
3 changes: 2 additions & 1 deletion cupy_xarray/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import _version
from .accessors import CupyDataArrayAccessor, CupyDatasetAccessor # noqa
from .accessors import CupyDataArrayAccessor, CupyDatasetAccessor # noqa: F401
from .kvikio import KvikioBackendEntrypoint # noqa: F401

__version__ = _version.get_versions()["version"]
230 changes: 230 additions & 0 deletions cupy_xarray/kvikio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
"""
:doc:`kvikIO <kvikio:index>` backend for xarray to read Zarr stores directly into CuPy
arrays in GPU memory.
"""

import os
import warnings

import cupy as cp
import numpy as np
from xarray import Variable
from xarray.backends import zarr as zarr_backend
from xarray.backends.common import _normalize_path # TODO: can this be public
from xarray.backends.store import StoreBackendEntrypoint
from xarray.backends.zarr import ZarrArrayWrapper, ZarrBackendEntrypoint, ZarrStore
from xarray.core import indexing
from xarray.core.utils import close_on_error # TODO: can this be public.

try:
import kvikio.zarr
import zarr

has_kvikio = True
except ImportError:
has_kvikio = False


# TODO: minimum kvikio version for supporting consolidated
# TODO: minimum xarray version for ZarrArrayWrapper._array 2023.10.0?


class DummyZarrArrayWrapper(ZarrArrayWrapper):
def __init__(self, array: np.ndarray):
assert isinstance(array, np.ndarray)
self._array = array
self.filters = None
self.dtype = array.dtype
self.shape = array.shape

def __array__(self):
return self._array

def get_array(self):
return self._array

def __getitem__(self, key):
return self._array[key]


class CupyZarrArrayWrapper(ZarrArrayWrapper):
def __array__(self):
return self.get_array()


class EagerCupyZarrArrayWrapper(ZarrArrayWrapper):
"""Used to wrap dimension coordinates."""

def __array__(self):
return self._array[:].get()

def get_duck_array(self):
# total hack: make a numpy array look like a Zarr array
# this gets us through Xarray's backend layers
return DummyZarrArrayWrapper(self._array[:].get())


class GDSZarrStore(ZarrStore):
@classmethod
def open_group(
cls,
store,
mode="r",
synchronizer=None,
group=None,
consolidated=False,
consolidate_on_close=False,
chunk_store=None,
storage_options=None,
append_dim=None,
write_region=None,
safe_chunks=True,
stacklevel=2,
):
# zarr doesn't support pathlib.Path objects yet. zarr-python#601
if isinstance(store, os.PathLike):
store = os.fspath(store)

open_kwargs = {
"mode": mode,
"synchronizer": synchronizer,
"path": group,
########## NEW STUFF
"meta_array": cp.empty(()),
}
open_kwargs["storage_options"] = storage_options

if chunk_store:
open_kwargs["chunk_store"] = chunk_store
if consolidated is None:
consolidated = False

store = kvikio.zarr.GDSStore(store)

if consolidated is None:
try:
zarr_group = zarr.open_consolidated(store, **open_kwargs)
except KeyError:
warnings.warn(
"Failed to open Zarr store with consolidated metadata, "
"falling back to try reading non-consolidated metadata. "
"This is typically much slower for opening a dataset. "
"To silence this warning, consider:\n"
"1. Consolidating metadata in this existing store with "
"zarr.consolidate_metadata().\n"
"2. Explicitly setting consolidated=False, to avoid trying "
"to read consolidate metadata, or\n"
"3. Explicitly setting consolidated=True, to raise an "
"error in this case instead of falling back to try "
"reading non-consolidated metadata.",
RuntimeWarning,
stacklevel=stacklevel,
)
zarr_group = zarr.open_group(store, **open_kwargs)
elif consolidated:
# TODO: an option to pass the metadata_key keyword
zarr_group = zarr.open_consolidated(store, **open_kwargs)
else:
zarr_group = zarr.open_group(store, **open_kwargs)

return cls(
zarr_group,
mode,
consolidate_on_close,
append_dim,
write_region,
safe_chunks,
)

def open_store_variable(self, name, zarr_array):
try_nczarr = self._mode == "r"
dimensions, attributes = zarr_backend._get_zarr_dims_and_attrs(
zarr_array, zarr_backend.DIMENSION_KEY, try_nczarr
)

#### Changed from zarr array wrapper
# we want indexed dimensions to be loaded eagerly
# Right now we load in to device and then transfer to host
# But these should be small-ish arrays
# TODO: can we tell GDSStore to load as numpy array directly
# not cupy array?
array_wrapper = EagerCupyZarrArrayWrapper if name in dimensions else CupyZarrArrayWrapper
data = indexing.LazilyIndexedArray(array_wrapper(zarr_array))
Copy link
Member

@weiji14 weiji14 Jun 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently stuck on this line, and trying to figure out what to change in relation to pydata/xarray#6874. My guess is that in an ideal world where we've solved all the numpy/cupy duck typing issues, we could just use the recently modified ZarrArrayWrapper (pydata/xarray#8472) like this:

Suggested change
data = indexing.LazilyIndexedArray(array_wrapper(zarr_array))
data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array))

However, that leads to this error:

____________________________________________________________________________________ test_lazy_load[True] ____________________________________________________________________________________

consolidated = True, store = PosixPath('/tmp/pytest-of-weiji/pytest-32/test_lazy_load_True_0/kvikio.zarr')

    @pytest.mark.parametrize("consolidated", [True, False])
    def test_lazy_load(consolidated, store):
>       with xr.open_dataset(store, engine="kvikio", consolidated=consolidated) as ds:

cupy_xarray/tests/test_kvikio.py:37: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/backends/api.py:571: in open_dataset
    backend_ds = backend.open_dataset(
cupy_xarray/kvikio.py:241: in open_dataset
    ds = store_entrypoint.open_dataset(
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/backends/store.py:58: in open_dataset
    ds = Dataset(vars, attrs=attrs)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/dataset.py:711: in __init__
    variables, coord_names, dims, indexes, _ = merge_data_and_coords(
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/dataset.py:425: in merge_data_and_coords
    return merge_core(
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/merge.py:699: in merge_core
    collected = collect_variables_and_indexes(aligned, indexes=indexes)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/merge.py:362: in collect_variables_and_indexes
    idx, idx_vars = create_default_index_implicit(variable)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexes.py:1404: in create_default_index_implicit
    index = PandasIndex.from_variables(dim_var, options={})
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexes.py:654: in from_variables
    obj = cls(data, dim, coord_dtype=var.dtype)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexes.py:589: in __init__
    index = safe_cast_to_index(array)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexes.py:469: in safe_cast_to_index
    index = pd.Index(np.asarray(array), **kwargs)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexing.py:509: in __array__
    return np.asarray(self.get_duck_array(), dtype=dtype)
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/backends/common.py:181: in get_duck_array
    return self[key]  # type: ignore [index]
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/backends/zarr.py:104: in __getitem__
    return indexing.explicit_indexing_adapter(
../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexing.py:1017: in explicit_indexing_adapter
    indexable = NumpyIndexingAdapter(result)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <[AttributeError("'NumpyIndexingAdapter' object has no attribute 'array'") raised in repr()] NumpyIndexingAdapter object at 0x7f71beb14a00>
array = array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4])

    def __init__(self, array):
        # In NumpyIndexingAdapter we only allow to store bare np.ndarray
        if not isinstance(array, np.ndarray):
>           raise TypeError(
                "NumpyIndexingAdapter only wraps np.ndarray. "
                f"Trying to wrap {type(array)}"
            )
E           TypeError: NumpyIndexingAdapter only wraps np.ndarray. Trying to wrap <class 'cupy.ndarray'>

../../../mambaforge/envs/cupy-xarray-doc/lib/python3.10/site-packages/xarray/core/indexing.py:1493: TypeError
================================================================================== short test summary info ===================================================================================
FAILED cupy_xarray/tests/test_kvikio.py::test_lazy_load[True] - TypeError: NumpyIndexingAdapter only wraps np.ndarray. Trying to wrap <class 'cupy.ndarray'>
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Looked into this a bit and got lost in all the xarray class inheritance logic and mixins... @dcherian, could you point out what's the recommended route forward? Is there something we still need to upstream into xarray? I'm testing this on xarray=2024.6.0 btw.

Output of xr.show_versions() for reference:

INSTALLED VERSIONS
------------------
commit: None
python: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]
python-bits: 64
OS: Linux
OS-release: 6.7.12+bpo-amd64
machine: x86_64
processor: 
byteorder: little
LC_ALL: None
LANG: en_NZ.UTF-8
LOCALE: ('en_NZ', 'UTF-8')
libhdf5: 1.14.3
libnetcdf: 4.9.2

xarray: 2024.6.0
pandas: 2.2.2
numpy: 1.26.4
scipy: None
netCDF4: 1.7.1
pydap: None
h5netcdf: None
h5py: None
zarr: 2.18.2
cftime: 1.6.4
nc_time_axis: None
iris: None
bottleneck: None
dask: 2024.6.2
distributed: None
matplotlib: None
cartopy: None
seaborn: None
numbagg: None
fsspec: 2024.6.0
cupy: 13.2.0
pint: None
sparse: None
flox: None
numpy_groupies: None
setuptools: 70.0.0
pip: 24.0
conda: None
pytest: 8.2.2
mypy: None
IPython: 8.25.0
sphinx: 7.3.7

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also facing a problem at the same place in the code when it comes to dimension coordinates. Consider the dummy dataset:

<xarray.Dataset> Size: 352B
Dimensions:      (instrument: 3, loc: 2, time: 4)
Coordinates:
  * instrument   (instrument) <U8 96B 'manufac1' 'manufac2' 'manufac3'
    lat          (loc) float64 16B ...
    lon          (loc) float64 16B ...
  * time         (time) datetime64[ns] 32B 2014-09-06 2014-09-07 ... 2014-09-09
Dimensions without coordinates: loc
Data variables:
    temperature  (loc, instrument, time) float64 192B ...
-------------------------------------------------

that I write to a zarr file:

myzarr = 'mytest.zarr'
ds.to_zarr(myzarr, mode="w")

which is ok to read back with:

ds2 = xr.open_dataset(myzarr)

But when I want to use the kvikio engine:

ds3 = xr.open_dataset(myzarr, engine='kvikio', consolidated=False)

it fails with:

Traceback (most recent call last):
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/conventions.py", line 450, in decode_cf_variables
    new_vars[k] = decode_cf_variable(
                  ^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/conventions.py", line 291, in decode_cf_variable
    var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/coding/times.py", line 992, in decode
    dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/coding/times.py", line 214, in _decode_cf_datetime_dtype
    [first_n_items(values, 1) or [0], last_item(values) or [0]]
     ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/core/formatting.py", line 97, in first_n_items
    return np.ravel(to_duck_array(array))[:n_desired]
                    ^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/namedarray/pycompat.py", line 138, in to_duck_array
    return np.asarray(data)  # type: ignore[return-value]
           ^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/core/indexing.py", line 574, in __array__
    return np.asarray(self.get_duck_array(), dtype=dtype)
                      ^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/core/indexing.py", line 577, in get_duck_array
    return self.array.get_duck_array()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/core/indexing.py", line 651, in get_duck_array
    array = self.array[self.key]
            ~~~~~~~~~~^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/backends/zarr.py", line 104, in __getitem__
    return indexing.explicit_indexing_adapter(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/core/indexing.py", line 1015, in explicit_indexing_adapter
    result = raw_indexing_method(raw_key.tuple)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/backends/zarr.py", line 94, in _getitem
    return self._array[key]
           ~~~~~~~~~~~^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 800, in __getitem__
    result = self.get_basic_selection(pure_selection, fields=fields)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 926, in get_basic_selection
    return self._get_basic_selection_nd(selection=selection, out=out, fields=fields)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 968, in _get_basic_selection_nd
    return self._get_selection(indexer=indexer, out=out, fields=fields)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 1343, in _get_selection
    self._chunk_getitems(
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 2183, in _chunk_getitems
    self._process_chunk(
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 2096, in _process_chunk
    chunk = self._decode_chunk(cdata)
            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/zarr/core.py", line 2352, in _decode_chunk
    chunk = self._compressor.decode(cdata)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "numcodecs/blosc.pyx", line 563, in numcodecs.blosc.Blosc.decode
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/numcodecs/compat.py", line 154, in ensure_contiguous_ndarray
    return ensure_ndarray(
           ^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/numcodecs/compat.py", line 67, in ensure_ndarray
    return np.array(ensure_ndarray_like(buf), copy=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "cupy/_core/core.pyx", line 1481, in cupy._core.core._ndarray_base.__array__
TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/orliac/RADIOBLOCKS/new_inst/test_kvikio.py", line 72, in <module>
    ds3 = xr.open_dataset(myzarr, engine='kvikio', consolidated=False)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/backends/api.py", line 588, in open_dataset
    backend_ds = backend.open_dataset(
                 ^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/cupy-xarray/cupy_xarray/kvikio.py", line 248, in open_dataset
    ds = store_entrypoint.open_dataset(
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/backends/store.py", line 46, in open_dataset
    vars, attrs, coord_names = conventions.decode_cf_variables(
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/new_inst/VENV2/lib/python3.11/site-packages/xarray/conventions.py", line 461, in decode_cf_variables
    raise type(e)(f"Failed to decode variable {k!r}: {e}") from e
TypeError: Failed to decode variable 'time': Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

Any idea?

Just a bit of context: I'm working on a large radio astronomy project called RADIOBLOCKS where I'm willing to process very large Dask Xarray Datasets on a GPU cluster, hence my interest for your work on kvikio. Our GPU cluster has H100 and GDS if that is of interest for you for running benchmarks.
Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The core issue here is that the "eager" wrapper isn't working as expected. Dimension coordinates must be eagerly loaded in to CPU memory, and this is a bit hacky at the moment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@weiji14 I think pydata/xarray#8408 fixes your issue. It needs a test though.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried @dcherian's suggestion, with

    print("raw_key =\n", raw_key)
    print("numpy_indices =\n", numpy_indices)
    result = raw_indexing_method(raw_key.tuple)
    print("result =", result, type(result))
    
    if numpy_indices.tuple:
        # index the loaded np.ndarray
        #indexable = NumpyIndexingAdapter(result)
        #result = apply_indexer(indexable, numpy_indices)
        result = as_indexable(result)[numpy_indices]
    return result

but now it fails with:

raw_key =
 OuterIndexer((slice(0, 1, 1),))
numpy_indices =
 OuterIndexer((slice(None, None, None),))
result =
 [1867128.] <class 'numpy.ndarray'>
array is np.ndarray
Traceback (most recent call last):
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/conventions.py", line 448, in decode_cf_variables
    new_vars[k] = decode_cf_variable(
                  ^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/conventions.py", line 289, in decode_cf_variable
    var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/coding/times.py", line 992, in decode
    dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/coding/times.py", line 214, in _decode_cf_datetime_dtype
    [first_n_items(values, 1) or [0], last_item(values) or [0]]
     ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/formatting.py", line 97, in first_n_items
    return np.ravel(to_duck_array(array))[:n_desired]
                    ^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/namedarray/pycompat.py", line 138, in to_duck_array
    return np.asarray(data)  # type: ignore[return-value]
           ^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 574, in __array__
    return np.asarray(self.get_duck_array(), dtype=dtype)
                      ^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 577, in get_duck_array
    return self.array.get_duck_array()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 651, in get_duck_array
    array = self.array[self.key]
            ~~~~~~~~~~^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/backends/netCDF4_.py", line 100, in __getitem__
    return indexing.explicit_indexing_adapter(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 1031, in explicit_indexing_adapter
    result = as_indexable(result)[numpy_indices]
             ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 1526, in __getitem__
    self._check_and_raise_if_non_basic_indexer(indexer)
  File "/home/orliac/RADIOBLOCKS/kuma_inst/VENV/lib/python3.11/site-packages/xarray/core/indexing.py", line 550, in _check_and_raise_if_non_basic_indexer
    raise TypeError(
TypeError: Vectorized indexing with vectorized or outer indexers is not supported. Please use .vindex and .oindex properties to index the array.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@orliac What xarray version do you use?

I think this might be a relevant to this PR:
pydata/xarray#8845

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @negin513
I had this version installed: xarray 2024.7.1.dev57+g25debff9.d20240911


attributes = dict(attributes)
encoding = {
"chunks": zarr_array.chunks,
"preferred_chunks": dict(zip(dimensions, zarr_array.chunks)),
"compressor": zarr_array.compressor,
"filters": zarr_array.filters,
}
# _FillValue needs to be in attributes, not encoding, so it will get
# picked up by decode_cf
if zarr_array.fill_value is not None:
attributes["_FillValue"] = zarr_array.fill_value

return Variable(dimensions, data, attributes, encoding)


class KvikioBackendEntrypoint(ZarrBackendEntrypoint):
"""
Xarray backend to read Zarr stores using 'kvikio' engine.

For more information about the underlying library, visit
:doc:`kvikIO's Zarr page<kvikio:zarr>`.
"""

available = has_kvikio
description = "Open zarr files (.zarr) using Kvikio"
url = "https://docs.rapids.ai/api/kvikio/stable/api/#zarr"

# disabled by default
# We need to provide this because of the subclassing from
# ZarrBackendEntrypoint
def guess_can_open(self, filename_or_obj):
return False

def open_dataset(
self,
filename_or_obj,
mask_and_scale=True,
decode_times=True,
concat_characters=True,
decode_coords=True,
drop_variables=None,
use_cftime=None,
decode_timedelta=None,
group=None,
mode="r",
synchronizer=None,
consolidated=None,
chunk_store=None,
storage_options=None,
stacklevel=3,
):
filename_or_obj = _normalize_path(filename_or_obj)
store = GDSZarrStore.open_group(
filename_or_obj,
group=group,
mode=mode,
synchronizer=synchronizer,
consolidated=consolidated,
consolidate_on_close=False,
chunk_store=chunk_store,
storage_options=storage_options,
stacklevel=stacklevel + 1,
)

store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
ds = store_entrypoint.open_dataset(
store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)
return ds
54 changes: 54 additions & 0 deletions cupy_xarray/tests/test_kvikio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import cupy as cp
import numpy as np
import pytest
import xarray as xr
from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin

kvikio = pytest.importorskip("kvikio")
zarr = pytest.importorskip("zarr")

import kvikio.zarr # noqa
import xarray.core.indexing # noqa


@pytest.fixture
def store(tmp_path):
ds = xr.Dataset(
{
"a": ("x", np.arange(10), {"foo": "bar"}),
"scalar": np.array(1),
},
coords={"x": ("x", np.arange(-5, 5))},
)

for var in ds.variables:
ds[var].encoding["compressor"] = None

store_path = tmp_path / "kvikio.zarr"
ds.to_zarr(store_path, consolidated=True)
return store_path


def test_entrypoint():
assert "kvikio" in xr.backends.list_engines()


@pytest.mark.parametrize("consolidated", [True, False])
def test_lazy_load(consolidated, store):
with xr.open_dataset(store, engine="kvikio", consolidated=consolidated) as ds:
for _, da in ds.data_vars.items():
assert isinstance(da.variable._data, ExplicitlyIndexedNDArrayMixin)


@pytest.mark.parametrize("indexer", [slice(None), slice(2, 4), 2, [2, 3, 5]])
def test_lazy_indexing(indexer, store):
with xr.open_dataset(store, engine="kvikio") as ds:
ds = ds.isel(x=indexer)
for _, da in ds.data_vars.items():
assert isinstance(da.variable._data, ExplicitlyIndexedNDArrayMixin)

loaded = ds.compute()
for _, da in loaded.data_vars.items():
if da.ndim == 0:
continue
assert isinstance(da.data, cp.ndarray)
13 changes: 13 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,16 @@ Methods

Dataset.cupy.as_cupy
Dataset.cupy.as_numpy


KvikIO engine
-------------

.. currentmodule:: cupy_xarray

.. automodule:: cupy_xarray.kvikio

.. autosummary::
:toctree: generated/

KvikioBackendEntrypoint
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"python": ("https://docs.python.org/3/", None),
"dask": ("https://docs.dask.org/en/latest", None),
"cupy": ("https://docs.cupy.dev/en/latest", None),
"kvikio": ("https://docs.rapids.ai/api/kvikio/stable", None),
"xarray": ("http://docs.xarray.dev/en/latest/", None),
}

Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Large parts of this documentations comes from [SciPy 2023 Xarray on GPUs tutoria
source/high-level-api
source/apply-ufunc
source/real-example-1
source/kvikio


**Tutorials & Presentations**:
Expand Down
Loading
Loading