From 130cdeca3fb09b9e91eff5ca1ad5d14750c9433c Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 18 Nov 2024 14:00:26 -0600 Subject: [PATCH 1/7] Add utility for opening remote files with fsspec --- xarray/backends/common.py | 9 +++++++++ xarray/backends/h5netcdf_.py | 8 ++++++++ xarray/backends/store.py | 1 + xarray/tests/test_backends.py | 21 +++++++++++++++++++++ 4 files changed, 39 insertions(+) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 11e6e20a9dc..b19f97cfb09 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -178,6 +178,15 @@ def _normalize_path_list( return _normalize_path_list(paths) +def _open_file(file, mode, storage_options=None): + import fsspec + + fs, _, paths = fsspec.get_fs_token_paths( + file, mode=mode, storage_options=storage_options + ) + return fs.open(paths[0], mode=mode) + + def _encode_variable_name(name): if name is None: name = NONE_VAR_NAME diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 95cc1a1e93d..353222e8c50 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -13,6 +13,7 @@ BackendEntrypoint, WritableCFDataStore, _normalize_path, + _open_file, datatree_from_dict_with_io_cleanup, find_root_and_group, ) @@ -149,9 +150,14 @@ def open( decode_vlen_strings=True, driver=None, driver_kwds=None, + storage_options=None, ): import h5netcdf + if isinstance(filename, str) and is_remote_uri(filename) and driver is None: + mode_ = "rb" if mode == "r" else mode + filename = _open_file(filename, mode=mode_, storage_options=storage_options) + if isinstance(filename, bytes): raise ValueError( "can't open netCDF4/HDF5 as bytes " @@ -425,6 +431,7 @@ def open_dataset( decode_vlen_strings=True, driver=None, driver_kwds=None, + storage_options=None, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = H5NetCDFStore.open( @@ -450,6 +457,7 @@ def open_dataset( drop_variables=drop_variables, use_cftime=use_cftime, decode_timedelta=decode_timedelta, + storage_options=None, ) return ds diff --git a/xarray/backends/store.py b/xarray/backends/store.py index b1b3956ca8e..ad5ff856bea 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -38,6 +38,7 @@ def open_dataset( drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, + storage_options=None, ) -> Dataset: assert isinstance(filename_or_obj, AbstractDataStore) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c543333c61e..225b7929792 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6489,3 +6489,24 @@ def test_zarr_safe_chunk_region(tmp_path): chunk = ds.isel(region) chunk = chunk.chunk() chunk.chunk().to_zarr(store, region=region) + + +@requires_h5netcdf +@requires_fsspec +def test_h5netcdf_storage_options() -> None: + with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2): + ds1 = create_test_data() + ds1.to_netcdf(f1, engine="h5netcdf") + + ds2 = create_test_data() + ds2.to_netcdf(f2, engine="h5netcdf") + + files = [f"file://{f}" for f in [f1, f2]] + ds = xr.open_mfdataset( + files, + engine="h5netcdf", + concat_dim="time", + combine="nested", + storage_options={"skip_instance_cache": False}, + ) + assert_identical(xr.concat([ds1, ds2], dim="time"), ds) From 8dbc2718b99418c339301ebfa6dd1ca452549a5a Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 18 Nov 2024 16:52:44 -0600 Subject: [PATCH 2/7] Apply Joe's suggestions from code review Co-authored-by: Joe Hamman --- xarray/backends/common.py | 2 +- xarray/backends/h5netcdf_.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index b19f97cfb09..ca0fbea0868 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -178,7 +178,7 @@ def _normalize_path_list( return _normalize_path_list(paths) -def _open_file(file, mode, storage_options=None): +def _open_remote_file(file, mode, storage_options=None): import fsspec fs, _, paths = fsspec.get_fs_token_paths( diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 353222e8c50..1c8edb5dc97 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -156,7 +156,7 @@ def open( if isinstance(filename, str) and is_remote_uri(filename) and driver is None: mode_ = "rb" if mode == "r" else mode - filename = _open_file(filename, mode=mode_, storage_options=storage_options) + filename = _open_remote_file(filename, mode=mode_, storage_options=storage_options) if isinstance(filename, bytes): raise ValueError( From 41b8c3bc2ec499572be133379ab7466b8812f0a9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:53:01 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/h5netcdf_.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 1c8edb5dc97..9c4461b2356 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -13,7 +13,6 @@ BackendEntrypoint, WritableCFDataStore, _normalize_path, - _open_file, datatree_from_dict_with_io_cleanup, find_root_and_group, ) @@ -156,7 +155,9 @@ def open( if isinstance(filename, str) and is_remote_uri(filename) and driver is None: mode_ = "rb" if mode == "r" else mode - filename = _open_remote_file(filename, mode=mode_, storage_options=storage_options) + filename = _open_remote_file( + filename, mode=mode_, storage_options=storage_options + ) if isinstance(filename, bytes): raise ValueError( From 4e13bd669992fc68c96b6e695ab0321226790da7 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 18 Nov 2024 16:55:29 -0600 Subject: [PATCH 4/7] Lint --- xarray/backends/h5netcdf_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 9c4461b2356..cc74a3eb24b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -13,6 +13,7 @@ BackendEntrypoint, WritableCFDataStore, _normalize_path, + _open_remote_file, datatree_from_dict_with_io_cleanup, find_root_and_group, ) From ccf9568aa07ad7ba00710641c2c2ce3d102f70f8 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 20 Nov 2024 12:17:52 -0600 Subject: [PATCH 5/7] Add what's new entry --- doc/whats-new.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ee826e6e56f..74b5990422a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,12 +32,15 @@ New Features - Optimize ffill, bfill with dask when limit is specified (:pull:`9771`). By `Joseph Nowak `_, and - `Patrick Hoefler `. + `Patrick Hoefler `_. - Allow wrapping ``np.ndarray`` subclasses, e.g. ``astropy.units.Quantity`` (:issue:`9704`, :pull:`9760`). By `Sam Levang `_ and `Tien Vo `_. - Optimize :py:meth:`DataArray.polyfit` and :py:meth:`Dataset.polyfit` with dask, when used with arrays with more than two dimensions. (:issue:`5629`). By `Deepak Cherian `_. +- Support for directly opening remote files as string paths (for example, ``s3://bucket/data.nc``) + with ``fsspec`` when using the ``h5netcdf`` engine (:issue:`9723`, :pull:`9797`). + By `James Bourbeau `_. Breaking changes ~~~~~~~~~~~~~~~~ From eea68ea66b3606b306894ca2ed189637389b6020 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 20 Nov 2024 15:20:25 -0600 Subject: [PATCH 6/7] Type hint --- xarray/backends/h5netcdf_.py | 6 +++--- xarray/backends/store.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index cc74a3eb24b..2cd32d0a2ed 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -150,7 +150,7 @@ def open( decode_vlen_strings=True, driver=None, driver_kwds=None, - storage_options=None, + storage_options: dict[str, Any] | None = None, ): import h5netcdf @@ -433,7 +433,7 @@ def open_dataset( decode_vlen_strings=True, driver=None, driver_kwds=None, - storage_options=None, + storage_options: dict[str, Any] | None = None, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = H5NetCDFStore.open( @@ -446,6 +446,7 @@ def open_dataset( decode_vlen_strings=decode_vlen_strings, driver=driver, driver_kwds=driver_kwds, + storage_options=storage_options, ) store_entrypoint = StoreBackendEntrypoint() @@ -459,7 +460,6 @@ def open_dataset( drop_variables=drop_variables, use_cftime=use_cftime, decode_timedelta=decode_timedelta, - storage_options=None, ) return ds diff --git a/xarray/backends/store.py b/xarray/backends/store.py index ad5ff856bea..b1b3956ca8e 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -38,7 +38,6 @@ def open_dataset( drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, - storage_options=None, ) -> Dataset: assert isinstance(filename_or_obj, AbstractDataStore) From 6c5c49b71cdbd2520814d5a1d88b0fc56ddbe674 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 20 Nov 2024 15:55:37 -0600 Subject: [PATCH 7/7] Make mypy happy --- xarray/backends/h5netcdf_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 2cd32d0a2ed..717ee48db3b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -169,7 +169,7 @@ def open( magic_number = read_magic_number_from_file(filename) if not magic_number.startswith(b"\211HDF\r\n\032\n"): raise ValueError( - f"{magic_number} is not the signature of a valid netCDF4 file" + f"{magic_number!r} is not the signature of a valid netCDF4 file" ) if format not in [None, "NETCDF4"]: