From caed27437cc695e6fc83475c24c9ae2268806f28 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Sun, 30 Jun 2024 16:03:46 +0200 Subject: [PATCH] `"source"` encoding for datasets opened from `fsspec` objects (#8923) * draft for setting `source` from pre-opened `fsspec` file objects * refactor to only import `fsspec` if we're actually going to check Could use `getattr(filename_or_obj, "path", filename_or_obj)` to avoid `isinstance` checks. * replace with a simple `getattr` on `"path"` * add a test * whats-new entry * open the file as a context manager --- doc/whats-new.rst | 2 ++ xarray/backends/api.py | 7 +++++-- xarray/tests/test_backends.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c58f73cb1fa..0174e16602f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,8 @@ New Features ~~~~~~~~~~~~ - Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). By `Martin Raspaud `_. +- Extract the source url from fsspec objects (:issue:`9142`, :pull:`8923`). + By `Justus Magin `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7054c62126e..521bdf65e6a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -382,8 +382,11 @@ def _dataset_from_backend_dataset( ds.set_close(backend_ds._close) # Ensure source filename always stored in dataset object - if "source" not in ds.encoding and isinstance(filename_or_obj, (str, os.PathLike)): - ds.encoding["source"] = _normalize_path(filename_or_obj) + if "source" not in ds.encoding: + path = getattr(filename_or_obj, "path", filename_or_obj) + + if isinstance(path, (str, os.PathLike)): + ds.encoding["source"] = _normalize_path(path) return ds diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 177700a5404..15485dc178a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5151,6 +5151,21 @@ def test_source_encoding_always_present_with_pathlib() -> None: assert ds.encoding["source"] == tmp +@requires_h5netcdf +@requires_fsspec +def test_source_encoding_always_present_with_fsspec() -> None: + import fsspec + + rnddata = np.random.randn(10) + original = Dataset({"foo": ("x", rnddata)}) + with create_tmp_file() as tmp: + original.to_netcdf(tmp) + + fs = fsspec.filesystem("file") + with fs.open(tmp) as f, open_dataset(f) as ds: + assert ds.encoding["source"] == tmp + + def _assert_no_dates_out_of_range_warning(record): undesired_message = "dates out of range" for warning in record: