Skip to content

Commit

Permalink
Fix rechunking to a frequency with empty bins.
Browse files (browse the repository at this point in the history)
Closes #9360
  • Loading branch information
dcherian committed Aug 14, 2024
1 parent 28dfea7 commit f5c9525
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Deprecations
Bug fixes
~~~~~~~~~

- Fix bug with rechunking to a frequency when some periods contain no data (:issue:`9360`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Fix bug causing `DataTree.from_dict` to be sensitive to insertion order (:issue:`9276`, :pull:`9292`).
By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix resampling error with monthly, quarterly, or yearly frequencies with
Expand Down
10 changes: 7 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2752,17 +2752,21 @@ def _resolve_frequency(
)

assert variable.ndim == 1
chunks: tuple[int, ...] = tuple(
chunks = (
DataArray(
np.ones(variable.shape, dtype=int),
dims=(name,),
coords={name: variable},
)
.resample({name: resampler})
.sum()
.data.tolist()
)
return chunks
# Empty bins (binning) or empty time periods (resampling) produce NaN
# counts, which turns the integer result into floats. Drop those entries.
if chunks.dtype.kind == "f":
chunks = chunks.dropna(name).astype(int)
chunks_tuple: tuple[int, ...] = tuple(chunks.data.tolist())
return chunks_tuple

chunks_mapping_ints: Mapping[Any, T_ChunkDim] = {
name: (
Expand Down
24 changes: 17 additions & 7 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,24 +1209,34 @@ def get_dask_names(ds):
),
)
@pytest.mark.parametrize("freq", ["D", "W", "5ME", "YE"])
def test_chunk_by_frequency(self, freq, calendar) -> None:
@pytest.mark.parametrize("add_gap", [True, False])
def test_chunk_by_frequency(self, freq: str, calendar: str, add_gap: bool) -> None:
import dask.array

N = 365 * 2
ΔN = 28
time = xr.date_range(
"2001-01-01", periods=N + ΔN, freq="D", calendar=calendar
).to_numpy()
if add_gap:
# introduce an empty bin
time[31 : 31 + ΔN] = np.datetime64("NaT")
time = time[~np.isnat(time)]
else:
time = time[:N]

ds = Dataset(
{
"pr": ("time", dask.array.random.random((N), chunks=(20))),
"pr2d": (("x", "time"), dask.array.random.random((10, N), chunks=(20))),
"ones": ("time", np.ones((N,))),
},
coords={
"time": xr.date_range(
"2001-01-01", periods=N, freq="D", calendar=calendar
)
},
coords={"time": time},
)
rechunked = ds.chunk(x=2, time=TimeResampler(freq))
expected = tuple(ds.ones.resample(time=freq).sum().data.tolist())
expected = tuple(
ds.ones.resample(time=freq).sum().dropna("time").astype(int).data.tolist()
)
assert rechunked.chunksizes["time"] == expected
assert rechunked.chunksizes["x"] == (2,) * 5

Expand Down

0 comments on commit f5c9525

Please sign in to comment.