Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
hollymandel authored Aug 21, 2024
2 parents da54a7f + ed5900b commit 31f40d1
Show file tree
Hide file tree
Showing 17 changed files with 605 additions and 313 deletions.
1 change: 0 additions & 1 deletion doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,6 @@

conventions.decode_cf_variables

coding.variables.UnsignedIntegerCoder
coding.variables.CFMaskCoder
coding.variables.CFScaleOffsetCoder

Expand Down
3 changes: 3 additions & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@
"matplotlib colormap name": ":doc:`matplotlib colormap name <matplotlib:gallery/color/colormap_reference>`",
"matplotlib axes object": ":py:class:`matplotlib axes object <matplotlib.axes.Axes>`",
"colormap": ":py:class:`colormap <matplotlib.colors.Colormap>`",
# xarray terms
"dim name": ":term:`dimension name <name>`",
"var name": ":term:`variable name <name>`",
# objects without namespace: xarray
"DataArray": "~xarray.DataArray",
"Dataset": "~xarray.Dataset",
Expand Down
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ New Features
~~~~~~~~~~~~
- Make chunk manager an option in ``set_options`` (:pull:`9362`).
By `Tom White <https://github.com/tomwhite>`_.
- Allow data variable specific ``constant_values`` in the dataset ``pad`` function (:pull:`9353`).
By `Tiago Sanona <https://github.com/tsanona>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand All @@ -47,6 +49,9 @@ Bug fixes
date "0001-01-01". (:issue:`9108`, :pull:`9116`) By `Spencer Clark
<https://github.com/spencerkclark>`_ and `Deepak Cherian
<https://github.com/dcherian>`_.
- Fix issue with passing parameters to ZarrStore.open_store when opening
datatree in zarr format (:issue:`9376`, :pull:`9377`).
By `Alfonso Ladino <https://github.com/aladinor>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
26 changes: 24 additions & 2 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1225,15 +1225,37 @@ def open_datatree(
filename_or_obj = _normalize_path(filename_or_obj)
if group:
parent = NodePath("/") / NodePath(group)
stores = ZarrStore.open_store(filename_or_obj, group=parent)
stores = ZarrStore.open_store(
filename_or_obj,
group=parent,
mode=mode,
synchronizer=synchronizer,
consolidated=consolidated,
consolidate_on_close=False,
chunk_store=chunk_store,
storage_options=storage_options,
stacklevel=stacklevel + 1,
zarr_version=zarr_version,
)
if not stores:
ds = open_dataset(
filename_or_obj, group=parent, engine="zarr", **kwargs
)
return DataTree.from_dict({str(parent): ds})
else:
parent = NodePath("/")
stores = ZarrStore.open_store(filename_or_obj, group=parent)
stores = ZarrStore.open_store(
filename_or_obj,
group=parent,
mode=mode,
synchronizer=synchronizer,
consolidated=consolidated,
consolidate_on_close=False,
chunk_store=chunk_store,
storage_options=storage_options,
stacklevel=stacklevel + 1,
zarr_version=zarr_version,
)
ds = open_dataset(filename_or_obj, group=parent, engine="zarr", **kwargs)
tree_root = DataTree.from_dict({str(parent): ds})
for path_group, store in stores.items():
Expand Down
239 changes: 131 additions & 108 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def _is_time_like(units):


def _check_fill_values(attrs, name, dtype):
""" "Check _FillValue and missing_value if available.
"""Check _FillValue and missing_value if available.
Return dictionary with raw fill values and set with encoded fill values.
Expand Down Expand Up @@ -298,18 +298,87 @@ def _check_fill_values(attrs, name, dtype):
return raw_fill_dict, encoded_fill_values


def _convert_unsigned_fill_value(
    name: T_Name,
    data: Any,
    unsigned: str,
    raw_fill_value: Any,
    encoded_fill_values: set,
) -> Any:
    """Apply the ``_Unsigned`` convention when decoding integer data.

    Signed on-disk data with ``_Unsigned == "true"`` is lazily reinterpreted
    as the same-width unsigned dtype (and vice versa for ``"false"``), and
    ``encoded_fill_values`` is updated in place so the fill value is matched
    against the reinterpreted values. Non-integer data triggers a
    SerializationWarning and is returned unchanged.
    """
    kind = data.dtype.kind
    if kind == "i" and unsigned == "true":
        # stored as signed, should be treated as unsigned
        target_dtype = np.dtype(f"u{data.dtype.itemsize}")
        if raw_fill_value is not None:
            encoded_fill_values.remove(raw_fill_value)
            # reinterpret the bit pattern via view to prevent OverflowError
            reinterpreted_fill = np.array(raw_fill_value, dtype=data.dtype).view(
                target_dtype
            )
            encoded_fill_values.add(reinterpreted_fill.item())
        data = lazy_elemwise_func(
            data, partial(np.asarray, dtype=target_dtype), target_dtype
        )
    elif kind == "u" and unsigned == "false":
        # stored as unsigned, should be treated as signed
        target_dtype = np.dtype(f"i{data.dtype.itemsize}")
        data = lazy_elemwise_func(
            data, partial(np.asarray, dtype=target_dtype), target_dtype
        )
        if raw_fill_value is not None:
            encoded_fill_values.remove(raw_fill_value)
            encoded_fill_values.add(target_dtype.type(raw_fill_value))
    elif kind not in "iu":
        warnings.warn(
            f"variable {name!r} has _Unsigned attribute but is not "
            "of integer type. Ignoring attribute.",
            SerializationWarning,
            stacklevel=3,
        )
    return data


def _encode_unsigned_fill_value(
    name: T_Name,
    fill_value: Any,
    encoded_dtype: np.dtype,
) -> Any:
    """Cast a fill value to the on-disk dtype for the ``_Unsigned`` convention.

    If ``fill_value`` does not fit in ``encoded_dtype``, assume the user gave
    it in the in-memory (complementary signedness) dtype, warn, and
    reinterpret its bit pattern as ``encoded_dtype``.
    """
    try:
        if hasattr(fill_value, "item"):
            # numpy scalars wrap silently instead of raising, so convert to a
            # native Python integer first to make overflow detectable
            fill_value = fill_value.item()
        # succeeds iff the provided fill fits in the encoded on-disk type
        return encoded_dtype.type(fill_value)
    except OverflowError:
        encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned"
        warnings.warn(
            f"variable {name!r} will be stored as {encoded_kind_str} integers "
            f"but _FillValue attribute can't be represented as a "
            f"{encoded_kind_str} integer.",
            SerializationWarning,
            stacklevel=3,
        )
        # the fill was given as the in-memory dtype; convert to the on-disk
        # type to match the CF standard
        in_memory_kind = "u" if encoded_dtype.kind == "i" else "i"
        in_memory_dtype = np.dtype(f"{in_memory_kind}{encoded_dtype.itemsize}")
        # reinterpret the bit pattern via view to prevent OverflowError
        return np.array(fill_value, dtype=in_memory_dtype).view(encoded_dtype).item()


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

dtype = np.dtype(encoding.get("dtype", data.dtype))
# from netCDF best practices
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
# "_Unsigned = "true" to indicate that
# integer data should be treated as unsigned"
has_unsigned = encoding.get("_Unsigned") is not None
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")
# to properly handle _FillValue/missing_value below [a], [b]
# we need to check if unsigned data is written as signed data
unsigned = encoding.get("_Unsigned") is not None
fill_value = None

fv_exists = fv is not None
mv_exists = mv is not None
Expand All @@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None):

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
# [a] need to skip this if _Unsigned is available
if not unsigned:
encoding["_FillValue"] = dtype.type(fv)
encoding["_FillValue"] = (
_encode_unsigned_fill_value(name, fv, dtype)
if has_unsigned
else dtype.type(fv)
)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)

if mv_exists:
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
# [b] need to provide mv verbatim if _Unsigned is available
encoding["missing_value"] = attrs.get(
"_FillValue",
(dtype.type(mv) if not unsigned else mv),
(
_encode_unsigned_fill_value(name, mv, dtype)
if has_unsigned
else dtype.type(mv)
),
)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)

# apply fillna
if not pd.isnull(fill_value):
if fill_value is not None and not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
Expand All @@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None):
else:
data = duck_array_ops.fillna(data, fill_value)

if fill_value is not None and has_unsigned:
pop_to(encoding, attrs, "_Unsigned")
# XXX: Is this actually needed? Doesn't the backend handle this?
data = duck_array_ops.astype(duck_array_ops.around(data), dtype)
attrs["_FillValue"] = fill_value

return Variable(dims, data, attrs, encoding, fastpath=True)

def decode(self, variable: Variable, name: T_Name = None):
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
return variable

if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
dims, data, attrs, encoding = unpack_for_decoding(variable)

# Even if _Unsigned is used, retain on-disk _FillValue
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if "_Unsigned" in attrs:
unsigned = pop_to(attrs, encoding, "_Unsigned")
data = _convert_unsigned_fill_value(
name,
data,
unsigned,
raw_fill_dict.get("_FillValue"),
encoded_fill_values,
)

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)
transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
return Variable(dims, data, attrs, encoding, fastpath=True)


def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
Expand Down Expand Up @@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return variable


class UnsignedIntegerCoder(VariableCoder):
    """Handle the netCDF ``_Unsigned`` attribute.

    File formats without unsigned integer types (e.g. netCDF-3) store
    unsigned data as the same-width signed type and mark it with
    ``_Unsigned = "true"``. ``encode`` converts in-memory data to the signed
    on-disk representation; ``decode`` lazily reinterprets it back.
    """

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        # from netCDF best practices
        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
        # "_Unsigned = "true" to indicate that
        # integer data should be treated as unsigned"
        if variable.encoding.get("_Unsigned", "false") == "true":
            dims, data, attrs, encoding = unpack_for_encoding(variable)

            # _Unsigned moves from encoding to attrs so it is written to disk
            pop_to(encoding, attrs, "_Unsigned")
            # we need the on-disk type here
            # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
            signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
            if "_FillValue" in attrs:
                try:
                    # user provided the on-disk signed fill
                    new_fill = signed_dtype.type(attrs["_FillValue"])
                except OverflowError:
                    # user provided the in-memory unsigned fill, convert to signed type
                    unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
                    # use view here to prevent OverflowError
                    new_fill = (
                        np.array(attrs["_FillValue"], dtype=unsigned_dtype)
                        .view(signed_dtype)
                        .item()
                    )
                attrs["_FillValue"] = new_fill
            # round then cast, so the data is stored with the signed dtype
            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)

            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            # nothing to do without an _Unsigned="true" encoding
            return variable

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        if "_Unsigned" in variable.attrs:
            dims, data, attrs, encoding = unpack_for_decoding(variable)
            # move _Unsigned into encoding so it round-trips on re-encode
            unsigned = pop_to(attrs, encoding, "_Unsigned")

            if data.dtype.kind == "i":
                if unsigned == "true":
                    # signed on disk -> lazily reinterpret as unsigned
                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
                    transform = partial(np.asarray, dtype=unsigned_dtype)
                    if "_FillValue" in attrs:
                        new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
                        # use view here to prevent OverflowError
                        attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
            elif data.dtype.kind == "u":
                if unsigned == "false":
                    # unsigned on disk but marked signed -> reinterpret as signed
                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
                    transform = partial(np.asarray, dtype=signed_dtype)
                    data = lazy_elemwise_func(data, transform, signed_dtype)
                    if "_FillValue" in attrs:
                        new_fill = signed_dtype.type(attrs["_FillValue"])
                        attrs["_FillValue"] = new_fill
            else:
                # _Unsigned only makes sense on integer data
                warnings.warn(
                    f"variable {name!r} has _Unsigned attribute but is not "
                    "of integer type. Ignoring attribute.",
                    SerializationWarning,
                    stacklevel=3,
                )

            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            # no _Unsigned attribute: pass the variable through unchanged
            return variable


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

Expand Down
2 changes: 0 additions & 2 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ def encode_cf_variable(
times.CFTimedeltaCoder(),
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NativeEnumCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
Expand Down Expand Up @@ -279,7 +278,6 @@ def decode_cf_variable(

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
variables.CFMaskCoder(),
variables.CFScaleOffsetCoder(),
]:
Expand Down
Loading

0 comments on commit 31f40d1

Please sign in to comment.