correctly encode/decode _FillValues
kmuehlbauer committed Feb 6, 2024
1 parent c9ba2be · commit 477cd58
Showing 2 changed files with 102 additions and 47 deletions.
77 changes: 62 additions & 15 deletions xarray/coding/variables.py
@@ -280,8 +280,10 @@ def encode(self, variable: Variable, name: T_Name = None):
                 data = duck_array_ops.fillna(data, fill_value)
 
         if mv_exists:
-            # Ensure missing_value is cast to same dtype as data's
-            encoding["missing_value"] = dtype.type(mv)
+            # Use _FillValue if available to align missing_value to prevent issues
+            # when decoding
+            encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
             fill_value = pop_to(encoding, attrs, "missing_value", name=name)
             if not pd.isnull(fill_value) and not fv_exists:
                 if is_time_like and data.dtype.kind in "iu":
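For illustration, a minimal sketch of the behaviour this hunk targets (not part of the commit; `encode_cf_variable` is xarray's internal conventions helper, and the values are made up):

```python
import numpy as np
import xarray as xr
from xarray.conventions import encode_cf_variable

# When both _FillValue and missing_value are set (xarray requires them to
# be equal), the encoded missing_value now reuses the already-cast
# _FillValue, so both attributes match in value *and* dtype on disk.
v = xr.Variable(("t",), np.array([1.0, np.nan, 3.0]))
v.encoding = {"dtype": "int16", "_FillValue": -1, "missing_value": -1}
encoded = encode_cf_variable(v, name="x")
assert encoded.attrs["_FillValue"] == encoded.attrs["missing_value"]
assert type(encoded.attrs["_FillValue"]) is type(encoded.attrs["missing_value"])
```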
@@ -322,7 +324,13 @@ def decode(self, variable: Variable, name: T_Name = None):
             if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
                 dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
             else:
-                dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+                if "scale_factor" not in attrs and "add_offset" not in attrs:
+                    dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+                else:
+                    dtype, decoded_fill_value = (
+                        _choose_float_dtype(data.dtype, attrs),
+                        np.nan,
+                    )
 
             if encoded_fill_values:
                 transform = partial(
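On the decode side, the new branch means the promoted dtype follows the scale/offset attributes; a sketch with illustrative values:

```python
import numpy as np
import xarray as xr

packed = xr.Dataset(
    {
        "x": (
            "t",
            np.array([-1, 0, 1], dtype="int16"),
            {
                "_FillValue": np.int16(-1),
                "scale_factor": np.float32(0.1),
                "add_offset": np.float32(10),
            },
        )
    }
)
decoded = xr.decode_cf(packed)
# scale/offset attributes present -> the fill dtype comes from
# _choose_float_dtype, so the result is float32 here instead of an
# unconditional promotion
assert decoded.x.dtype == np.float32
assert np.isnan(decoded.x.values[0])
```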
@@ -347,23 +355,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
     return data
 
 
-def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
+def _choose_float_dtype(
+    dtype: np.dtype, mapping: MutableMapping
+) -> type[np.floating[Any]]:
     """Return a float dtype that can losslessly represent `dtype` values."""
-    # Keep float32 as-is. Upcast half-precision to single-precision,
+    # check scale/offset first to derive wanted float dtype
+    # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
+    scale_factor = mapping.get("scale_factor")
+    add_offset = mapping.get("add_offset")
+    if scale_factor is not None or add_offset is not None:
+        # get the type from scale_factor/add_offset to determine
+        # the needed floating point type
+        if scale_factor is not None:
+            scale_type = type(scale_factor)
+        if add_offset is not None:
+            offset_type = type(add_offset)
+        # CF conforming, both scale_factor and add_offset are given and
+        # of same floating point type (float32/64)
+        if (
+            add_offset is not None
+            and scale_factor is not None
+            and offset_type == scale_type
+            and scale_type in [np.float32, np.float64]
+        ):
+            # in case of int32 -> we need upcast to float64
+            # due to precision issues
+            if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
+                return np.float64
+            return np.dtype(scale_type).type
+        # Not CF conforming and add_offset given:
+        # A scale factor is entirely safe (vanishing into the mantissa),
+        # but a large integer offset could lead to loss of precision.
+        # Sensitivity analysis can be tricky, so we just use a float64
+        # if there's any offset at all - better unoptimised than wrong!
+        if add_offset is not None:
+            return np.float64
+        # return float32 in other cases where only scale_factor is given
+        return np.float32
+    # If no scale_factor or add_offset is given, use some general rules.
+    # Keep float32 as-is. Upcast half-precision to single-precision,
     # because float16 is "intended for storage but not computation"
     if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
         return np.float32
     # float32 can exactly represent all integers up to 24 bits
     if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
-        # A scale factor is entirely safe (vanishing into the mantissa),
-        # but a large integer offset could lead to loss of precision.
-        # Sensitivity analysis can be tricky, so we just use a float64
-        # if there's any offset at all - better unoptimised than wrong!
-        if not has_offset:
-            return np.float32
+        return np.float32
     # For all other types and circumstances, we just use float64.
     # (safe because eg. complex numbers are not supported in NetCDF)
     return np.float64
 
 
 class CFScaleOffsetCoder(VariableCoder):
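A few sanity checks of the selection rules above (`_choose_float_dtype` is a private helper, so the import is shown for illustration only and may change):

```python
import numpy as np
from xarray.coding.variables import _choose_float_dtype  # private helper

# CF conforming: int16 packed with a matching float32 scale/offset pair
assert (
    _choose_float_dtype(
        np.dtype("int16"),
        {"scale_factor": np.float32(0.1), "add_offset": np.float32(10)},
    )
    == np.float32
)
# int32 packed data is upcast to float64 even for a float32 pair
assert (
    _choose_float_dtype(
        np.dtype("int32"),
        {"scale_factor": np.float32(0.1), "add_offset": np.float32(10)},
    )
    == np.float64
)
# not CF conforming (plain Python float offset) -> play it safe with float64
assert _choose_float_dtype(np.dtype("int16"), {"add_offset": 10.0}) == np.float64
# no scale/offset at all -> fall back to the general rules
assert _choose_float_dtype(np.dtype("uint8"), {}) == np.float32
```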
@@ -377,7 +413,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         if "scale_factor" in encoding or "add_offset" in encoding:
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            # if we have a _FillValue/missing_value we do not want to cast now
+            # but leave that to CFMaskCoder
+            dtype = data.dtype
+            if "_FillValue" not in encoding and "missing_value" not in encoding:
+                dtype = _choose_float_dtype(data.dtype, encoding)
+            # but still we need a copy to prevent changing original data
             data = duck_array_ops.astype(data, dtype=dtype, copy=True)
         if "add_offset" in encoding:
             data -= pop_to(encoding, attrs, "add_offset", name=name)
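A sketch of the deferred cast on encode, again via the internal `encode_cf_variable` and made-up values:

```python
import numpy as np
import xarray as xr
from xarray.conventions import encode_cf_variable

# With a _FillValue present, CFScaleOffsetCoder no longer casts eagerly;
# NaNs survive until CFMaskCoder fills them, then the data is packed.
v = xr.Variable(("t",), np.array([10.0, 10.1, np.nan], dtype="float32"))
v.encoding = {
    "dtype": "i2",
    "_FillValue": -1,
    "scale_factor": np.float32(0.1),
    "add_offset": np.float32(10),
}
enc = encode_cf_variable(v, name="x")
print(enc.dtype, enc.values)  # expect: int16 [ 0  1 -1]
```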
@@ -393,11 +434,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
 
         scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
         add_offset = pop_to(attrs, encoding, "add_offset", name=name)
-        dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
         if np.ndim(scale_factor) > 0:
             scale_factor = np.asarray(scale_factor).item()
         if np.ndim(add_offset) > 0:
             add_offset = np.asarray(add_offset).item()
+        # if we have a _FillValue/missing_value we already have the wanted
+        # floating point dtype here (via CFMaskCoder), so no check is necessary;
+        # only check in other cases
+        dtype = data.dtype
+        if "_FillValue" not in encoding and "missing_value" not in encoding:
+            dtype = _choose_float_dtype(dtype, encoding)
 
         transform = partial(
             _scale_offset_decoding,
             scale_factor=scale_factor,
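Putting both coders together, the user-visible effect can be checked with a file roundtrip (sketch; the filename is illustrative):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({"x": ("t", np.array([10.0, 10.1, 10.2], dtype="float32"))})
ds.x.encoding = {
    "dtype": "int16",
    "_FillValue": np.int16(-1),
    "scale_factor": np.float32(0.1),
    "add_offset": np.float32(10),
}
ds.to_netcdf("packed.nc")
with xr.open_dataset("packed.nc") as roundtripped:
    # the decoded dtype now follows the float32 scale/offset attributes
    assert roundtripped.x.dtype == np.float32
```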
72 changes: 40 additions & 32 deletions xarray/tests/test_backends.py
@@ -142,96 +142,100 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
     )
 
 
-def create_masked_and_scaled_data() -> Dataset:
-    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
+def create_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
     encoding = {
         "_FillValue": -1,
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
         "dtype": "i2",
     }
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_masked_and_scaled_data() -> Dataset:
-    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
+def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
+    attributes = {
+        "_FillValue": -1,
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
+    }
     return Dataset(
         {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
     )


-def create_unsigned_masked_scaled_data() -> Dataset:
+def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": "true",
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": "true",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})


-def create_bad_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": True,
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": True,
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})


-def create_signed_masked_scaled_data() -> Dataset:
+def create_signed_masked_scaled_data(dtype: np.dtype) -> Dataset:
     encoding = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
-    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
+    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_signed_masked_scaled_data() -> Dataset:
+def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -127,
         "_Unsigned": "false",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -889,9 +893,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
             (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
         ],
     )
-    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
-        decoded = decoded_fn()
-        encoded = encoded_fn()
+    @pytest.mark.parametrize("dtype", [np.dtype("float64"), np.dtype("float32")])
+    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
+        decoded = decoded_fn(dtype)
+        encoded = encoded_fn(dtype)
 
         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
@@ -912,7 +917,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
 
         # make sure roundtrip encoding didn't change the
        # original dataset.
-        assert_allclose(encoded, encoded_fn(), decode_bytes=False)
+        assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)
 
         with self.roundtrip(encoded) as actual:
             for k in decoded.variables:
@@ -1645,6 +1650,7 @@ def test_mask_and_scale(self) -> None:
             v.add_offset = 10
             v.scale_factor = 0.1
             v[:] = np.array([-1, -1, 0, 1, 2])
+            dtype = type(v.scale_factor)
 
         # first make sure netCDF4 reads the masked and scaled data
        # correctly
@@ -1653,11 +1659,13 @@
                 [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
             )
             actual = nc.variables["x"][:]
+            print(actual.dtype, expected.dtype)
             assert_array_equal(expected, actual)
 
         # now check xarray
         with open_dataset(tmp_file) as ds:
-            expected = create_masked_and_scaled_data()
+            expected = create_masked_and_scaled_data(np.dtype(dtype))
+            print(ds.variables["x"].dtype, expected.variables["x"].dtype)
             assert_identical(expected, ds)
 
     def test_0dimensional_variable(self) -> None:
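The setup elided above writes the raw file with netCDF4 directly; a hypothetical reconstruction for context (path and names are illustrative):

```python
import netCDF4
import numpy as np

# Hypothetical sketch of the elided test setup in test_mask_and_scale.
with netCDF4.Dataset("tmp.nc", mode="w") as nc:
    nc.createDimension("t", 5)
    nc.createVariable("x", "int16", ("t",), fill_value=-1)
    v = nc.variables["x"]
    v.set_auto_maskandscale(False)  # write the raw packed values
    v.add_offset = 10
    v.scale_factor = 0.1
    v[:] = np.array([-1, -1, 0, 1, 2])
# scale_factor set as a Python float is typically stored and read back as
# a float64 attribute, so dtype = type(v.scale_factor) makes the test
# expect float64 on decode.
```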
