From 013a4268124919fcc1f22118685ddc2a179ea24f Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Tue, 13 Feb 2024 10:48:35 +0100 Subject: [PATCH] unstack: require unique MultiIndex (#8737) * unstack: require unique multiindex * whats new * fix ds creation * fix the correct array * update error message * update err msg in tests * Apply suggestions from code review --- doc/whats-new.rst | 2 ++ xarray/core/indexes.py | 7 +++++++ xarray/tests/test_dataarray.py | 9 +++++++++ xarray/tests/test_dataset.py | 8 ++++++++ xarray/tests/test_indexes.py | 9 +++++++++ 5 files changed, 35 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ed0b1c30987..50eece5f0af 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -74,6 +74,8 @@ Bug fixes lead to integer overflow or unsafe conversion from floating point to integer values (:issue:`8542`, :pull:`8575`). By `Spencer Clark <https://github.com/spencerkclark>`_. +- Raise an error when unstacking a MultiIndex that has duplicates as this would lead + to silent data loss (:issue:`7104`, :pull:`8737`). By `Mathias Hauser <https://github.com/mathause>`_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 1697762f7ae..e71c4a6f073 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1017,6 +1017,13 @@ def stack( def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]: clean_index = remove_unused_levels_categories(self.index) + if not clean_index.is_unique: + raise ValueError( + "Cannot unstack MultiIndex containing duplicates. Make sure entries " + f"are unique, e.g., by calling ``.drop_duplicates('{self.dim}')``, " + "before unstacking." 
+ ) + new_indexes: dict[Hashable, Index] = {} for name, lev in zip(clean_index.names, clean_index.levels): idx = PandasIndex( diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d898d3a30b9..2829fd7d49c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2532,6 +2532,15 @@ def test_unstack_pandas_consistency(self) -> None: actual = DataArray(s, dims="z").unstack("z") assert_identical(expected, actual) + def test_unstack_requires_unique(self) -> None: + df = pd.DataFrame({"foo": range(2), "x": ["a", "a"], "y": [0, 0]}) + s = df.set_index(["x", "y"])["foo"] + + with pytest.raises( + ValueError, match="Cannot unstack MultiIndex containing duplicates" + ): + DataArray(s, dims="z").unstack("z") + @pytest.mark.filterwarnings("error") def test_unstack_roundtrip_integer_array(self) -> None: arr = xr.DataArray( diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 267a5ca603a..ae7d87bb790 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3764,6 +3764,14 @@ def test_unstack_errors(self) -> None: with pytest.raises(ValueError, match=r".*do not have exactly one multi-index"): ds.unstack("x") + ds = Dataset({"da": [1, 2]}, coords={"y": ("x", [1, 1]), "z": ("x", [0, 0])}) + ds = ds.set_index(x=("y", "z")) + + with pytest.raises( + ValueError, match="Cannot unstack MultiIndex containing duplicates" + ): + ds.unstack("x") + def test_unstack_fill_value(self) -> None: ds = xr.Dataset( {"var": (("x",), np.arange(6)), "other_var": (("x",), np.arange(3, 9))}, diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 866c2ef7e85..3ee7f045360 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -452,6 +452,15 @@ def test_unstack(self) -> None: assert new_indexes["two"].equals(PandasIndex([1, 2, 3], "two")) assert new_pd_idx.equals(pd_midx) + def test_unstack_requires_unique(self) -> None: + pd_midx = 
pd.MultiIndex.from_product([["a", "a"], [1, 2]], names=["one", "two"]) + index = PandasMultiIndex(pd_midx, "x") + + with pytest.raises( + ValueError, match="Cannot unstack MultiIndex containing duplicates" + ): + index.unstack() + def test_create_variables(self) -> None: foo_data = np.array([0, 0, 1], dtype="int64") bar_data = np.array([1.1, 1.2, 1.3], dtype="float64")