From 4ce9b0f0f2ffdfd025394560c691705c84c112e2 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 1 Feb 2025 13:53:23 +0000 Subject: [PATCH] fix: pandas was raising when index name and column names overlapped in groupby (#1908) --- README.md | 1 + docs/pandas_like_concepts/pandas_index.md | 7 +++++-- narwhals/_pandas_like/group_by.py | 19 +++++++------------ narwhals/typing.py | 9 ++------- tests/conftest.py | 2 +- tests/group_by_test.py | 14 ++++++++++++++ 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 533bd12de9..a2a0198a5d 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ [![PyPI version](https://badge.fury.io/py/narwhals.svg)](https://badge.fury.io/py/narwhals) [![Downloads](https://static.pepy.tech/badge/narwhals/month)](https://pepy.tech/project/narwhals) [![Trusted publishing](https://img.shields.io/badge/Trusted_publishing-Provides_attestations-bright_green)](https://peps.python.org/pep-0740/) +[![PYPI - Types](https://img.shields.io/pypi/types/narwhals)](https://pypi.org/project/narwhals) Extremely lightweight and extensible compatibility layer between dataframe libraries! diff --git a/docs/pandas_like_concepts/pandas_index.md b/docs/pandas_like_concepts/pandas_index.md index aa5f11bcfe..ede5138e5b 100644 --- a/docs/pandas_like_concepts/pandas_index.md +++ b/docs/pandas_like_concepts/pandas_index.md @@ -16,7 +16,7 @@ Narwhals aims to accommodate both! Let's learn about what Narwhals promises. -## 1. Narwhals will preserve your index for dataframe operations +## 1. Narwhals will preserve your index for common dataframe operations ```python exec="1" source="above" session="ex1" import narwhals as nw @@ -39,7 +39,10 @@ print(my_func(df)) ``` Note how the result still has the original index - Narwhals did not modify -it. +it. Narwhals will preserve your original index for most common dataframe +operations. However, Narwhals will _not_ preserve the original index for +`DataFrame.group_by`, because there, overlapping index and column names +raise errors. ## 2. Index alignment follows the left-hand-rule diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 95ad7730d6..7e586cab29 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -43,32 +43,27 @@ def __init__( ) -> None: self._df = df self._keys = keys + # Drop index to avoid potential collisions: + # https://github.com/narwhals-dev/narwhals/issues/1907. + native_frame = df._native_frame.reset_index(drop=True) if ( self._df._implementation is Implementation.PANDAS and self._df._backend_version < (1, 1) ): # pragma: no cover if ( not drop_null_keys - and select_columns_by_name( - self._df._native_frame, - self._keys, - self._df._backend_version, - self._df._implementation, - ) - .isna() - .any() - .any() + and self._df.simple_select(*self._keys)._native_frame.isna().any().any() ): - msg = "Grouping by null values is not supported in pandas < 1.0.0" + msg = "Grouping by null values is not supported in pandas < 1.1.0" raise NotImplementedError(msg) - self._grouped = self._df._native_frame.groupby( + self._grouped = native_frame.groupby( list(self._keys), sort=False, as_index=True, observed=True, ) else: - self._grouped = self._df._native_frame.groupby( + self._grouped = native_frame.groupby( list(self._keys), sort=False, as_index=True, diff --git a/narwhals/typing.py b/narwhals/typing.py index 4e2d078945..0c284cfa34 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -60,19 +60,14 @@ def simple_select( ) -> CompliantDataFrame: ... # `select` where all args are column names -class CompliantLazyFramme(Protocol): - def __narwhals_lazyframe__(self) -> CompliantDataFrame: ... +class CompliantLazyFrame(Protocol): + def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ... def __narwhals_namespace__(self) -> Any: ... def simple_select( self, *column_names: str ) -> CompliantLazyFrame: ... # `select` where all args are column names -class CompliantLazyFrame(Protocol): - def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ... - def __narwhals_namespace__(self) -> Any: ... - - CompliantSeriesT_co = TypeVar( "CompliantSeriesT_co", bound=CompliantSeries, covariant=True ) diff --git a/tests/conftest.py b/tests/conftest.py index a95f6a5f84..ade98d014d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -210,7 +210,7 @@ def sqlframe_pyspark_lazy_constructor( "pyspark": pyspark_lazy_constructor, # type: ignore[dict-item] # We've reported several bugs to sqlframe - once they address # them, we can start testing them as part of our CI. - # "sqlframe": pyspark_lazy_constructor, # noqa: ERA001 + # "sqlframe": sqlframe_pyspark_lazy_constructor, # noqa: ERA001 } GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor} diff --git a/tests/group_by_test.py b/tests/group_by_test.py index d5f25554d9..09be5cab13 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -406,3 +406,17 @@ def test_group_by_expr(constructor: Constructor) -> None: df = nw.from_native(constructor({"a": [1, 1, 3], "b": [4, 5, 6]})) with pytest.raises(NotImplementedError, match=r"not \(yet\?\) supported"): df.group_by(nw.col("a")).agg(nw.col("b").mean()) # type: ignore[arg-type] + + +def test_pandas_group_by_index_and_column_overlap() -> None: + df = pd.DataFrame( + {"a": [1, 1, 2], "b": [4, 5, 6]}, index=pd.Index([0, 1, 2], name="a") + ) + result = nw.from_native(df, eager_only=True).group_by("a").agg(nw.col("b").mean()) + expected = {"a": [1, 2], "b": [4.5, 6.0]} + assert_equal_data(result, expected) + + key, result = next(iter(nw.from_native(df, eager_only=True).group_by("a"))) + assert key == (1,) + expected_native = pd.DataFrame({"a": [1, 1], "b": [4, 5]}) + pd.testing.assert_frame_equal(result.to_native(), expected_native)