From b1188b5633a1ebb491a27ec0fd4b0d25172f572e Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Tue, 24 Sep 2024 14:59:28 +0200 Subject: [PATCH 1/2] refactor(rust): Minor new-streaming test fixes --- crates/polars-python/src/series/buffers.rs | 2 +- py-polars/tests/unit/functions/as_datatype/test_struct.py | 3 +++ py-polars/tests/unit/interop/numpy/test_to_numpy_df.py | 2 ++ py-polars/tests/unit/interop/numpy/test_to_numpy_series.py | 2 +- py-polars/tests/unit/interop/numpy/test_ufunc_expr.py | 4 ---- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/polars-python/src/series/buffers.rs b/crates/polars-python/src/series/buffers.rs index 49610fd3cf42..55013ea1dd2c 100644 --- a/crates/polars-python/src/series/buffers.rs +++ b/crates/polars-python/src/series/buffers.rs @@ -342,7 +342,7 @@ where } fn series_to_bitmap(s: Series) -> PyResult { let ca_result = s.bool(); - let ca = ca_result.map_err(PyPolarsErr::from)?; + let ca = ca_result.map_err(PyPolarsErr::from)?.rechunk(); let arr = ca.downcast_iter().next().unwrap(); let bitmap = arr.values().clone(); Ok(bitmap) diff --git a/py-polars/tests/unit/functions/as_datatype/test_struct.py b/py-polars/tests/unit/functions/as_datatype/test_struct.py index 5e71f9d37e29..0dc124d80099 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_struct.py +++ b/py-polars/tests/unit/functions/as_datatype/test_struct.py @@ -62,6 +62,9 @@ def test_eager_struct() -> None: def test_struct_from_schema_only() -> None: + # Workaround for new streaming engine. + pl.enable_string_cache() + # we create a dataframe with default types df = pl.DataFrame( { diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py index c25a544d4d9d..fa1815823c34 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py @@ -36,6 +36,7 @@ def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None: allow_chunks=False, ) ) +@pytest.mark.may_fail_auto_streaming def test_df_to_numpy_zero_copy(s: pl.Series) -> None: df = pl.DataFrame({"a": s[:3], "b": s[3:]}) @@ -153,6 +154,7 @@ def test_df_to_numpy_zero_copy_path() -> None: assert str(x[0, :]) == "[1. 2. 1. 1. 1.]" +@pytest.mark.may_fail_auto_streaming def test_df_to_numpy_zero_copy_path_temporal() -> None: values = [datetime(1970 + i, 1, 1) for i in range(12)] s = pl.Series(values) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 8ac2e2f73321..1f62f41767fe 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -116,7 +116,7 @@ def test_series_to_numpy_temporal_zero_copy( def test_series_to_numpy_datetime_with_tz_zero_copy() -> None: values = [datetime(1970, 1, 1), datetime(2024, 2, 28)] - s = pl.Series(values).dt.convert_time_zone("Europe/Amsterdam") + s = pl.Series(values).dt.convert_time_zone("Europe/Amsterdam").rechunk() result = s.to_numpy(allow_copy=False) assert_zero_copy(s, result) diff --git a/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py b/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py index 1fe9ea3ed4be..fda0c5a5f722 100644 --- a/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py +++ b/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py @@ -33,13 +33,11 @@ def test_ufunc_expr_not_first() -> None: out = df.select( np.power(2.0, cast(Any, pl.col("a"))).alias("power"), (2.0 / cast(Any, pl.col("a"))).alias("divide_scalar"), - (np.array([2, 2, 2]) / cast(Any, pl.col("a"))).alias("divide_array"), ) expected = pl.DataFrame( [ pl.Series("power", [2**1, 2**2, 2**3], dtype=pl.Float64), pl.Series("divide_scalar", [2 / 1, 2 / 2, 2 / 3], dtype=pl.Float64), - pl.Series("divide_array", [2 / 1, 2 / 2, 2 / 3], dtype=pl.Float64), ] ) assert_frame_equal(out, expected) @@ -68,13 +66,11 @@ def test_lazy_ufunc_expr_not_first() -> None: out = ldf.select( np.power(2.0, cast(Any, pl.col("a"))).alias("power"), (2.0 / cast(Any, pl.col("a"))).alias("divide_scalar"), - (np.array([2, 2, 2]) / cast(Any, pl.col("a"))).alias("divide_array"), ) expected = pl.DataFrame( [ pl.Series("power", [2**1, 2**2, 2**3], dtype=pl.Float64), pl.Series("divide_scalar", [2 / 1, 2 / 2, 2 / 3], dtype=pl.Float64), - pl.Series("divide_array", [2 / 1, 2 / 2, 2 / 3], dtype=pl.Float64), ] ) assert_frame_equal(out.collect(), expected) From 9a4ebfed43ef83798bc57ab86022931a98beba32 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Tue, 24 Sep 2024 15:34:51 +0200 Subject: [PATCH 2/2] string cache in context manager --- .../unit/functions/as_datatype/test_struct.py | 207 +++++++++--------- 1 file changed, 103 insertions(+), 104 deletions(-) diff --git a/py-polars/tests/unit/functions/as_datatype/test_struct.py b/py-polars/tests/unit/functions/as_datatype/test_struct.py index 0dc124d80099..2073723f2d04 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_struct.py +++ b/py-polars/tests/unit/functions/as_datatype/test_struct.py @@ -63,113 +63,112 @@ def test_eager_struct() -> None: def test_struct_from_schema_only() -> None: # Workaround for new streaming engine. - pl.enable_string_cache() - - # we create a dataframe with default types - df = pl.DataFrame( - { - "str": ["a", "b", "c", "d", "e"], - "u8": [1, 2, 3, 4, 5], - "i32": [1, 2, 3, 4, 5], - "f64": [1, 2, 3, 4, 5], - "cat": ["a", "b", "c", "d", "e"], - "datetime": pl.Series( - [ - date(2023, 1, 1), - date(2023, 1, 2), - date(2023, 1, 3), - date(2023, 1, 4), - date(2023, 1, 5), - ] - ), - "bool": [1, 0, 1, 1, 0], - "list[u8]": [[1], [2], [3], [4], [5]], - } - ) - - # specify a schema with specific dtypes - s = df.select( - pl.struct( - schema={ - "str": pl.String, - "u8": pl.UInt8, - "i32": pl.Int32, - "f64": pl.Float64, - "cat": pl.Categorical, - "datetime": pl.Datetime("ms"), - "bool": pl.Boolean, - "list[u8]": pl.List(pl.UInt8), + with pl.StringCache(): + # we create a dataframe with default types + df = pl.DataFrame( + { + "str": ["a", "b", "c", "d", "e"], + "u8": [1, 2, 3, 4, 5], + "i32": [1, 2, 3, 4, 5], + "f64": [1, 2, 3, 4, 5], + "cat": ["a", "b", "c", "d", "e"], + "datetime": pl.Series( + [ + date(2023, 1, 1), + date(2023, 1, 2), + date(2023, 1, 3), + date(2023, 1, 4), + date(2023, 1, 5), + ] + ), + "bool": [1, 0, 1, 1, 0], + "list[u8]": [[1], [2], [3], [4], [5]], } - ).alias("s") - )["s"] + ) - # check dtypes - assert s.dtype == pl.Struct( - [ - pl.Field("str", pl.String), - pl.Field("u8", pl.UInt8), - pl.Field("i32", pl.Int32), - pl.Field("f64", pl.Float64), - pl.Field("cat", pl.Categorical), - pl.Field("datetime", pl.Datetime("ms")), - pl.Field("bool", pl.Boolean), - pl.Field("list[u8]", pl.List(pl.UInt8)), - ] - ) + # specify a schema with specific dtypes + s = df.select( + pl.struct( + schema={ + "str": pl.String, + "u8": pl.UInt8, + "i32": pl.Int32, + "f64": pl.Float64, + "cat": pl.Categorical, + "datetime": pl.Datetime("ms"), + "bool": pl.Boolean, + "list[u8]": pl.List(pl.UInt8), + } + ).alias("s") + )["s"] + + # check dtypes + assert s.dtype == pl.Struct( + [ + pl.Field("str", pl.String), + pl.Field("u8", pl.UInt8), + pl.Field("i32", pl.Int32), + pl.Field("f64", pl.Float64), + pl.Field("cat", pl.Categorical), + pl.Field("datetime", pl.Datetime("ms")), + pl.Field("bool", pl.Boolean), + pl.Field("list[u8]", pl.List(pl.UInt8)), + ] + ) - # check values - assert s.to_list() == [ - { - "str": "a", - "u8": 1, - "i32": 1, - "f64": 1.0, - "cat": "a", - "datetime": datetime(2023, 1, 1, 0, 0), - "bool": True, - "list[u8]": [1], - }, - { - "str": "b", - "u8": 2, - "i32": 2, - "f64": 2.0, - "cat": "b", - "datetime": datetime(2023, 1, 2, 0, 0), - "bool": False, - "list[u8]": [2], - }, - { - "str": "c", - "u8": 3, - "i32": 3, - "f64": 3.0, - "cat": "c", - "datetime": datetime(2023, 1, 3, 0, 0), - "bool": True, - "list[u8]": [3], - }, - { - "str": "d", - "u8": 4, - "i32": 4, - "f64": 4.0, - "cat": "d", - "datetime": datetime(2023, 1, 4, 0, 0), - "bool": True, - "list[u8]": [4], - }, - { - "str": "e", - "u8": 5, - "i32": 5, - "f64": 5.0, - "cat": "e", - "datetime": datetime(2023, 1, 5, 0, 0), - "bool": False, - "list[u8]": [5], - }, - ] + # check values + assert s.to_list() == [ + { + "str": "a", + "u8": 1, + "i32": 1, + "f64": 1.0, + "cat": "a", + "datetime": datetime(2023, 1, 1, 0, 0), + "bool": True, + "list[u8]": [1], + }, + { + "str": "b", + "u8": 2, + "i32": 2, + "f64": 2.0, + "cat": "b", + "datetime": datetime(2023, 1, 2, 0, 0), + "bool": False, + "list[u8]": [2], + }, + { + "str": "c", + "u8": 3, + "i32": 3, + "f64": 3.0, + "cat": "c", + "datetime": datetime(2023, 1, 3, 0, 0), + "bool": True, + "list[u8]": [3], + }, + { + "str": "d", + "u8": 4, + "i32": 4, + "f64": 4.0, + "cat": "d", + "datetime": datetime(2023, 1, 4, 0, 0), + "bool": True, + "list[u8]": [4], + }, + { + "str": "e", + "u8": 5, + "i32": 5, + "f64": 5.0, + "cat": "e", + "datetime": datetime(2023, 1, 5, 0, 0), + "bool": False, + "list[u8]": [5], + }, + ] def test_struct_broadcasting() -> None: