diff --git a/crates/polars-core/src/chunked_array/ops/row_encode.rs b/crates/polars-core/src/chunked_array/ops/row_encode.rs index c5a36afa7aac..fb0ed52d0219 100644 --- a/crates/polars-core/src/chunked_array/ops/row_encode.rs +++ b/crates/polars-core/src/chunked_array/ops/row_encode.rs @@ -74,7 +74,7 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( /// /// This should be given the logical type in order to communicate Polars datatype information down /// into the row encoding / decoding. -pub fn get_row_encoding_context(dtype: &DataType) -> Option { +pub fn get_row_encoding_context(dtype: &DataType, ordered: bool) -> Option { match dtype { DataType::Boolean | DataType::UInt8 @@ -108,59 +108,78 @@ pub fn get_row_encoding_context(dtype: &DataType) -> Option }, #[cfg(feature = "dtype-array")] - DataType::Array(dtype, _) => get_row_encoding_context(dtype), - DataType::List(dtype) => get_row_encoding_context(dtype), + DataType::Array(dtype, _) => get_row_encoding_context(dtype, ordered), + DataType::List(dtype) => get_row_encoding_context(dtype, ordered), #[cfg(feature = "dtype-categorical")] DataType::Categorical(revmap, ordering) | DataType::Enum(revmap, ordering) => { - let revmap = revmap.as_ref().unwrap(); - - let (num_known_categories, lexical_sort_idxs) = match revmap.as_ref() { - RevMapping::Global(map, _, _) => { - let num_known_categories = map.keys().max().copied().map_or(0, |m| m + 1); - - // @TODO: This should probably be cached. - let lexical_sort_idxs = - matches!(ordering, CategoricalOrdering::Lexical).then(|| { - let read_map = crate::STRING_CACHE.read_map(); - let payloads = read_map.get_current_payloads(); - assert!(payloads.len() >= num_known_categories as usize); - - let mut idxs = (0..num_known_categories).collect::>(); - idxs.sort_by_key(|&k| payloads[k as usize].as_str()); - let mut sort_idxs = vec![0; num_known_categories as usize]; - for (i, idx) in idxs.into_iter().enumerate_u32() { - sort_idxs[idx as usize] = i; - } - sort_idxs - }); - - (num_known_categories, lexical_sort_idxs) + let is_enum = dtype.is_enum(); + let ctx = match revmap { + Some(revmap) => { + let (num_known_categories, lexical_sort_idxs) = match revmap.as_ref() { + RevMapping::Global(map, _, _) => { + let num_known_categories = + map.keys().max().copied().map_or(0, |m| m + 1); + + // @TODO: This should probably be cached. + let lexical_sort_idxs = (ordered + && matches!(ordering, CategoricalOrdering::Lexical)) + .then(|| { + let read_map = crate::STRING_CACHE.read_map(); + let payloads = read_map.get_current_payloads(); + assert!(payloads.len() >= num_known_categories as usize); + + let mut idxs = (0..num_known_categories).collect::>(); + idxs.sort_by_key(|&k| payloads[k as usize].as_str()); + let mut sort_idxs = vec![0; num_known_categories as usize]; + for (i, idx) in idxs.into_iter().enumerate_u32() { + sort_idxs[idx as usize] = i; + } + sort_idxs + }); + + (num_known_categories, lexical_sort_idxs) + }, + RevMapping::Local(values, _) => { + // @TODO: This should probably be cached. + let lexical_sort_idxs = (ordered + && matches!(ordering, CategoricalOrdering::Lexical)) + .then(|| { + assert_eq!(values.null_count(), 0); + let values: Vec<&str> = values.values_iter().collect(); + + let mut idxs = (0..values.len() as u32).collect::>(); + idxs.sort_by_key(|&k| values[k as usize]); + let mut sort_idxs = vec![0; values.len()]; + for (i, idx) in idxs.into_iter().enumerate_u32() { + sort_idxs[idx as usize] = i; + } + sort_idxs + }); + + (values.len() as u32, lexical_sort_idxs) + }, + }; + + RowEncodingCategoricalContext { + num_known_categories, + is_enum, + lexical_sort_idxs, + } }, - RevMapping::Local(values, _) => { - // @TODO: This should probably be cached. - let lexical_sort_idxs = - matches!(ordering, CategoricalOrdering::Lexical).then(|| { - assert_eq!(values.null_count(), 0); - let values: Vec<&str> = values.values_iter().collect(); - - let mut idxs = (0..values.len() as u32).collect::>(); - idxs.sort_by_key(|&k| values[k as usize]); - let mut sort_idxs = vec![0; values.len()]; - for (i, idx) in idxs.into_iter().enumerate_u32() { - sort_idxs[idx as usize] = i; - } - sort_idxs - }); - - (values.len() as u32, lexical_sort_idxs) + None => { + let num_known_categories = u32::MAX; + + if matches!(ordering, CategoricalOrdering::Lexical) && ordered { + panic!("lexical ordering not yet supported if rev-map not given"); + } + RowEncodingCategoricalContext { + num_known_categories, + is_enum, + lexical_sort_idxs: None, + } }, }; - let ctx = RowEncodingCategoricalContext { - num_known_categories, - is_enum: matches!(dtype, DataType::Enum(_, _)), - lexical_sort_idxs, - }; Some(RowEncodingContext::Categorical(ctx)) }, #[cfg(feature = "dtype-struct")] @@ -168,7 +187,7 @@ pub fn get_row_encoding_context(dtype: &DataType) -> Option let mut ctxts = Vec::new(); for (i, f) in fs.iter().enumerate() { - if let Some(ctxt) = get_row_encoding_context(f.dtype()) { + if let Some(ctxt) = get_row_encoding_context(f.dtype(), ordered) { ctxts.reserve(fs.len()); ctxts.extend(std::iter::repeat_n(None, i)); ctxts.push(Some(ctxt)); @@ -183,7 +202,7 @@ pub fn get_row_encoding_context(dtype: &DataType) -> Option ctxts.extend( fs[ctxts.len()..] .iter() - .map(|f| get_row_encoding_context(f.dtype())), + .map(|f| get_row_encoding_context(f.dtype(), ordered)), ); Some(RowEncodingContext::Struct(ctxts)) @@ -214,7 +233,7 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult { let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_unsorted(); - let ctxt = get_row_encoding_context(by.dtype()); + let ctxt = get_row_encoding_context(by.dtype(), false); cols.push(arr); opts.push(opt); @@ -245,7 +264,7 @@ pub fn _get_rows_encoded( let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_sorted(*desc, *null_last); - let ctxt = get_row_encoding_context(by.dtype()); + let ctxt = get_row_encoding_context(by.dtype(), true); cols.push(arr); opts.push(opt); diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index db957e94f0af..ddbcbb04802a 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -509,15 +509,26 @@ impl Series { }, #[cfg(feature = "dtype-categorical")] - (D::UInt32, D::Categorical(revmap, ordering)) => Ok(unsafe { - CategoricalChunked::from_cats_and_rev_map_unchecked( - self.u32().unwrap().clone(), - revmap.as_ref().unwrap().clone(), - false, - *ordering, - ) - } - .into_series()), + (D::UInt32, D::Categorical(revmap, ordering)) => match revmap { + Some(revmap) => Ok(unsafe { + CategoricalChunked::from_cats_and_rev_map_unchecked( + self.u32().unwrap().clone(), + revmap.clone(), + false, + *ordering, + ) + } + .into_series()), + // In the streaming engine this is `None` and the global string cache is turned on + // for the duration of the query. + None => Ok(unsafe { + CategoricalChunked::from_global_indices_unchecked( + self.u32().unwrap().clone(), + *ordering, + ) + .into_series() + }), + }, #[cfg(feature = "dtype-categorical")] (D::UInt32, D::Enum(revmap, ordering)) => Ok(unsafe { CategoricalChunked::from_cats_and_rev_map_unchecked( diff --git a/crates/polars-expr/src/groups/row_encoded.rs b/crates/polars-expr/src/groups/row_encoded.rs index 885f8c6114e7..e2b47d523ec8 100644 --- a/crates/polars-expr/src/groups/row_encoded.rs +++ b/crates/polars-expr/src/groups/row_encoded.rs @@ -42,7 +42,7 @@ impl RowEncodedHashGrouper { let ctxts = self .key_schema .iter() - .map(|(_, dt)| get_row_encoding_context(dt)) + .map(|(_, dt)| get_row_encoding_context(dt, false)) .collect::>(); let fields = vec![RowEncodingOptions::new_unsorted(); key_dtypes.len()]; let key_columns = diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 04d6b301aec5..ecae11d3b93e 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -868,6 +868,7 @@ impl LazyFrame { payload, }); + let _hold = StringCacheHolder::hold(); let f = || { polars_stream::run_query(stream_lp_top, alp_plan.lp_arena, &mut alp_plan.expr_arena) }; diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs index 07ebb4f2a5ce..b0e2850fd950 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs @@ -77,7 +77,7 @@ impl Eval { let mut dicts = Vec::with_capacity(self.key_columns_expr.len()); for phys_e in self.key_columns_expr.iter() { let s = phys_e.evaluate(chunk, &context.execution_state)?; - dicts.push(get_row_encoding_context(s.dtype())); + dicts.push(get_row_encoding_context(s.dtype(), false)); let s = s.to_physical_repr().into_owned(); let s = prepare_key(&s, chunk); keys_columns.push(s.to_arrow(0, CompatLevel::newest())); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 63d093f61827..2dad78673f87 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -262,7 +262,7 @@ impl AggHashTable { .output_schema .iter_values() .take(self.num_keys) - .map(get_row_encoding_context) + .map(|dt| get_row_encoding_context(dt, false)) .collect::>(); let fields = vec![Default::default(); self.num_keys]; let key_columns = diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index 4096c8083f0d..6108526ee6fc 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -142,7 +142,7 @@ impl GenericBuild { let s = phys_e.evaluate(chunk, &context.execution_state)?; let arr = s.to_physical_repr().rechunk().array_ref(0).clone(); self.join_columns.push(arr); - ctxts.push(get_row_encoding_context(s.dtype())); + ctxts.push(get_row_encoding_context(s.dtype(), false)); } let rows_encoded = polars_row::convert_columns_no_order( self.join_columns[0].len(), // @NOTE: does not work for ZFS diff --git a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs index b4e0e8337bcc..45d14fc60039 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs @@ -60,7 +60,7 @@ impl RowValues { names.push(s.name().to_string()); } self.join_columns_material.push(s.array_ref(0).clone()); - ctxts.push(get_row_encoding_context(s.dtype())); + ctxts.push(get_row_encoding_context(s.dtype(), false)); } // We determine the indices of the columns that have to be removed diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index daa80f18b4f4..24cd778a89f6 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -142,7 +142,7 @@ impl SortSinkMultiple { .iter() .map(|i| { let (_, dtype) = schema.get_at_index(*i).unwrap(); - get_row_encoding_context(dtype) + get_row_encoding_context(dtype, true) }) .collect::>(); diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index 4537b8e55d9b..e8c4069464bc 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -484,7 +484,9 @@ impl PySeries { let dicts = dtypes .iter() - .map(|(_, dtype)| get_row_encoding_context(&dtype.0)) + .map(|(_, dt)| dt) + .zip(opts.iter()) + .map(|(dtype, opts)| get_row_encoding_context(&dtype.0, opts.is_ordered())) .collect::>(); // Get the BinaryOffset array. diff --git a/crates/polars-row/src/row.rs b/crates/polars-row/src/row.rs index d0929c67d4ba..ecde2696955b 100644 --- a/crates/polars-row/src/row.rs +++ b/crates/polars-row/src/row.rs @@ -78,6 +78,10 @@ impl RowEncodingOptions { Self::NO_ORDER } + pub fn is_ordered(self) -> bool { + !self.contains(Self::NO_ORDER) + } + pub fn null_sentinel(self) -> u8 { if self.contains(Self::NULLS_LAST) { 0xFF diff --git a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py index 490515b89844..a197f3705443 100644 --- a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py +++ b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py @@ -399,6 +399,7 @@ def test_fallback_with_dtype_strict_failure_decimal_precision() -> None: @pytest.mark.usefixtures("test_global_and_local") +@pytest.mark.may_fail_auto_streaming def test_categorical_lit_18874() -> None: assert_frame_equal( pl.DataFrame( diff --git a/py-polars/tests/unit/constructors/test_constructors.py b/py-polars/tests/unit/constructors/test_constructors.py index ce68349cfc61..88794f7af4dc 100644 --- a/py-polars/tests/unit/constructors/test_constructors.py +++ b/py-polars/tests/unit/constructors/test_constructors.py @@ -1741,6 +1741,7 @@ def __arrow_c_array__(self, requested_schema: object = None) -> object: return self.arrow_obj.__arrow_c_array__(requested_schema) +@pytest.mark.may_fail_auto_streaming def test_pycapsule_interface(df: pl.DataFrame) -> None: df = df.rechunk() pyarrow_table = df.to_arrow() diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index b8d2ef35816e..231b569ec4fe 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1902,6 +1902,7 @@ def test_empty_projection() -> None: assert empty_df.shape == (0, 0) +@pytest.mark.may_fail_auto_streaming def test_fill_null() -> None: df = pl.DataFrame({"a": [1, 2], "b": [3, None]}) assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]})) diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py index 91b7f6e3a20f..9fa5751d8098 100644 --- a/py-polars/tests/unit/dataframe/test_serde.py +++ b/py-polars/tests/unit/dataframe/test_serde.py @@ -40,6 +40,7 @@ def test_df_serde_roundtrip_json(df: pl.DataFrame) -> None: assert_frame_equal(result, df, categorical_as_str=True) +@pytest.mark.may_fail_auto_streaming def test_df_serde(df: pl.DataFrame) -> None: serialized = df.serialize() assert isinstance(serialized, bytes) @@ -47,6 +48,7 @@ def test_df_serde(df: pl.DataFrame) -> None: assert_frame_equal(result, df) +@pytest.mark.may_fail_auto_streaming def test_df_serde_json_stringio(df: pl.DataFrame) -> None: serialized = df.serialize(format="json") assert isinstance(serialized, str) diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 3ab60c6aee5d..42dbf8fb35da 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -625,6 +625,7 @@ def test_categorical_fill_null_existing_category() -> None: @pytest.mark.usefixtures("test_global_and_local") +@pytest.mark.may_fail_auto_streaming def test_categorical_fill_null_stringcache() -> None: df = pl.LazyFrame( {"index": [1, 2, 3], "cat": ["a", "b", None]}, @@ -849,6 +850,7 @@ def test_cat_preserve_lexical_ordering_on_concat() -> None: @pytest.mark.usefixtures("test_global_and_local") +@pytest.mark.may_fail_auto_streaming def test_cat_append_lexical_sorted_flag() -> None: df = pl.DataFrame({"x": [0, 1, 1], "y": ["B", "B", "A"]}).with_columns( pl.col("y").cast(pl.Categorical(ordering="lexical")) @@ -932,7 +934,6 @@ def test_categorical_unique() -> None: assert s.unique().sort().to_list() == [None, "a", "b"] -@pytest.mark.may_fail_auto_streaming @pytest.mark.usefixtures("test_global_and_local") def test_categorical_unique_20539() -> None: df = pl.DataFrame({"number": [1, 1, 2, 2, 3], "letter": ["a", "b", "b", "c", "c"]}) @@ -953,7 +954,6 @@ def test_categorical_unique_20539() -> None: } -@pytest.mark.may_fail_auto_streaming @pytest.mark.usefixtures("test_global_and_local") def test_categorical_prefill() -> None: # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443 diff --git a/py-polars/tests/unit/datatypes/test_enum.py b/py-polars/tests/unit/datatypes/test_enum.py index 37f0ceb904ac..6bd7b5337f47 100644 --- a/py-polars/tests/unit/datatypes/test_enum.py +++ b/py-polars/tests/unit/datatypes/test_enum.py @@ -707,6 +707,7 @@ class Number(*EnumBase): # type: ignore[misc] assert_series_equal(expected, s) +@pytest.mark.may_fail_auto_streaming def test_read_enum_from_csv() -> None: df = pl.DataFrame( { diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 4526dbacb303..65b0e32e559e 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -831,6 +831,7 @@ def test_list_list_sum_exception_12935() -> None: pl.Series([[1], [2]]).sum() +@pytest.mark.may_fail_auto_streaming def test_null_list_categorical_16405() -> None: df = pl.DataFrame( [(None, "foo")], diff --git a/py-polars/tests/unit/functions/test_when_then.py b/py-polars/tests/unit/functions/test_when_then.py index afc6732e5d1b..a7c9abf42fe6 100644 --- a/py-polars/tests/unit/functions/test_when_then.py +++ b/py-polars/tests/unit/functions/test_when_then.py @@ -232,6 +232,7 @@ def test_object_when_then_4702() -> None: } +@pytest.mark.may_fail_auto_streaming def test_comp_categorical_lit_dtype() -> None: df = pl.DataFrame( data={"column": ["a", "b", "e"], "values": [1, 5, 9]}, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 7cdc1793ad5c..8e0692af2291 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1501,7 +1501,6 @@ def test_csv_categorical_lifetime() -> None: assert (df["a"] == df["b"]).to_list() == [False, False, None] -@pytest.mark.may_fail_auto_streaming def test_csv_categorical_categorical_merge() -> None: N = 50 f = io.BytesIO() diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 7b90967ca20d..c2f70ce08453 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -30,6 +30,7 @@ from tests.unit.conftest import MemoryUsage +@pytest.mark.may_fail_auto_streaming def test_round_trip(df: pl.DataFrame) -> None: f = io.BytesIO() df.write_parquet(f) @@ -37,6 +38,7 @@ def test_round_trip(df: pl.DataFrame) -> None: assert_frame_equal(pl.read_parquet(f), df) +@pytest.mark.may_fail_auto_streaming def test_scan_round_trip(df: pl.DataFrame) -> None: f = io.BytesIO() df.write_parquet(f) @@ -832,6 +834,7 @@ def test_parquet_string_rle_encoding() -> None: ) +@pytest.mark.may_fail_auto_streaming def test_sliced_dict_with_nulls_14904() -> None: df = ( pl.DataFrame({"x": [None, None]}) @@ -2434,6 +2437,7 @@ def test_dict_masked( @pytest.mark.usefixtures("test_global_and_local") +@pytest.mark.may_fail_auto_streaming def test_categorical_sliced_20017() -> None: f = io.BytesIO() df = ( @@ -2611,6 +2615,7 @@ def test_parquet_unsupported_dictionary_to_pl_17945() -> None: ) +@pytest.mark.may_fail_auto_streaming def test_parquet_cast_to_cat() -> None: t = pa.table( { diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index cd9cf2dc9982..57a55fa5aaa2 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -500,6 +500,7 @@ def test_read_invalid_worksheet( (pl.read_ods, "path_ods_mixed", {}), ], ) +@pytest.mark.may_fail_auto_streaming def test_read_mixed_dtype_columns( read_spreadsheet: Callable[..., dict[str, pl.DataFrame]], source: str, diff --git a/py-polars/tests/unit/operations/namespaces/test_categorical.py b/py-polars/tests/unit/operations/namespaces/test_categorical.py index 396f3ae092f4..98e83f0880f7 100644 --- a/py-polars/tests/unit/operations/namespaces/test_categorical.py +++ b/py-polars/tests/unit/operations/namespaces/test_categorical.py @@ -63,7 +63,6 @@ def test_categorical_lexical_ordering_after_concat() -> None: } -@pytest.mark.may_fail_auto_streaming @pytest.mark.usefixtures("test_global_and_local") def test_sort_categoricals_6014_internal() -> None: # create basic categorical diff --git a/py-polars/tests/unit/operations/test_hist.py b/py-polars/tests/unit/operations/test_hist.py index a3eda321808d..33df7e42ea64 100644 --- a/py-polars/tests/unit/operations/test_hist.py +++ b/py-polars/tests/unit/operations/test_hist.py @@ -335,6 +335,7 @@ def test_hist() -> None: assert_frame_equal(out, expected) +@pytest.mark.may_fail_auto_streaming def test_hist_all_null() -> None: s = pl.Series([None], dtype=pl.Float64) out = s.hist() @@ -446,6 +447,7 @@ def test_hist_max_boundary_20133() -> None: assert result["count"].sum() == 2 +@pytest.mark.may_fail_auto_streaming def test_hist_same_values_20030() -> None: out = pl.Series([1, 1]).hist(bin_count=2) expected = pl.DataFrame( @@ -458,6 +460,7 @@ def test_hist_same_values_20030() -> None: assert_frame_equal(out, expected) +@pytest.mark.may_fail_auto_streaming def test_hist_breakpoint_accuracy() -> None: s = pl.Series([1, 2, 3, 4]) out = s.hist(bin_count=3) diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 7af0cd8d2289..dc5c1dfdaa5c 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -323,6 +323,7 @@ def test_is_in_with_wildcard_13809() -> None: @pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["a", "b", "c", "d"])]) +@pytest.mark.may_fail_auto_streaming def test_cat_is_in_from_str(dtype: pl.DataType) -> None: s = pl.Series(["c", "c", "b"], dtype=dtype) @@ -334,6 +335,7 @@ def test_cat_is_in_from_str(dtype: pl.DataType) -> None: @pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["a", "b", "c", "d"])]) +@pytest.mark.may_fail_auto_streaming def test_cat_list_is_in_from_cat(dtype: pl.DataType) -> None: df = pl.DataFrame( [ @@ -359,6 +361,7 @@ def test_cat_list_is_in_from_cat(dtype: pl.DataType) -> None: ("e", [False, False, False, None, False]), ], ) +@pytest.mark.may_fail_auto_streaming def test_cat_list_is_in_from_cat_single(val: str | None, expected: list[bool]) -> None: df = pl.Series( "li", diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index d32ea5e49eed..3f33ae2c3c3b 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -64,7 +64,7 @@ def test_semi_anti_join() -> None: } -@pytest.mark.may_fail_auto_streaming # flaky in CI, https://github.com/pola-rs/polars/issues/20943 +@pytest.mark.may_fail_auto_streaming def test_join_same_cat_src() -> None: df = pl.DataFrame( data={"column": ["a", "a", "b"], "more": [1, 2, 3]}, diff --git a/py-polars/tests/unit/operations/test_replace_strict.py b/py-polars/tests/unit/operations/test_replace_strict.py index 14f99585e64e..b30fd94b8055 100644 --- a/py-polars/tests/unit/operations/test_replace_strict.py +++ b/py-polars/tests/unit/operations/test_replace_strict.py @@ -351,6 +351,7 @@ def test_replace_strict_str_to_int() -> None: (contextlib.nullcontext(), pl.Enum(["a", "b", "OTHER"])), ], ) +@pytest.mark.may_fail_auto_streaming def test_replace_strict_cat_str( context: contextlib.AbstractContextManager, # type: ignore[type-arg] dtype: pl.DataType, @@ -378,6 +379,7 @@ def test_replace_strict_cat_str( @pytest.mark.parametrize( "context", [pl.StringCache(), pytest.warns(CategoricalRemappingWarning)] ) +@pytest.mark.may_fail_auto_streaming def test_replace_strict_cat_cat( context: contextlib.AbstractContextManager, # type: ignore[type-arg] ) -> None: diff --git a/py-polars/tests/unit/operations/test_sets.py b/py-polars/tests/unit/operations/test_sets.py index 689d593f76c1..8414133c0ec9 100644 --- a/py-polars/tests/unit/operations/test_sets.py +++ b/py-polars/tests/unit/operations/test_sets.py @@ -46,6 +46,7 @@ def test_set_intersection_st_17129() -> None: ), ], ) +@pytest.mark.may_fail_auto_streaming def test_set_operations_cats(set_operation: str, outcome: list[set[str]]) -> None: with pytest.warns(CategoricalRemappingWarning): df = pl.DataFrame( diff --git a/py-polars/tests/unit/operations/test_shift.py b/py-polars/tests/unit/operations/test_shift.py index 7ad6028bf3ad..62a4bd313563 100644 --- a/py-polars/tests/unit/operations/test_shift.py +++ b/py-polars/tests/unit/operations/test_shift.py @@ -2,6 +2,8 @@ from datetime import date +import pytest + import polars as pl from polars.testing import assert_frame_equal, assert_series_equal @@ -74,6 +76,7 @@ def test_shift_expr() -> None: assert out.to_dict(as_series=False) == {"a": [5, 5, 1, 2, 3], "b": [5, 5, 1, 2, 3]} +@pytest.mark.may_fail_auto_streaming def test_shift_categorical() -> None: df = pl.Series("a", ["a", "b"], dtype=pl.Categorical).to_frame() diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index a9c61d76ecc9..8f8c197abdb2 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -621,7 +621,6 @@ def test_sort_by_in_over_5499() -> None: } -@pytest.mark.may_fail_auto_streaming def test_merge_sorted() -> None: df_a = ( pl.datetime_range( diff --git a/py-polars/tests/unit/operations/test_unpivot.py b/py-polars/tests/unit/operations/test_unpivot.py index 177548721f2d..26cfb98d97b4 100644 --- a/py-polars/tests/unit/operations/test_unpivot.py +++ b/py-polars/tests/unit/operations/test_unpivot.py @@ -115,6 +115,7 @@ def test_unpivot_categorical_global() -> None: } +@pytest.mark.may_fail_auto_streaming def test_unpivot_categorical_raise_19770() -> None: with pytest.raises(pl.exceptions.ComputeError): (pl.DataFrame({"x": ["foo"]}).cast(pl.Categorical).unpivot()) diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index db1e24b6691a..a0d705bb7cff 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -762,6 +762,7 @@ def test_init_nested_tuple() -> None: assert s3.dtype == pl.List(pl.Int32) +@pytest.mark.may_fail_auto_streaming def test_fill_null() -> None: s = pl.Series("a", [1, 2, None]) assert_series_equal(s.fill_null(strategy="forward"), pl.Series("a", [1, 2, 2])) diff --git a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 84aae68616a1..ed6b15af2335 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -19,7 +19,6 @@ pytestmark = pytest.mark.xdist_group("streaming") -@pytest.mark.may_fail_auto_streaming def test_streaming_categoricals_5921() -> None: with pl.StringCache(): out_lazy = (