From 8b5b05a7c388987617620a2b49cd8a0b7bf020c8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 24 Jan 2025 15:31:30 +0100 Subject: [PATCH] fix: Warn if asof keys not sorted (#20887) --- .../polars-ops/src/frame/join/asof/groups.rs | 22 ++++++++++++- crates/polars-ops/src/frame/join/asof/mod.rs | 31 ++++++++++++++++--- crates/polars-ops/src/frame/join/mod.rs | 2 ++ crates/polars-python/src/lazyframe/general.rs | 4 ++- .../rust/user-guide/transformations/joins.rs | 2 ++ py-polars/polars/dataframe/frame.py | 6 ++++ py-polars/polars/lazyframe/frame.py | 7 ++++- 7 files changed, 66 insertions(+), 8 deletions(-) diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 99baa73226aa..09e1717057ec 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -485,6 +485,7 @@ pub trait AsofJoinBy: IntoDf { slice: Option<(i64, usize)>, coalesce: bool, allow_eq: bool, + check_sortedness: bool, ) -> PolarsResult { let (self_sliced_slot, other_sliced_slot, left_slice_s, right_slice_s); // Keeps temporaries alive. let (self_df, other_df, left_key, right_key); @@ -513,6 +514,7 @@ pub trait AsofJoinBy: IntoDf { &right_asof, tolerance.is_some(), left_by.is_empty() && right_by.is_empty(), + check_sortedness, )?; let mut left_by = self_df.select(left_by)?; @@ -579,6 +581,7 @@ pub trait AsofJoinBy: IntoDf { strategy: AsofStrategy, tolerance: Option>, allow_eq: bool, + check_sortedness: bool, ) -> PolarsResult where I: IntoIterator, @@ -590,8 +593,18 @@ pub trait AsofJoinBy: IntoDf { let left_key = self_df.column(left_on)?.as_materialized_series(); let right_key = other.column(right_on)?.as_materialized_series(); self_df._join_asof_by( - other, left_key, right_key, left_by, right_by, strategy, tolerance, None, None, true, + other, + left_key, + right_key, + left_by, + right_by, + strategy, + tolerance, + None, + None, + true, allow_eq, + check_sortedness, ) } } @@ -624,6 +637,7 @@ mod test { AsofStrategy::Backward, None, true, + true, )?; assert_eq!(out.get_column_names(), &["a", "b", "right_vals"]); let out = out.column("right_vals").unwrap(); @@ -668,6 +682,7 @@ mod test { AsofStrategy::Backward, None, true, + true, )?; let a = out.column("bid_right").unwrap(); let a = a.f64().unwrap(); @@ -684,6 +699,7 @@ mod test { AsofStrategy::Backward, None, true, + true, )?; let a = out.column("bid_right").unwrap(); let a = a.f64().unwrap(); @@ -715,6 +731,7 @@ mod test { AsofStrategy::Forward, None, true, + true, )?; assert_eq!(out.get_column_names(), &["a", "b", "right_vals"]); let out = out.column("right_vals").unwrap(); @@ -733,6 +750,7 @@ mod test { AsofStrategy::Forward, Some(AnyValue::Int32(1)), true, + true, )?; assert_eq!(out.get_column_names(), &["a", "b", "right_vals"]); let out = out.column("right_vals").unwrap(); @@ -802,6 +820,7 @@ mod test { AsofStrategy::Forward, None, true, + true, )?; let a = out.column("bid_right").unwrap(); let a = a.f64().unwrap(); @@ -824,6 +843,7 @@ mod test { AsofStrategy::Forward, None, true, + true, )?; let a = out.column("bid_right").unwrap(); let a = a.f64().unwrap(); diff --git a/crates/polars-ops/src/frame/join/asof/mod.rs b/crates/polars-ops/src/frame/join/asof/mod.rs index ebfe4dd10f6d..627ad402739c 100644 --- a/crates/polars-ops/src/frame/join/asof/mod.rs +++ b/crates/polars-ops/src/frame/join/asof/mod.rs @@ -200,13 +200,15 @@ pub struct AsOfOptions { pub right_by: Option>, /// Allow equal matches pub allow_eq: bool, + pub check_sortedness: bool, } fn check_asof_columns( a: &Series, b: &Series, has_tolerance: bool, - check_sorted: bool, + sorted_err: bool, + check_sortedness: bool, ) -> PolarsResult<()> { let dtype_a = a.dtype(); let dtype_b = b.dtype(); @@ -228,9 +230,21 @@ fn check_asof_columns( ComputeError: "mismatching key dtypes in asof-join: `{}` and `{}`", a.dtype(), b.dtype() ); - if check_sorted { - a.ensure_sorted_arg("asof_join")?; - b.ensure_sorted_arg("asof_join")?; + if check_sortedness { + if sorted_err { + a.ensure_sorted_arg("asof_join")?; + b.ensure_sorted_arg("asof_join")?; + } else { + let msg = |side| { + format!("{side} key of asof join is not sorted.\n\nThis can lead to invalid results. Ensure the asof key is sorted") + }; + if a.ensure_sorted_arg("asof_join").is_err() { + polars_warn!(msg("left")) + } + if b.ensure_sorted_arg("asof_join").is_err() { + polars_warn!(msg("right")) + } + } } Ok(()) } @@ -261,10 +275,17 @@ pub trait AsofJoin: IntoDf { slice: Option<(i64, usize)>, coalesce: bool, allow_eq: bool, + check_sortedness: bool, ) -> PolarsResult { let self_df = self.to_df(); - check_asof_columns(left_key, right_key, tolerance.is_some(), true)?; + check_asof_columns( + left_key, + right_key, + tolerance.is_some(), + true, + check_sortedness, + )?; let left_key = left_key.to_physical_repr(); let right_key = right_key.to_physical_repr(); diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index b7d001577acc..b463791e7e05 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -309,6 +309,7 @@ pub trait DataFrameJoinOps: IntoDf { args.slice, should_coalesce, options.allow_eq, + options.check_sortedness, ), (None, None) => left_df._join_asof( other, @@ -320,6 +321,7 @@ pub trait DataFrameJoinOps: IntoDf { args.slice, should_coalesce, options.allow_eq, + options.check_sortedness, ), _ => { panic!("expected by arguments on both sides") diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 4d29e55537fd..ddae833410e0 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -990,7 +990,7 @@ impl PyLazyFrame { } #[cfg(feature = "asof_join")] - #[pyo3(signature = (other, left_on, right_on, left_by, right_by, allow_parallel, force_parallel, suffix, strategy, tolerance, tolerance_str, coalesce, allow_eq))] + #[pyo3(signature = (other, left_on, right_on, left_by, right_by, allow_parallel, force_parallel, suffix, strategy, tolerance, tolerance_str, coalesce, allow_eq, check_sortedness))] fn join_asof( &self, other: Self, @@ -1006,6 +1006,7 @@ impl PyLazyFrame { tolerance_str: Option, coalesce: bool, allow_eq: bool, + check_sortedness: bool, ) -> PyResult { let coalesce = if coalesce { JoinCoalesce::CoalesceColumns @@ -1031,6 +1032,7 @@ impl PyLazyFrame { tolerance: tolerance.map(|t| t.0.into_static()), tolerance_str: tolerance_str.map(|s| s.into()), allow_eq, + check_sortedness, })) .suffix(suffix) .finish() diff --git a/docs/source/src/rust/user-guide/transformations/joins.rs b/docs/source/src/rust/user-guide/transformations/joins.rs index fffef2cdbeb3..45037abf3eeb 100644 --- a/docs/source/src/rust/user-guide/transformations/joins.rs +++ b/docs/source/src/rust/user-guide/transformations/joins.rs @@ -253,6 +253,7 @@ fn main() -> Result<(), Box> { AsofStrategy::Backward, None, true, + true, )?; println!("{}", result); // --8<-- [end:asof] @@ -267,6 +268,7 @@ fn main() -> Result<(), Box> { AsofStrategy::Backward, Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), true, + true, )?; println!("{}", result); // --8<-- [end:asof-tolerance] diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ae1486c0f766..257a19ee6604 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -6981,6 +6981,7 @@ def join_asof( force_parallel: bool = False, coalesce: bool = True, allow_exact_matches: bool = True, + check_sortedness: bool = True, ) -> DataFrame: """ Perform an asof join. @@ -7073,6 +7074,10 @@ def join_asof( (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same ``on`` value (i.e., strictly less-than / strictly greater-than). + check_sortedness + Check the sortedness of the asof keys. If the keys are not sorted Polars + will error, or in case of 'by' argument raise a warning. This might become + a hard error in the future. Examples -------- @@ -7306,6 +7311,7 @@ def join_asof( force_parallel=force_parallel, coalesce=coalesce, allow_exact_matches=allow_exact_matches, + check_sortedness=check_sortedness, ) .collect(_eager=True) ) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index bf386132bb47..34854c66c437 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4267,6 +4267,7 @@ def join_asof( force_parallel: bool = False, coalesce: bool = True, allow_exact_matches: bool = True, + check_sortedness: bool = True, ) -> LazyFrame: """ Perform an asof join. @@ -4359,7 +4360,10 @@ def join_asof( (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same ``on`` value (i.e., strictly less-than / strictly greater-than). - + check_sortedness + Check the sortedness of the asof keys. If the keys are not sorted Polars + will error, or in case of 'by' argument raise a warning. This might become + a hard error in the future. Examples @@ -4616,6 +4620,7 @@ def join_asof( tolerance_str, coalesce=coalesce, allow_eq=allow_exact_matches, + check_sortedness=check_sortedness, ) )