Skip to content

Commit a526122

Browse files
adamreeveritchie46
authored and committed
Add higher level join_between method
1 parent ece0079 commit a526122

File tree

3 files changed

+156
-0
lines changed

3 files changed

+156
-0
lines changed

py-polars/polars/dataframe/frame.py

+52
Original file line numberDiff line numberDiff line change
@@ -7124,6 +7124,58 @@ def inequality_join(
71247124
.collect(_eager=True)
71257125
)
71267126

7127+
def join_between(
    self,
    other: DataFrame,
    *,
    left_on: str | Expr,
    right_on_lower: str | Expr,
    right_on_upper: str | Expr,
    exclusive_lower: bool = False,
    exclusive_upper: bool = True,
    suffix: str = "_right",
) -> DataFrame:
    """
    Join rows of this table to the intervals of another table containing them.

    Each row of this table is paired with every row of `other` whose interval,
    delimited by `right_on_lower` and `right_on_upper`, contains the `left_on`
    value. A row from this table may therefore appear zero, one or several
    times in the result, and the relative order of rows may differ between
    the input and output tables.

    The work is delegated to the lazy implementation and collected eagerly.

    Parameters
    ----------
    other
        DataFrame to join with.
    left_on
        Join column of the left table.
    right_on_lower
        Lower bound of the interval in the other table.
    right_on_upper
        Upper bound of the interval in the other table.
    exclusive_lower
        Whether the lower bound of the interval is an exclusive bound.
        Defaults to `False`.
    exclusive_upper
        Whether the upper bound of the interval is an exclusive bound.
        Defaults to `True`.
    suffix
        Suffix to append to columns with a duplicate name.

    Returns
    -------
    DataFrame
        The joined table.

    Raises
    ------
    TypeError
        If `other` is not a DataFrame.
    """
    if isinstance(other, DataFrame):
        lazy_joined = self.lazy().join_between(
            other.lazy(),
            left_on=left_on,
            right_on_lower=right_on_lower,
            right_on_upper=right_on_upper,
            exclusive_lower=exclusive_lower,
            exclusive_upper=exclusive_upper,
            suffix=suffix,
        )
        return lazy_joined.collect(_eager=True)

    msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}"
    raise TypeError(msg)
7178+
71277179
def map_rows(
71287180
self,
71297181
function: Callable[[tuple[Any, ...]], Any],

py-polars/polars/lazyframe/frame.py

+57
Original file line numberDiff line numberDiff line change
@@ -4607,6 +4607,63 @@ def join(
46074607
)
46084608
)
46094609

4610+
def join_between(
    self,
    other: LazyFrame,
    *,
    left_on: str | Expr,
    right_on_lower: str | Expr,
    right_on_upper: str | Expr,
    exclusive_lower: bool = False,
    exclusive_upper: bool = True,
    suffix: str = "_right",
) -> LazyFrame:
    """
    Join rows of this table to the intervals of another table containing them.

    Two inequality predicates are built from the arguments (one against the
    lower bound, one against the upper bound) and handed to the underlying
    inequality join. A row from this table may be included in zero or
    multiple rows in the result, and the relative order of rows may differ
    between the input and output tables.

    Parameters
    ----------
    other
        LazyFrame to join with.
    left_on
        Join column of the left table.
    right_on_lower
        Lower bound of the interval in the other table.
    right_on_upper
        Upper bound of the interval in the other table.
    exclusive_lower
        Whether the lower bound of the interval is an exclusive bound.
        When `False` (the default) the comparison is `left_on >= lower`,
        otherwise `left_on > lower`.
    exclusive_upper
        Whether the upper bound of the interval is an exclusive bound.
        When `True` (the default) the comparison is `left_on < upper`,
        otherwise `left_on <= upper`.
    suffix
        Suffix to append to columns with a duplicate name.

    Returns
    -------
    LazyFrame
        The lazily evaluated join.

    Raises
    ------
    TypeError
        If `other` is not a LazyFrame.
    """
    if not isinstance(other, LazyFrame):
        msg = f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}"
        raise TypeError(msg)

    # Normalise every input to an expression so the comparison operators
    # below build predicates instead of comparing plain Python strings.
    value = wrap_expr(parse_into_expression(left_on))
    lower_bound = wrap_expr(parse_into_expression(right_on_lower))
    upper_bound = wrap_expr(parse_into_expression(right_on_upper))

    if exclusive_lower:
        above_lower = value > lower_bound
    else:
        above_lower = value >= lower_bound

    if exclusive_upper:
        below_upper = value < upper_bound
    else:
        below_upper = value <= upper_bound

    predicates = [above_lower._pyexpr, below_upper._pyexpr]
    joined = self._ldf.inequality_join(other._ldf, predicates, suffix)
    return self._from_pyldf(joined)
4666+
46104667
def with_columns(
46114668
self,
46124669
*exprs: IntoExpr | Iterable[IntoExpr],

py-polars/tests/unit/operations/test_inequality_join.py

+47
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from datetime import datetime
34
from typing import TYPE_CHECKING, Any
45

56
import hypothesis.strategies as st
@@ -167,6 +168,52 @@ def test_ie_join_with_expressions() -> None:
167168
assert_frame_equal(actual, expected, check_row_order=False, check_exact=True)
168169

169170

171+
def test_join_between() -> None:
    """Check `join_between` with its default bounds (inclusive lower, exclusive upper)."""

    def at(minute: int, second: int) -> datetime:
        # All fixture timestamps fall within 15:xx:xx on 2024-08-26.
        return datetime(2024, 8, 26, 15, minute, second)

    events = pl.DataFrame(
        {
            "id": [0, 1, 2, 3, 4, 5],
            "time": [
                at(34, 30),
                at(35, 30),
                at(36, 30),
                at(37, 30),
                at(38, 0),
                at(39, 0),
            ],
        }
    )
    intervals = pl.DataFrame(
        {
            "id": [0, 1, 2],
            "start_time": [at(34, 0), at(35, 0), at(38, 0)],
            "end_time": [at(36, 0), at(37, 0), at(39, 0)],
        }
    )

    joined = events.join_between(
        intervals,
        left_on="time",
        right_on_lower="start_time",
        right_on_upper="end_time",
    )
    actual = joined.select("id", "id_right")

    # Event 1 lies in the two overlapping intervals 0 and 1, event 3 lies in
    # none, event 4 sits exactly on an (inclusive) start and event 5 exactly
    # on an (exclusive) end.
    expected = pl.DataFrame(
        {
            "id": [0, 1, 1, 2, 4],
            "id_right": [0, 0, 1, 1, 2],
        }
    )
    assert_frame_equal(actual, expected, check_row_order=False, check_exact=True)
215+
216+
170217
def _inequality_expression(col1: str, op: str, col2: str) -> pl.Expr:
171218
if op == "<":
172219
return pl.col(col1) < pl.col(col2)

0 commit comments

Comments
 (0)