@@ -33,11 +33,11 @@ pub struct IEJoinOptions {
33
33
pub operator2 : InequalityOperator ,
34
34
}
35
35
36
- /// Inequality join. Matches rows from this DataFrame with rows from another DataFrame
37
- /// using two inequality operators (one of [<, <=, >, >=]).
36
+ /// Inequality join. Matches rows between two DataFrames using two inequality operators
37
+ /// (one of [<, <=, >, >=]).
38
38
/// Based on Khayyat et al. 2015, "Lightning Fast and Space Efficient Inequality Joins"
39
39
/// and extended to work with duplicate values.
40
- pub fn join_dataframes (
40
+ pub fn iejoin (
41
41
left : & DataFrame ,
42
42
right : & DataFrame ,
43
43
selected_left : Vec < Series > ,
@@ -221,6 +221,8 @@ trait L1Array {
221
221
fn mark_visited ( & self , index : usize , bit_array : & mut FilteredBitArray ) ;
222
222
}
223
223
224
+ /// Find the position in the L1 array where we should begin checking for matches,
225
+ /// given the index in L1 corresponding to the current position in L2.
224
226
fn find_search_start_index < T > (
225
227
l1_array : & [ L1Item < T > ] ,
226
228
index : usize ,
@@ -284,6 +286,39 @@ where
284
286
}
285
287
}
286
288
289
+ fn find_matches_in_l1 < T > (
290
+ l1_array : & [ L1Item < T > ] ,
291
+ l1_index : usize ,
292
+ row_index : i64 ,
293
+ bit_array : & FilteredBitArray ,
294
+ op1 : InequalityOperator ,
295
+ left_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
296
+ right_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
297
+ ) -> i64
298
+ where
299
+ T : NumericNative ,
300
+ T : TotalOrd ,
301
+ {
302
+ debug_assert ! ( row_index > 0 ) ;
303
+ let mut match_count = 0 ;
304
+
305
+ // This entry comes from the left hand side DataFrame.
306
+ // Find all following entries in L1 (meaning they satisfy the first operator)
307
+ // that have already been visited (so satisfy the second operator).
308
+ // Because we use a stable sort for l2, we know that we won't find any
309
+ // matches for duplicate y values when traversing forwards in l1.
310
+ let start_index = find_search_start_index ( l1_array, l1_index, op1) ;
311
+ bit_array. on_set_bits_from ( start_index, |set_bit : usize | {
312
+ let right_row_index = l1_array[ set_bit] . row_index ;
313
+ debug_assert ! ( right_row_index < 0 ) ;
314
+ left_row_ids. append_value ( ( row_index - 1 ) as IdxSize ) ;
315
+ right_row_ids. append_value ( ( -right_row_index) as IdxSize - 1 ) ;
316
+ match_count += 1 ;
317
+ } ) ;
318
+
319
+ match_count
320
+ }
321
+
287
322
impl < T > L1Array for Vec < L1Item < T > >
288
323
where
289
324
T : NumericNative ,
@@ -296,27 +331,22 @@ where
296
331
left_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
297
332
right_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
298
333
) -> i64 {
299
- let mut match_count = 0 ;
300
334
let row_index = self [ l1_index] . row_index ;
301
335
let from_lhs = row_index > 0 ;
302
336
if from_lhs {
303
- // This entry comes from the left hand side DataFrame.
304
- // Find all following entries in L1 (meaning they satisfy the first operator)
305
- // that have already been visited (so satisfy the second operator).
306
- // Because we use a stable sort for l2, we know that we won't find any
307
- // matches for duplicate y values when traversing forwards in l1.
308
- let start_index = find_search_start_index ( self , l1_index, op1) ;
309
- bit_array. on_set_bits_from ( start_index, |set_bit : usize | {
310
- let right_row_index = self [ set_bit] . row_index ;
311
- debug_assert ! ( right_row_index < 0 ) ;
312
- left_row_ids. append_value ( ( row_index - 1 ) as IdxSize ) ;
313
- right_row_ids. append_value ( ( -right_row_index) as IdxSize - 1 ) ;
314
- match_count += 1 ;
315
- } ) ;
337
+ find_matches_in_l1 (
338
+ self ,
339
+ l1_index,
340
+ row_index,
341
+ bit_array,
342
+ op1,
343
+ left_row_ids,
344
+ right_row_ids,
345
+ )
316
346
} else {
317
347
bit_array. set_bit ( l1_index) ;
348
+ 0
318
349
}
319
- match_count
320
350
}
321
351
322
352
fn process_lhs_entry (
@@ -327,30 +357,35 @@ where
327
357
left_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
328
358
right_row_ids : & mut PrimitiveChunkedBuilder < IdxType > ,
329
359
) -> i64 {
330
- let mut match_count = 0 ;
331
360
let row_index = self [ l1_index] . row_index ;
332
361
let from_lhs = row_index > 0 ;
333
362
if from_lhs {
334
- let start_index = find_search_start_index ( self , l1_index, op1) ;
335
- bit_array. on_set_bits_from ( start_index, |set_bit : usize | {
336
- let right_row_index = self [ set_bit] . row_index ;
337
- debug_assert ! ( right_row_index < 0 ) ;
338
- left_row_ids. append_value ( ( row_index - 1 ) as IdxSize ) ;
339
- right_row_ids. append_value ( ( -right_row_index) as IdxSize - 1 ) ;
340
- match_count += 1 ;
341
- } ) ;
363
+ find_matches_in_l1 (
364
+ self ,
365
+ l1_index,
366
+ row_index,
367
+ bit_array,
368
+ op1,
369
+ left_row_ids,
370
+ right_row_ids,
371
+ )
372
+ } else {
373
+ 0
342
374
}
343
- match_count
344
375
}
345
376
346
377
fn mark_visited ( & self , index : usize , bit_array : & mut FilteredBitArray ) {
347
378
let from_lhs = self [ index] . row_index > 0 ;
379
+ // We only mark RHS entries as visited,
380
+ // so that we don't try to match LHS entries with other LHS entries.
348
381
if !from_lhs {
349
382
bit_array. set_bit ( index) ;
350
383
}
351
384
}
352
385
}
353
386
387
+ /// Create a vector of L1 items from the array of LHS x values concatenated with RHS x values
388
+ /// and their ordering.
354
389
fn build_l1_array < T > (
355
390
ca : & ChunkedArray < T > ,
356
391
order : & IdxCa ,
@@ -366,15 +401,20 @@ where
366
401
// Nulls should have been skipped over
367
402
. ok_or_else ( || polars_err ! ( ComputeError : "Unexpected null value in IEJoin data" ) ) ?;
368
403
let row_index = if index < right_df_offset {
404
+ // Row from LHS
369
405
index as i64 + 1
370
406
} else {
407
+ // Row from RHS
371
408
-( ( index - right_df_offset) as i64 ) - 1
372
409
} ;
373
410
array. push ( L1Item { row_index, value } ) ;
374
411
}
375
412
Ok ( Box :: new ( array) )
376
413
}
377
414
415
+ /// Create a vector of L2 items from the array of y values ordered according to the L1 order,
416
+ /// and their ordering. We don't need to store actual y values but only track whether we're at
417
+ /// the end of a run of equal values.
378
418
fn build_l2_array < T > ( ca : & ChunkedArray < T > , order : & IdxCa ) -> PolarsResult < Vec < L2Item > >
379
419
where
380
420
T : PolarsNumericType ,
0 commit comments