Skip to content

Commit

Permalink
Speedup date_trunc (~20% time reduction) (#14593)
Browse files Browse the repository at this point in the history
* add bench for date_trunc

* Optimize date_trunc using try_unary

date_trunc_minute_1000  time:   [12.745 µs 12.794 µs 12.849 µs]
                        change: [-19.871% -19.544% -19.210%] (p = 0.00 <
0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild

remove todo

---------

Co-authored-by: Simon Vandel Sillesen <svs@svs-MacBook-Pro.local>
  • Loading branch information
simonvandel and Simon Vandel Sillesen authored Feb 12, 2025
1 parent 21dcc27 commit abb7669
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 25 deletions.
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ harness = false
name = "date_bin"
required-features = ["datetime_expressions"]

[[bench]]
harness = false
name = "date_trunc"
required-features = ["datetime_expressions"]

[[bench]]
harness = false
name = "to_char"
Expand Down
60 changes: 60 additions & 0 deletions datafusion/functions/benches/date_trunc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, TimestampSecondArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_common::ScalarValue;
use rand::rngs::ThreadRng;
use rand::Rng;

use datafusion_expr::ColumnarValue;
use datafusion_functions::datetime::date_trunc;

fn timestamps(rng: &mut ThreadRng) -> TimestampSecondArray {
let mut seconds = vec![];
for _ in 0..1000 {
seconds.push(rng.gen_range(0..1_000_000));
}

TimestampSecondArray::from(seconds)
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("date_trunc_minute_1000", |b| {
let mut rng = rand::thread_rng();
let timestamps_array = Arc::new(timestamps(&mut rng)) as ArrayRef;
let batch_len = timestamps_array.len();
let precision =
ColumnarValue::Scalar(ScalarValue::Utf8(Some("minute".to_string())));
let timestamps = ColumnarValue::Array(timestamps_array);
let udf = date_trunc();

b.iter(|| {
black_box(
udf.invoke_batch(&[precision.clone(), timestamps.clone()], batch_len)
.expect("date_trunc should work on valid values"),
)
})
});
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
55 changes: 30 additions & 25 deletions datafusion/functions/src/datetime/date_trunc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ impl ScalarUDFImpl for DateTruncFunc {
) -> Result<ColumnarValue> {
let parsed_tz = parse_tz(tz_opt)?;
let array = as_primitive_array::<T>(array)?;
let array = array
.iter()
.map(|x| general_date_trunc(T::UNIT, &x, parsed_tz, granularity.as_str()))
.collect::<Result<PrimitiveArray<T>>>()?
let array: PrimitiveArray<T> = array
.try_unary(|x| {
general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str())
})?
.with_timezone_opt(tz_opt.clone());
Ok(ColumnarValue::Array(Arc::new(array)))
}
Expand All @@ -199,7 +199,16 @@ impl ScalarUDFImpl for DateTruncFunc {
tz_opt: &Option<Arc<str>>,
) -> Result<ColumnarValue> {
let parsed_tz = parse_tz(tz_opt)?;
let value = general_date_trunc(T::UNIT, v, parsed_tz, granularity.as_str())?;
let value = if let Some(v) = v {
Some(general_date_trunc(
T::UNIT,
*v,
parsed_tz,
granularity.as_str(),
)?)
} else {
None
};
let value = ScalarValue::new_timestamp::<T>(value, tz_opt.clone());
Ok(ColumnarValue::Scalar(value))
}
Expand Down Expand Up @@ -417,46 +426,42 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i6
// truncates a single value with the given timeunit to the specified granularity
fn general_date_trunc(
tu: TimeUnit,
value: &Option<i64>,
value: i64,
tz: Option<Tz>,
granularity: &str,
) -> Result<Option<i64>, DataFusionError> {
) -> Result<i64, DataFusionError> {
let scale = match tu {
Second => 1_000_000_000,
Millisecond => 1_000_000,
Microsecond => 1_000,
Nanosecond => 1,
};

let Some(value) = value else {
return Ok(None);
};

// convert to nanoseconds
let nano = date_trunc_coarse(granularity, scale * value, tz)?;

let result = match tu {
Second => match granularity {
"minute" => Some(nano / 1_000_000_000 / 60 * 60),
_ => Some(nano / 1_000_000_000),
"minute" => nano / 1_000_000_000 / 60 * 60,
_ => nano / 1_000_000_000,
},
Millisecond => match granularity {
"minute" => Some(nano / 1_000_000 / 1_000 / 60 * 1_000 * 60),
"second" => Some(nano / 1_000_000 / 1_000 * 1_000),
_ => Some(nano / 1_000_000),
"minute" => nano / 1_000_000 / 1_000 / 60 * 1_000 * 60,
"second" => nano / 1_000_000 / 1_000 * 1_000,
_ => nano / 1_000_000,
},
Microsecond => match granularity {
"minute" => Some(nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000),
"second" => Some(nano / 1_000 / 1_000_000 * 1_000_000),
"millisecond" => Some(nano / 1_000 / 1_000 * 1_000),
_ => Some(nano / 1_000),
"minute" => nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000,
"second" => nano / 1_000 / 1_000_000 * 1_000_000,
"millisecond" => nano / 1_000 / 1_000 * 1_000,
_ => nano / 1_000,
},
_ => match granularity {
"minute" => Some(nano / 1_000_000_000 / 60 * 1_000_000_000 * 60),
"second" => Some(nano / 1_000_000_000 * 1_000_000_000),
"millisecond" => Some(nano / 1_000_000 * 1_000_000),
"microsecond" => Some(nano / 1_000 * 1_000),
_ => Some(nano),
"minute" => nano / 1_000_000_000 / 60 * 1_000_000_000 * 60,
"second" => nano / 1_000_000_000 * 1_000_000_000,
"millisecond" => nano / 1_000_000 * 1_000_000,
"microsecond" => nano / 1_000 * 1_000,
_ => nano,
},
};
Ok(result)
Expand Down

0 comments on commit abb7669

Please sign in to comment.