From abb7669098a546c9368f60f752dfa71cdcb99a6d Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Wed, 12 Feb 2025 02:11:56 +0100 Subject: [PATCH] Speedup `date_trunc` (~20% time reduction) (#14593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add bench for date_trunc * Optimize date_trunc using try_unary date_trunc_minute_1000 time: [12.745 µs 12.794 µs 12.849 µs] change: [-19.871% -19.544% -19.210%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild remove todo --------- Co-authored-by: Simon Vandel Sillesen --- datafusion/functions/Cargo.toml | 5 ++ datafusion/functions/benches/date_trunc.rs | 60 +++++++++++++++++++ .../functions/src/datetime/date_trunc.rs | 55 +++++++++-------- 3 files changed, 95 insertions(+), 25 deletions(-) create mode 100644 datafusion/functions/benches/date_trunc.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index a890b7c7d65b..c4186c39317c 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -133,6 +133,11 @@ harness = false name = "date_bin" required-features = ["datetime_expressions"] +[[bench]] +harness = false +name = "date_trunc" +required-features = ["datetime_expressions"] + [[bench]] harness = false name = "to_char" diff --git a/datafusion/functions/benches/date_trunc.rs b/datafusion/functions/benches/date_trunc.rs new file mode 100644 index 000000000000..d420b8f6ac70 --- /dev/null +++ b/datafusion/functions/benches/date_trunc.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, TimestampSecondArray}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ScalarValue; +use rand::rngs::ThreadRng; +use rand::Rng; + +use datafusion_expr::ColumnarValue; +use datafusion_functions::datetime::date_trunc; + +fn timestamps(rng: &mut ThreadRng) -> TimestampSecondArray { + let mut seconds = vec![]; + for _ in 0..1000 { + seconds.push(rng.gen_range(0..1_000_000)); + } + + TimestampSecondArray::from(seconds) +} + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("date_trunc_minute_1000", |b| { + let mut rng = rand::thread_rng(); + let timestamps_array = Arc::new(timestamps(&mut rng)) as ArrayRef; + let batch_len = timestamps_array.len(); + let precision = + ColumnarValue::Scalar(ScalarValue::Utf8(Some("minute".to_string()))); + let timestamps = ColumnarValue::Array(timestamps_array); + let udf = date_trunc(); + + b.iter(|| { + black_box( + udf.invoke_batch(&[precision.clone(), timestamps.clone()], batch_len) + .expect("date_trunc should work on valid values"), + ) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 4780f5f5b818..7c10cdd0029d 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -185,10 +185,10 @@ impl ScalarUDFImpl for DateTruncFunc { ) -> Result { let parsed_tz = parse_tz(tz_opt)?; let array = as_primitive_array::(array)?; - let array = array - .iter() - .map(|x| general_date_trunc(T::UNIT, &x, parsed_tz, granularity.as_str())) - .collect::>>()? + let array: PrimitiveArray = array + .try_unary(|x| { + general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str()) + })? .with_timezone_opt(tz_opt.clone()); Ok(ColumnarValue::Array(Arc::new(array))) } @@ -199,7 +199,16 @@ impl ScalarUDFImpl for DateTruncFunc { tz_opt: &Option>, ) -> Result { let parsed_tz = parse_tz(tz_opt)?; - let value = general_date_trunc(T::UNIT, v, parsed_tz, granularity.as_str())?; + let value = if let Some(v) = v { + Some(general_date_trunc( + T::UNIT, + *v, + parsed_tz, + granularity.as_str(), + )?) + } else { + None + }; let value = ScalarValue::new_timestamp::(value, tz_opt.clone()); Ok(ColumnarValue::Scalar(value)) } @@ -417,10 +426,10 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option) -> Result, + value: i64, tz: Option, granularity: &str, -) -> Result, DataFusionError> { +) -> Result { let scale = match tu { Second => 1_000_000_000, Millisecond => 1_000_000, @@ -428,35 +437,31 @@ fn general_date_trunc( Nanosecond => 1, }; - let Some(value) = value else { - return Ok(None); - }; - // convert to nanoseconds let nano = date_trunc_coarse(granularity, scale * value, tz)?; let result = match tu { Second => match granularity { - "minute" => Some(nano / 1_000_000_000 / 60 * 60), - _ => Some(nano / 1_000_000_000), + "minute" => nano / 1_000_000_000 / 60 * 60, + _ => nano / 1_000_000_000, }, Millisecond => match granularity { - "minute" => Some(nano / 1_000_000 / 1_000 / 60 * 1_000 * 60), - "second" => Some(nano / 1_000_000 / 1_000 * 1_000), - _ => Some(nano / 1_000_000), + "minute" => nano / 1_000_000 / 1_000 / 60 * 1_000 * 60, + "second" => nano / 1_000_000 / 1_000 * 1_000, + _ => nano / 1_000_000, }, Microsecond => match granularity { - "minute" => Some(nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000), - "second" => Some(nano / 1_000 / 1_000_000 * 1_000_000), - "millisecond" => Some(nano / 1_000 / 1_000 * 1_000), - _ => Some(nano / 1_000), + "minute" => nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000, + "second" => nano / 1_000 / 1_000_000 * 1_000_000, + "millisecond" => nano / 1_000 / 1_000 * 1_000, + _ => nano / 1_000, }, _ => match granularity { - "minute" => Some(nano / 1_000_000_000 / 60 * 1_000_000_000 * 60), - "second" => Some(nano / 1_000_000_000 * 1_000_000_000), - "millisecond" => Some(nano / 1_000_000 * 1_000_000), - "microsecond" => Some(nano / 1_000 * 1_000), - _ => Some(nano), + "minute" => nano / 1_000_000_000 / 60 * 1_000_000_000 * 60, + "second" => nano / 1_000_000_000 * 1_000_000_000, + "millisecond" => nano / 1_000_000 * 1_000_000, + "microsecond" => nano / 1_000 * 1_000, + _ => nano, }, }; Ok(result)