From e4b78c7ed40c248cfc9596d53f1813b62c668249 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Feb 2025 10:22:22 -0500 Subject: [PATCH 01/71] minor: simplify `union_extract` code (#14640) * minor: simplify `union_extract` code * Fix CI tests on main --- .../functions/src/core/union_extract.rs | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/datafusion/functions/src/core/union_extract.rs b/datafusion/functions/src/core/union_extract.rs index d54627f73598..95814197d8df 100644 --- a/datafusion/functions/src/core/union_extract.rs +++ b/datafusion/functions/src/core/union_extract.rs @@ -18,6 +18,7 @@ use arrow::array::Array; use arrow::datatypes::{DataType, FieldRef, UnionFields}; use datafusion_common::cast::as_union_array; +use datafusion_common::utils::take_function_args; use datafusion_common::{ exec_datafusion_err, exec_err, internal_err, Result, ScalarValue, }; @@ -113,22 +114,15 @@ impl ScalarUDFImpl for UnionExtractFun { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let args = args.args; + let [array, target_name] = take_function_args("union_extract", args.args)?; - if args.len() != 2 { - return exec_err!( - "union_extract expects 2 arguments, got {} instead", - args.len() - ); - } - - let target_name = match &args[1] { + let target_name = match target_name { ColumnarValue::Scalar(ScalarValue::Utf8(Some(target_name))) => Ok(target_name), ColumnarValue::Scalar(ScalarValue::Utf8(None)) => exec_err!("union_extract second argument must be a non-null string literal, got a null instead"), - _ => exec_err!("union_extract second argument must be a non-null string literal, got {} instead", &args[1].data_type()), - }; + _ => exec_err!("union_extract second argument must be a non-null string literal, got {} instead", target_name.data_type()), + }?; - match &args[0] { + match array { ColumnarValue::Array(array) => { let union_array = as_union_array(&array).map_err(|_| { exec_datafusion_err!( @@ -140,19 +134,16 @@ impl ScalarUDFImpl for UnionExtractFun { Ok(ColumnarValue::Array( arrow::compute::kernels::union_extract::union_extract( union_array, - target_name?, + &target_name, )?, )) } ColumnarValue::Scalar(ScalarValue::Union(value, fields, _)) => { - let target_name = target_name?; - let (target_type_id, target) = find_field(fields, target_name)?; + let (target_type_id, target) = find_field(&fields, &target_name)?; let result = match value { - Some((type_id, value)) if target_type_id == *type_id => { - *value.clone() - } - _ => ScalarValue::try_from(target.data_type())?, + Some((type_id, value)) if target_type_id == type_id => *value, + _ => ScalarValue::try_new_null(target.data_type())?, }; Ok(ColumnarValue::Scalar(result)) From 19fe44cf2f30cbdd63d4a4f52c74055163c6cc38 Mon Sep 17 00:00:00 2001 From: Gabriel <45515538+gabotechs@users.noreply.github.com> Date: Tue, 18 Feb 2025 01:20:21 +0100 Subject: [PATCH 02/71] make DefaultSubstraitProducer public (#14721) --- datafusion/substrait/src/logical_plan/producer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index d795a869568b..d7cc25d7bf65 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -369,7 +369,7 @@ pub trait SubstraitProducer: Send + Sync + Sized { } } -struct DefaultSubstraitProducer<'a> { +pub struct DefaultSubstraitProducer<'a> { extensions: Extensions, serializer_registry: &'a 
dyn SerializerRegistry, } From 8f2f5376337cc0b8efd73cee7f9cc0c519d51ed6 Mon Sep 17 00:00:00 2001 From: irenjj Date: Tue, 18 Feb 2025 20:25:03 +0800 Subject: [PATCH 03/71] chore: Migrate Encoding functions to invoke_with_args (#14727) --- datafusion/functions/src/encoding/inner.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 68a6d1006052..51e8c6968866 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -103,12 +103,11 @@ impl ScalarUDFImpl for EncodeFunc { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - encode(args) + encode(&args.args) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { @@ -183,12 +182,11 @@ impl ScalarUDFImpl for DecodeFunc { Ok(arg_types[0].to_owned()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - decode(args) + decode(&args.args) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { From 6dfbdba0cfa61810fde49d453661c30c0ebda534 Mon Sep 17 00:00:00 2001 From: niebayes Date: Tue, 18 Feb 2025 20:30:27 +0800 Subject: [PATCH 04/71] chore: Migrate Core Functions to invoke_with_args (#14725) * migrate version * migrate more * also migrate invoke --- datafusion/functions/src/core/arrow_cast.rs | 10 +++------- datafusion/functions/src/core/arrowtypeof.rs | 10 +++------- datafusion/functions/src/core/coalesce.rs | 11 +++++------ datafusion/functions/src/core/getfield.rs | 14 ++++++-------- datafusion/functions/src/core/greatest.rs | 6 +++--- datafusion/functions/src/core/least.rs | 6 +++--- datafusion/functions/src/core/nullif.rs | 10 +++------- datafusion/functions/src/core/nvl.rs | 11 ++++------- datafusion/functions/src/core/nvl2.rs | 10 +++------- datafusion/functions/src/core/version.rs | 20 +++++++++++--------- 10 files changed, 44 insertions(+), 64 deletions(-) diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index 1ba5197fe2fb..2686dbf8be3c 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -29,8 +29,8 @@ use std::any::Any; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ - ColumnarValue, Documentation, Expr, ReturnInfo, ReturnTypeArgs, ScalarUDFImpl, - Signature, Volatility, + ColumnarValue, Documentation, Expr, ReturnInfo, ReturnTypeArgs, ScalarFunctionArgs, + ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; @@ -138,11 +138,7 @@ impl ScalarUDFImpl for ArrowCastFunc { ) } - fn invoke_batch( - &self, - _args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { internal_err!("arrow_cast should have been simplified to cast") } diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs index 653ca6569896..2509ed246ac7 100644 --- a/datafusion/functions/src/core/arrowtypeof.rs +++ b/datafusion/functions/src/core/arrowtypeof.rs @@ -17,7 +17,7 @@ use arrow::datatypes::DataType; use datafusion_common::{utils::take_function_args, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_expr::{ColumnarValue, Documentation, 
ScalarFunctionArgs}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; @@ -75,12 +75,8 @@ impl ScalarUDFImpl for ArrowTypeOfFunc { Ok(DataType::Utf8) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - let [arg] = take_function_args(self.name(), args)?; + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [arg] = take_function_args(self.name(), args.args)?; let input_data_type = arg.data_type(); Ok(ColumnarValue::Scalar(ScalarValue::from(format!( "{input_data_type}" diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 602fe0fd9585..ba20c23828eb 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -21,7 +21,9 @@ use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, internal_err, Result}; use datafusion_expr::binary::try_type_union_resolution; -use datafusion_expr::{ColumnarValue, Documentation, ReturnInfo, ReturnTypeArgs}; +use datafusion_expr::{ + ColumnarValue, Documentation, ReturnInfo, ReturnTypeArgs, ScalarFunctionArgs, +}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use itertools::Itertools; @@ -93,11 +95,8 @@ impl ScalarUDFImpl for CoalesceFunc { } /// coalesce evaluates to the first value which is not NULL - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let args = args.args; // do not accept 0 arguments. if args.is_empty() { return exec_err!( diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index d971001dbf78..d667d0d8c151 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -24,7 +24,9 @@ use datafusion_common::{ exec_err, internal_err, plan_datafusion_err, utils::take_function_args, Result, ScalarValue, }; -use datafusion_expr::{ColumnarValue, Documentation, Expr, ReturnInfo, ReturnTypeArgs}; +use datafusion_expr::{ + ColumnarValue, Documentation, Expr, ReturnInfo, ReturnTypeArgs, ScalarFunctionArgs, +}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; @@ -170,12 +172,8 @@ impl ScalarUDFImpl for GetFieldFunc { } } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - let [base, field_name] = take_function_args(self.name(), args)?; + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [base, field_name] = take_function_args(self.name(), args.args)?; if base.data_type().is_null() { return Ok(ColumnarValue::Scalar(ScalarValue::Null)); @@ -229,7 +227,7 @@ impl ScalarUDFImpl for GetFieldFunc { } (DataType::Struct(_), ScalarValue::Utf8(Some(k))) => { let as_struct_array = as_struct_array(&array)?; - match as_struct_array.column_by_name(k) { + match as_struct_array.column_by_name(&k) { None => exec_err!("get indexed field {k} not found in struct"), Some(col) => Ok(ColumnarValue::Array(Arc::clone(col))), } diff --git a/datafusion/functions/src/core/greatest.rs b/datafusion/functions/src/core/greatest.rs index 6864da2d5c06..2d7ad2be3986 100644 --- a/datafusion/functions/src/core/greatest.rs +++ b/datafusion/functions/src/core/greatest.rs @@ -23,7 +23,7 @@ use arrow::compute::SortOptions; use 
arrow::datatypes::DataType; use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_doc::Documentation; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; @@ -143,8 +143,8 @@ impl ScalarUDFImpl for GreatestFunc { Ok(arg_types[0].clone()) } - fn invoke(&self, args: &[ColumnarValue]) -> Result { - super::greatest_least_utils::execute_conditional::(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + super::greatest_least_utils::execute_conditional::(&args.args) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { diff --git a/datafusion/functions/src/core/least.rs b/datafusion/functions/src/core/least.rs index a26b14babf2c..662dac3e699f 100644 --- a/datafusion/functions/src/core/least.rs +++ b/datafusion/functions/src/core/least.rs @@ -23,7 +23,7 @@ use arrow::compute::SortOptions; use arrow::datatypes::DataType; use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_doc::Documentation; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; @@ -156,8 +156,8 @@ impl ScalarUDFImpl for LeastFunc { Ok(arg_types[0].clone()) } - fn invoke(&self, args: &[ColumnarValue]) -> Result { - super::greatest_least_utils::execute_conditional::(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + super::greatest_least_utils::execute_conditional::(&args.args) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 14366767523f..ee29714da16b 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -16,7 +16,7 @@ // under the License. 
use arrow::datatypes::DataType; -use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs}; use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::nullif::nullif; @@ -101,12 +101,8 @@ impl ScalarUDFImpl for NullIfFunc { Ok(arg_types[0].to_owned()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - nullif_func(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + nullif_func(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index 1e261a9bc055..82d367072a25 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -21,7 +21,8 @@ use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{utils::take_function_args, Result}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::sync::Arc; @@ -116,12 +117,8 @@ impl ScalarUDFImpl for NVLFunc { Ok(arg_types[0].clone()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - nvl_func(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + nvl_func(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 71188441043a..d20b01e29fba 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -22,7 +22,7 @@ use arrow::datatypes::DataType; use datafusion_common::{internal_err, utils::take_function_args, Result}; use datafusion_expr::{ type_coercion::binary::comparison_coercion, ColumnarValue, Documentation, - ScalarUDFImpl, Signature, Volatility, + ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; use std::sync::Arc; @@ -95,12 +95,8 @@ impl ScalarUDFImpl for NVL2Func { Ok(arg_types[1].clone()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - nvl2_func(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + nvl2_func(&args.args) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { diff --git a/datafusion/functions/src/core/version.rs b/datafusion/functions/src/core/version.rs index 5fa8347c8787..34038022f2dc 100644 --- a/datafusion/functions/src/core/version.rs +++ b/datafusion/functions/src/core/version.rs @@ -20,7 +20,8 @@ use arrow::datatypes::DataType; use datafusion_common::{utils::take_function_args, Result, ScalarValue}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -75,12 +76,8 @@ impl ScalarUDFImpl for VersionFunc { Ok(DataType::Utf8) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - let [] = take_function_args(self.name(), args)?; + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [] = take_function_args(self.name(), args.args)?; // TODO it would be great to add rust version and arrow version, // but that requires a `build.rs` script and/or adding a version const to arrow-rs let version = 
format!( @@ -105,8 +102,13 @@ mod test { #[tokio::test] async fn test_version_udf() { let version_udf = ScalarUDF::from(VersionFunc::new()); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let version = version_udf.invoke_batch(&[], 1).unwrap(); + let version = version_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![], + number_rows: 0, + return_type: &DataType::Utf8, + }) + .unwrap(); if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(version))) = version { assert!(version.starts_with("Apache DataFusion")); From 04dc656de80b50806c1c9b02d06ed39b7717d826 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 18 Feb 2025 04:35:44 -0800 Subject: [PATCH 05/71] Fix off by 1 in decimal cast to lower precision (#14731) Upgrade to arrow-rs bug fix release. --- Cargo.lock | 50 +++++++++---------- Cargo.toml | 2 +- .../sqllogictest/test_files/decimal.slt | 7 +++ 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc8b2943b246..4e110789bfda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,9 +246,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" dependencies = [ "arrow-arith", "arrow-array", @@ -270,9 +270,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -312,9 +312,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" dependencies = [ "arrow-array", "arrow-cast", @@ -402,9 +402,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" dependencies = [ "arrow-array", "arrow-buffer", @@ -422,9 +422,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -435,9 +435,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -457,9 +457,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -471,9 +471,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" dependencies = [ "arrow-array", "arrow-buffer", @@ -2573,7 +2573,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3470,7 +3470,7 @@ checksum = "e19b23d53f35ce9f56aebc7d1bb4e6ac1e9c0db7ac85c8d1760c04379edced37" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4639,7 +4639,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5061,7 +5061,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5624,7 +5624,7 @@ dependencies = [ "cfg-if", "libc", "psm", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5859,7 +5859,7 @@ dependencies = [ "getrandom 0.3.1", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6734,7 +6734,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4fcc13144243..3aacf47508da 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,7 +78,7 @@ version = "45.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "54.1.0", features = [ +arrow = { version = "54.2.0", features = [ "prettyprint", "chrono-tz", ] } diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt index f082a79c5508..089910785ad9 100644 --- a/datafusion/sqllogictest/test_files/decimal.slt +++ b/datafusion/sqllogictest/test_files/decimal.slt @@ -740,3 +740,10 @@ query R SELECT CAST('0' AS decimal(38,0)); ---- 0 + +query RR +SELECT + cast(cast('0' as decimal(3,0)) as decimal(2,0)), + cast(cast('5.20' as decimal(4,2)) as decimal(3,2)) +---- +0 5.2 From 2be19e5d22bbe2dea98084079b61266bfbdc0daf Mon Sep 17 00:00:00 2001 From: zjregee Date: Tue, 18 Feb 2025 20:36:37 +0800 Subject: [PATCH 06/71] migrate string functions to `inovke_with_args` (#14722) * migrate string functions to inovke_with_args * move clone of args in bench out of black_box * modify obsolete calls in to_hex bench --- datafusion/functions/benches/concat.rs | 15 ++++- datafusion/functions/benches/lower.rs | 51 ++++++++++++---- 
datafusion/functions/benches/ltrim.rs | 11 +++- datafusion/functions/benches/repeat.rs | 59 ++++++++++++++----- datafusion/functions/benches/to_hex.rs | 32 +++++++--- datafusion/functions/benches/upper.rs | 11 +++- datafusion/functions/benches/uuid.rs | 10 +++- datafusion/functions/src/string/ascii.rs | 10 +--- datafusion/functions/src/string/bit_length.rs | 10 +--- datafusion/functions/src/string/btrim.rs | 15 ++--- datafusion/functions/src/string/chr.rs | 10 +--- datafusion/functions/src/string/concat.rs | 22 +++---- datafusion/functions/src/string/concat_ws.rs | 32 ++++++---- datafusion/functions/src/string/contains.rs | 24 ++++---- datafusion/functions/src/string/ends_with.rs | 12 ++-- .../functions/src/string/levenshtein.rs | 16 +++-- datafusion/functions/src/string/lower.rs | 22 +++---- datafusion/functions/src/string/ltrim.rs | 14 ++--- .../functions/src/string/octet_length.rs | 10 +--- datafusion/functions/src/string/overlay.rs | 16 +++-- datafusion/functions/src/string/repeat.rs | 10 +--- datafusion/functions/src/string/replace.rs | 18 +++--- datafusion/functions/src/string/rtrim.rs | 14 ++--- datafusion/functions/src/string/split_part.rs | 10 ++-- .../functions/src/string/starts_with.rs | 12 ++-- datafusion/functions/src/string/to_hex.rs | 18 +++--- datafusion/functions/src/string/upper.rs | 22 +++---- datafusion/functions/src/string/uuid.rs | 18 +++--- 28 files changed, 293 insertions(+), 231 deletions(-) diff --git a/datafusion/functions/benches/concat.rs b/datafusion/functions/benches/concat.rs index 0f287ab36dad..45ca076e754f 100644 --- a/datafusion/functions/benches/concat.rs +++ b/datafusion/functions/benches/concat.rs @@ -16,10 +16,11 @@ // under the License. use arrow::array::ArrayRef; +use arrow::datatypes::DataType; use arrow::util::bench_util::create_string_array_with_len; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use datafusion_common::ScalarValue; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string::concat; use std::sync::Arc; @@ -39,8 +40,16 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("concat function"); group.bench_function(BenchmarkId::new("concat", size), |b| { b.iter(|| { - // TODO use invoke_with_args - criterion::black_box(concat().invoke_batch(&args, size).unwrap()) + let args_cloned = args.clone(); + criterion::black_box( + concat() + .invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + }) + .unwrap(), + ) }) }); group.finish(); diff --git a/datafusion/functions/benches/lower.rs b/datafusion/functions/benches/lower.rs index 114ac4a16fe5..534e5739225d 100644 --- a/datafusion/functions/benches/lower.rs +++ b/datafusion/functions/benches/lower.rs @@ -18,11 +18,12 @@ extern crate criterion; use arrow::array::{ArrayRef, StringArray, StringViewBuilder}; +use arrow::datatypes::DataType; use arrow::util::bench_util::{ create_string_array_with_len, create_string_view_array_with_len, }; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; use std::sync::Arc; @@ -125,8 +126,12 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args1(size, 32); c.bench_function(&format!("lower_all_values_are_ascii: {}", size), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + 
let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }) }); @@ -135,8 +140,12 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("lower_the_first_value_is_nonascii: {}", size), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }) }, ); @@ -146,8 +155,12 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("lower_the_middle_value_is_nonascii: {}", size), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }) }, ); @@ -167,8 +180,12 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("lower_all_values_are_ascii_string_views: size: {}, str_len: {}, null_density: {}, mixed: {}", size, str_len, null_density, mixed), |b| b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs{ + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }), ); @@ -177,8 +194,12 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("lower_all_values_are_ascii_string_views: size: {}, str_len: {}, null_density: {}, mixed: {}", size, str_len, null_density, mixed), |b| b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs{ + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }), ); @@ -187,8 +208,12 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("lower_some_values_are_nonascii_string_views: size: {}, str_len: {}, non_ascii_density: {}, null_density: {}, mixed: {}", size, str_len, 0.1, null_density, mixed), |b| b.iter(|| { - // TODO use invoke_with_args - black_box(lower.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(lower.invoke_with_args(ScalarFunctionArgs{ + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }), ); } diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index fed455eeac91..457fb499f5a1 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -18,12 +18,13 @@ extern crate criterion; use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}; +use arrow::datatypes::DataType; use criterion::{ black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, Criterion, SamplingMode, }; use datafusion_common::ScalarValue; -use datafusion_expr::{ColumnarValue, ScalarUDF}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF}; use datafusion_functions::string; use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; use std::{fmt, sync::Arc}; @@ -141,8 +142,12 @@ fn run_with_string_type( ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(ltrim.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(ltrim.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }) }, ); diff --git 
a/datafusion/functions/benches/repeat.rs b/datafusion/functions/benches/repeat.rs index 71207a0548fa..5cc6a177d9d9 100644 --- a/datafusion/functions/benches/repeat.rs +++ b/datafusion/functions/benches/repeat.rs @@ -18,11 +18,12 @@ extern crate criterion; use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::datatypes::DataType; use arrow::util::bench_util::{ create_string_array_with_len, create_string_view_array_with_len, }; use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; use std::sync::Arc; use std::time::Duration; @@ -73,8 +74,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, repeat_times as usize)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -87,8 +92,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, repeat_times as usize)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -101,8 +110,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, repeat_times as usize)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -124,8 +137,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, repeat_times as usize)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -138,8 +155,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -152,8 +173,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, repeat_times as usize)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); @@ -175,8 +200,12 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(repeat.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(repeat.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: repeat_times as usize, + return_type: &DataType::Utf8, + })) }) }, ); diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs index ce3767cc4839..a45d936c0a52 100644 --- a/datafusion/functions/benches/to_hex.rs +++ b/datafusion/functions/benches/to_hex.rs @@ -17,12 +17,10 @@ 
extern crate criterion; -use arrow::{ - datatypes::{Int32Type, Int64Type}, - util::bench_util::create_primitive_array, -}; +use arrow::datatypes::{DataType, Int32Type, Int64Type}; +use arrow::util::bench_util::create_primitive_array; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; use std::sync::Arc; @@ -33,13 +31,33 @@ fn criterion_benchmark(c: &mut Criterion) { let batch_len = i32_array.len(); let i32_args = vec![ColumnarValue::Array(i32_array)]; c.bench_function(&format!("to_hex i32 array: {}", size), |b| { - b.iter(|| black_box(hex.invoke_batch(&i32_args, batch_len).unwrap())) + b.iter(|| { + let args_cloned = i32_args.clone(); + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: batch_len, + return_type: &DataType::Utf8, + }) + .unwrap(), + ) + }) }); let i64_array = Arc::new(create_primitive_array::(size, 0.2)); let batch_len = i64_array.len(); let i64_args = vec![ColumnarValue::Array(i64_array)]; c.bench_function(&format!("to_hex i64 array: {}", size), |b| { - b.iter(|| black_box(hex.invoke_batch(&i64_args, batch_len).unwrap())) + b.iter(|| { + let args_cloned = i64_args.clone(); + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: batch_len, + return_type: &DataType::Utf8, + }) + .unwrap(), + ) + }) }); } diff --git a/datafusion/functions/benches/upper.rs b/datafusion/functions/benches/upper.rs index 9b41a15b11c7..f0bee89c7d37 100644 --- a/datafusion/functions/benches/upper.rs +++ b/datafusion/functions/benches/upper.rs @@ -17,9 +17,10 @@ extern crate criterion; +use arrow::datatypes::DataType; use arrow::util::bench_util::create_string_array_with_len; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; use std::sync::Arc; @@ -38,8 +39,12 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args(size, 32); c.bench_function("upper_all_values_are_ascii", |b| { b.iter(|| { - // TODO use invoke_with_args - black_box(upper.invoke_batch(&args, size)) + let args_cloned = args.clone(); + black_box(upper.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + number_rows: size, + return_type: &DataType::Utf8, + })) }) }); } diff --git a/datafusion/functions/benches/uuid.rs b/datafusion/functions/benches/uuid.rs index 95cf77de3190..7b8d156fec21 100644 --- a/datafusion/functions/benches/uuid.rs +++ b/datafusion/functions/benches/uuid.rs @@ -17,13 +17,21 @@ extern crate criterion; +use arrow::datatypes::DataType; use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ScalarFunctionArgs; use datafusion_functions::string; fn criterion_benchmark(c: &mut Criterion) { let uuid = string::uuid(); c.bench_function("uuid", |b| { - b.iter(|| black_box(uuid.invoke_batch(&[], 1024))) + b.iter(|| { + black_box(uuid.invoke_with_args(ScalarFunctionArgs { + args: vec![], + number_rows: 1024, + return_type: &DataType::Utf8, + })) + }) }); } diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 3832ad2a341d..006492a0e07a 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -22,7 +22,7 @@ use arrow::error::ArrowError; use datafusion_common::types::logical_string; use 
datafusion_common::{internal_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, TypeSignatureClass}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_expr_common::signature::Coercion; use datafusion_macros::user_doc; use std::any::Any; @@ -92,12 +92,8 @@ impl ScalarUDFImpl for AsciiFunc { Ok(Int32) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - make_scalar_function(ascii, vec![])(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(ascii, vec![])(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs index f7e9fce960fe..2a782c59963e 100644 --- a/datafusion/functions/src/string/bit_length.rs +++ b/datafusion/functions/src/string/bit_length.rs @@ -22,7 +22,7 @@ use std::any::Any; use crate::utils::utf8_to_int_type; use datafusion_common::{utils::take_function_args, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -77,12 +77,8 @@ impl ScalarUDFImpl for BitLengthFunc { utf8_to_int_type(&arg_types[0], "bit_length") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - let [array] = take_function_args(self.name(), args)?; + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [array] = take_function_args(self.name(), &args.args)?; match array { ColumnarValue::Array(v) => Ok(ColumnarValue::Array(bit_length(v.as_ref())?)), diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 05a2f646e969..89bffa25698e 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -22,7 +22,8 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -101,20 +102,16 @@ impl ScalarUDFImpl for BTrimFunc { } } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8 | DataType::Utf8View => make_scalar_function( btrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), DataType::LargeUtf8 => make_scalar_function( btrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), other => exec_err!( "Unsupported data type {other:?} for function btrim,\ expected Utf8, LargeUtf8 or Utf8View." 
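The hunks in this series all make the same mechanical change: the deprecated `invoke_batch(&self, &[ColumnarValue], usize)` entry point is replaced by `invoke_with_args(&self, ScalarFunctionArgs)`, where the argument struct bundles the argument values (`args`), the row count (`number_rows`), and the resolved return type (`return_type`). A minimal before/after sketch of the method bodies as they would sit inside an `impl ScalarUDFImpl for ...` block — `MyFunc` and `my_kernel` are placeholder names for illustration, not part of any patch:

    // Before: argument values and the row count are passed separately.
    fn invoke_batch(
        &self,
        args: &[ColumnarValue],
        _number_rows: usize,
    ) -> Result<ColumnarValue> {
        my_kernel(args)
    }

    // After: everything arrives bundled in a single ScalarFunctionArgs value.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        // Functions that only need the values take `args.args`; others also
        // read `args.number_rows` or `args.return_type`, or destructure with
        // `let ScalarFunctionArgs { args, .. } = args;` as the concat,
        // concat_ws, and split_part hunks in this patch do.
        my_kernel(&args.args)
    }

Call sites build the struct explicitly, as the updated benchmarks and tests in this patch do, e.g. `ScalarFunctionArgs { args, number_rows: batch_len, return_type: &DataType::Utf8 }`.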
diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index 3530e3f22c0f..58aa7ede74c4 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -28,7 +28,7 @@ use crate::utils::make_scalar_function; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; /// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. @@ -111,12 +111,8 @@ impl ScalarUDFImpl for ChrFunc { Ok(Utf8) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - make_scalar_function(chr, vec![])(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(chr, vec![])(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 9ce732efa0c7..c47d08d579e4 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -30,7 +30,7 @@ use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -105,11 +105,9 @@ impl ScalarUDFImpl for ConcatFunc { /// Concatenates the text representations of all the arguments. NULL arguments are ignored. /// concat('abcde', 2, NULL, 22) = 'abcde222' - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { args, .. 
} = args; + let mut return_datatype = DataType::Utf8; args.iter().for_each(|col| { if col.data_type() == DataType::Utf8View { @@ -169,7 +167,7 @@ impl ScalarUDFImpl for ConcatFunc { let mut data_size = 0; let mut columns = Vec::with_capacity(args.len()); - for arg in args { + for arg in &args { match arg { ColumnarValue::Scalar(ScalarValue::Utf8(maybe_value)) | ColumnarValue::Scalar(ScalarValue::LargeUtf8(maybe_value)) @@ -470,10 +468,14 @@ mod tests { None, Some("b"), ]))); - let args = &[c0, c1, c2, c3, c4]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatFunc::new().invoke_batch(args, 3)?; + let args = ScalarFunctionArgs { + args: vec![c0, c1, c2, c3, c4], + number_rows: 3, + return_type: &Utf8, + }; + + let result = ConcatFunc::new().invoke_with_args(args)?; let expected = Arc::new(StringViewArray::from(vec!["foo,x,a", "bar,,", "baz,z,b"])) as ArrayRef; diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index 026d167cccd5..c2bad206db15 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -30,7 +30,7 @@ use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -102,11 +102,9 @@ impl ScalarUDFImpl for ConcatWsFunc { /// Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored. /// concat_ws(',', 'abcde', 2, NULL, 22) = 'abcde,2,22' - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { args, .. } = args; + // do not accept 0 arguments. 
if args.len() < 2 { return exec_err!( @@ -411,7 +409,7 @@ mod tests { use crate::string::concat_ws::ConcatWsFunc; use datafusion_common::Result; use datafusion_common::ScalarValue; - use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; use crate::utils::test::test_function; @@ -482,10 +480,14 @@ mod tests { None, Some("z"), ]))); - let args = &[c0, c1, c2]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatWsFunc::new().invoke_batch(args, 3)?; + let args = ScalarFunctionArgs { + args: vec![c0, c1, c2], + number_rows: 3, + return_type: &Utf8, + }; + + let result = ConcatWsFunc::new().invoke_with_args(args)?; let expected = Arc::new(StringArray::from(vec!["foo,x", "bar", "baz,z"])) as ArrayRef; match &result { @@ -508,10 +510,14 @@ mod tests { Some("y"), Some("z"), ]))); - let args = &[c0, c1, c2]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatWsFunc::new().invoke_batch(args, 3)?; + let args = ScalarFunctionArgs { + args: vec![c0, c1, c2], + number_rows: 3, + return_type: &Utf8, + }; + + let result = ConcatWsFunc::new().invoke_with_args(args)?; let expected = Arc::new(StringArray::from(vec![Some("foo,x"), None, Some("baz+z")])) as ArrayRef; diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 36871f0c3282..77774cdb5e1d 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -24,7 +24,8 @@ use datafusion_common::exec_err; use datafusion_common::DataFusionError; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -81,12 +82,8 @@ impl ScalarUDFImpl for ContainsFunc { Ok(Boolean) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - make_scalar_function(contains, vec![])(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(contains, vec![])(&args.args) } fn documentation(&self) -> Option<&Documentation> { @@ -125,8 +122,9 @@ pub fn contains(args: &[ArrayRef]) -> Result { mod test { use super::ContainsFunc; use arrow::array::{BooleanArray, StringArray}; + use arrow::datatypes::DataType; use datafusion_common::ScalarValue; - use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; use std::sync::Arc; #[test] @@ -137,8 +135,14 @@ mod test { Some("yyy?()"), ]))); let scalar = ColumnarValue::Scalar(ScalarValue::Utf8(Some("x?(".to_string()))); - #[allow(deprecated)] // TODO migrate UDF to invoke - let actual = udf.invoke_batch(&[array, scalar], 2).unwrap(); + + let args = ScalarFunctionArgs { + args: vec![array, scalar], + number_rows: 2, + return_type: &DataType::Boolean, + }; + + let actual = udf.invoke_with_args(args).unwrap(); let expect = ColumnarValue::Array(Arc::new(BooleanArray::from(vec![ Some(true), Some(false), diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs index 0a77ec9ebd2c..5cca79de14ff 100644 --- a/datafusion/functions/src/string/ends_with.rs +++ b/datafusion/functions/src/string/ends_with.rs @@ -24,7 +24,7 @@ use arrow::datatypes::DataType; use crate::utils::make_scalar_function; use 
datafusion_common::{internal_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -84,14 +84,10 @@ impl ScalarUDFImpl for EndsWithFunc { Ok(DataType::Boolean) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { - make_scalar_function(ends_with, vec![])(args) + make_scalar_function(ends_with, vec![])(&args.args) } other => { internal_err!("Unsupported data type {other:?} for function ends_with. Expected Utf8, LargeUtf8 or Utf8View")? diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index c2e5dc52f82f..a19fcc5b476c 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -26,7 +26,7 @@ use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, utils::take_function_args, Result}; use datafusion_expr::{ColumnarValue, Documentation}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; #[user_doc( @@ -86,16 +86,14 @@ impl ScalarUDFImpl for LevenshteinFunc { utf8_to_int_type(&arg_types[0], "levenshtein") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8View | DataType::Utf8 => { - make_scalar_function(levenshtein::, vec![])(args) + make_scalar_function(levenshtein::, vec![])(&args.args) + } + DataType::LargeUtf8 => { + make_scalar_function(levenshtein::, vec![])(&args.args) } - DataType::LargeUtf8 => make_scalar_function(levenshtein::, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function levenshtein") } diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index e90c3804b1ee..375717e23d6d 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -22,7 +22,7 @@ use crate::string::common::to_lower; use crate::utils::utf8_to_str_type; use datafusion_common::Result; use datafusion_expr::{ColumnarValue, Documentation}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; #[user_doc( @@ -77,12 +77,8 @@ impl ScalarUDFImpl for LowerFunc { utf8_to_str_type(&arg_types[0], "lower") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - to_lower(args, "lower") + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + to_lower(&args.args, "lower") } fn documentation(&self) -> Option<&Documentation> { @@ -98,10 +94,14 @@ mod tests { fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> { let func = LowerFunc::new(); - let batch_len = input.len(); - let args = vec![ColumnarValue::Array(input)]; - #[allow(deprecated)] // TODO migrate UDF to invoke - let result = 
match func.invoke_batch(&args, batch_len)? { + + let args = ScalarFunctionArgs { + number_rows: input.len(), + args: vec![ColumnarValue::Array(input)], + return_type: &DataType::Utf8, + }; + + let result = match func.invoke_with_args(args)? { ColumnarValue::Array(result) => result, _ => unreachable!("lower"), }; diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 0bc62ee5000d..75c4ff25b7df 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -24,7 +24,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. @@ -104,20 +104,16 @@ impl ScalarUDFImpl for LtrimFunc { } } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8 | DataType::Utf8View => make_scalar_function( ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), DataType::LargeUtf8 => make_scalar_function( ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), other => exec_err!( "Unsupported data type {other:?} for function ltrim,\ expected Utf8, LargeUtf8 or Utf8View." diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 7e0187c0b1be..46175c96cdc6 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -22,7 +22,7 @@ use std::any::Any; use crate::utils::utf8_to_int_type; use datafusion_common::{utils::take_function_args, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -77,12 +77,8 @@ impl ScalarUDFImpl for OctetLengthFunc { utf8_to_int_type(&arg_types[0], "octet_length") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - let [array] = take_function_args(self.name(), args)?; + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [array] = take_function_args(self.name(), &args.args)?; match array { ColumnarValue::Array(v) => Ok(ColumnarValue::Array(length(v.as_ref())?)), diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index 3389da0968f7..0ea5359e9621 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -27,7 +27,7 @@ use datafusion_common::cast::{ }; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -100,16 +100,14 @@ impl ScalarUDFImpl for OverlayFunc { utf8_to_str_type(&arg_types[0], "overlay") } - fn invoke_batch( - &self, - args: 
&[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8View | DataType::Utf8 => { - make_scalar_function(overlay::, vec![])(args) + make_scalar_function(overlay::, vec![])(&args.args) + } + DataType::LargeUtf8 => { + make_scalar_function(overlay::, vec![])(&args.args) } - DataType::LargeUtf8 => make_scalar_function(overlay::, vec![])(args), other => exec_err!("Unsupported data type {other:?} for function overlay"), } } diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 8fdbc3dd296f..2d36cb8356a0 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -29,7 +29,7 @@ use datafusion_common::cast::as_int64_array; use datafusion_common::types::{logical_int64, logical_string, NativeType}; use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -98,12 +98,8 @@ impl ScalarUDFImpl for RepeatFunc { utf8_to_str_type(&arg_types[0], "repeat") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - make_scalar_function(repeat, vec![])(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(repeat, vec![])(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 9b6afc546994..a3488b561fd2 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -25,7 +25,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( doc_section(label = "String Functions"), @@ -82,15 +82,13 @@ impl ScalarUDFImpl for ReplaceFunc { utf8_to_str_type(&arg_types[0], "replace") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { - DataType::Utf8 => make_scalar_function(replace::, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(replace::, vec![])(args), - DataType::Utf8View => make_scalar_function(replace_view, vec![])(args), + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { + DataType::Utf8 => make_scalar_function(replace::, vec![])(&args.args), + DataType::LargeUtf8 => { + make_scalar_function(replace::, vec![])(&args.args) + } + DataType::Utf8View => make_scalar_function(replace_view, vec![])(&args.args), other => { exec_err!("Unsupported data type {other:?} for function replace") } diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 3fb208bb7198..71c4286150e5 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -24,7 +24,7 @@ use 
crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. @@ -104,20 +104,16 @@ impl ScalarUDFImpl for RtrimFunc { } } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8 | DataType::Utf8View => make_scalar_function( rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), DataType::LargeUtf8 => make_scalar_function( rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], - )(args), + )(&args.args), other => exec_err!( "Unsupported data type {other:?} for function rtrim,\ expected Utf8, LargeUtf8 or Utf8View." diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index a597e1be5d02..724d9c278cca 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -26,7 +26,7 @@ use datafusion_common::cast::as_int64_array; use datafusion_common::ScalarValue; use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; use std::any::Any; use std::sync::Arc; @@ -97,11 +97,9 @@ impl ScalarUDFImpl for SplitPartFunc { utf8_to_str_type(&arg_types[0], "split_part") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { args, .. } = args; + // First, determine if any of the arguments is an Array let len = args.iter().find_map(|arg| match arg { ColumnarValue::Array(a) => Some(a.len()), diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 74d0fbdc4033..f1344780eb4c 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -25,7 +25,7 @@ use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Documentation, Expr, Like}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; /// Returns true if string starts with prefix. 
@@ -86,14 +86,10 @@ impl ScalarUDFImpl for StartsWithFunc { Ok(DataType::Boolean) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { - make_scalar_function(starts_with, vec![])(args) + make_scalar_function(starts_with, vec![])(&args.args) } _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?, } diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index 5c7c92cc34ed..a3a1acfcf1f0 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -30,7 +30,7 @@ use datafusion_common::Result; use datafusion_common::{exec_err, plan_err}; use datafusion_expr::{ColumnarValue, Documentation}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; /// Converts the number to its equivalent hexadecimal representation. @@ -127,14 +127,14 @@ impl ScalarUDFImpl for ToHexFunc { }) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - match args[0].data_type() { - DataType::Int32 => make_scalar_function(to_hex::, vec![])(args), - DataType::Int64 => make_scalar_function(to_hex::, vec![])(args), + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match args.args[0].data_type() { + DataType::Int32 => { + make_scalar_function(to_hex::, vec![])(&args.args) + } + DataType::Int64 => { + make_scalar_function(to_hex::, vec![])(&args.args) + } other => exec_err!("Unsupported data type {other:?} for function to_hex"), } } diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index 7bab33e68a4d..d27b54d29bc6 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -20,7 +20,7 @@ use crate::utils::utf8_to_str_type; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ColumnarValue, Documentation}; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; @@ -76,12 +76,8 @@ impl ScalarUDFImpl for UpperFunc { utf8_to_str_type(&arg_types[0], "upper") } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - to_upper(args, "upper") + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + to_upper(&args.args, "upper") } fn documentation(&self) -> Option<&Documentation> { @@ -97,10 +93,14 @@ mod tests { fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> { let func = UpperFunc::new(); - let batch_len = input.len(); - let args = vec![ColumnarValue::Array(input)]; - #[allow(deprecated)] // TODO migrate UDF to invoke - let result = match func.invoke_batch(&args, batch_len)? { + + let args = ScalarFunctionArgs { + number_rows: input.len(), + args: vec![ColumnarValue::Array(input)], + return_type: &DataType::Utf8, + }; + + let result = match func.invoke_with_args(args)? 
{ ColumnarValue::Array(result) => result, _ => unreachable!("upper"), }; diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs index 64065c26b7d4..d1f43d548066 100644 --- a/datafusion/functions/src/string/uuid.rs +++ b/datafusion/functions/src/string/uuid.rs @@ -26,7 +26,7 @@ use uuid::Uuid; use datafusion_common::{internal_err, Result}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; #[user_doc( @@ -80,22 +80,20 @@ impl ScalarUDFImpl for UuidFunc { /// Prints random (v4) uuid values per row /// uuid() = 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11' - fn invoke_batch( - &self, - args: &[ColumnarValue], - num_rows: usize, - ) -> Result { - if !args.is_empty() { + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + if !args.args.is_empty() { return internal_err!("{} function does not accept arguments", self.name()); } // Generate random u128 values let mut rng = rand::thread_rng(); - let mut randoms = vec![0u128; num_rows]; + let mut randoms = vec![0u128; args.number_rows]; rng.fill(&mut randoms[..]); - let mut builder = - GenericStringBuilder::::with_capacity(num_rows, num_rows * 36); + let mut builder = GenericStringBuilder::::with_capacity( + args.number_rows, + args.number_rows * 36, + ); let mut buffer = [0u8; 36]; for x in &mut randoms { From 873b5f70e3292bcad4e92630a6fa10870d716984 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 18 Feb 2025 07:48:02 -0500 Subject: [PATCH 07/71] Specify rust toolchain explicitly, document how to change it (#14655) --- docs/source/contributor-guide/howtos.md | 6 ++++++ rust-toolchain.toml | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 rust-toolchain.toml diff --git a/docs/source/contributor-guide/howtos.md b/docs/source/contributor-guide/howtos.md index e406804caa44..556242751ff4 100644 --- a/docs/source/contributor-guide/howtos.md +++ b/docs/source/contributor-guide/howtos.md @@ -19,6 +19,12 @@ # HOWTOs +## How to update the version of Rust used in CI tests + +- Make a PR to update the [rust-toolchain] file in the root of the repository: + +[rust-toolchain]: https://github.com/apache/datafusion/blob/main/rust-toolchain.toml + ## How to add a new scalar function Below is a checklist of what you need to do to add a new scalar function to DataFusion: diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 000000000000..bd764d201018 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file specifies the default version of Rust used +# to compile this workspace and run CI jobs. 
+ +[toolchain] +channel = "1.84.1" +components = ["rustfmt", "clippy"] From 45d9820aa7d858adc48ddcf88feb240cbb639fc6 Mon Sep 17 00:00:00 2001 From: irenjj Date: Tue, 18 Feb 2025 21:36:16 +0800 Subject: [PATCH 08/71] chore: Migrate Array Functions to invoke_with_args (#14726) * chore: Migrate Array Functions to invoke_with_args * fix clippy * fix issues * fix tests * fix --- datafusion/functions-nested/src/array_has.rs | 29 ++++++++------- .../functions-nested/src/cardinality.rs | 7 ++-- datafusion/functions-nested/src/concat.rs | 21 +++++------ datafusion/functions-nested/src/dimension.rs | 14 ++++---- datafusion/functions-nested/src/distance.rs | 7 ++-- datafusion/functions-nested/src/empty.rs | 7 ++-- datafusion/functions-nested/src/except.rs | 7 ++-- datafusion/functions-nested/src/extract.rs | 36 +++++++++---------- datafusion/functions-nested/src/flatten.rs | 7 ++-- datafusion/functions-nested/src/length.rs | 7 ++-- datafusion/functions-nested/src/make_array.rs | 7 ++-- datafusion/functions-nested/src/map.rs | 7 ++-- .../functions-nested/src/map_extract.rs | 7 ++-- datafusion/functions-nested/src/map_keys.rs | 7 ++-- datafusion/functions-nested/src/map_values.rs | 7 ++-- datafusion/functions-nested/src/position.rs | 14 ++++---- datafusion/functions-nested/src/range.rs | 14 ++++---- datafusion/functions-nested/src/remove.rs | 21 +++++------ datafusion/functions-nested/src/repeat.rs | 7 ++-- datafusion/functions-nested/src/replace.rs | 21 +++++------ datafusion/functions-nested/src/resize.rs | 7 ++-- datafusion/functions-nested/src/reverse.rs | 7 ++-- datafusion/functions-nested/src/set_ops.rs | 21 +++++------ datafusion/functions-nested/src/sort.rs | 7 ++-- datafusion/functions-nested/src/string.rs | 13 ++++--- 25 files changed, 137 insertions(+), 172 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 5c694600b822..5a29cf962817 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -25,6 +25,7 @@ use arrow::datatypes::DataType; use arrow::row::{RowConverter, Rows, SortField}; use datafusion_common::cast::as_generic_list_array; use datafusion_common::utils::string_utils::string_array_to_vec; +use datafusion_common::utils::take_function_args; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, @@ -120,15 +121,15 @@ impl ScalarUDFImpl for ArrayHas { Ok(DataType::Boolean) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - match &args[1] { + let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?; + match &second_arg { ColumnarValue::Array(array_needle) => { // the needle is already an array, convert the haystack to an array of the same length - let haystack = args[0].to_array(array_needle.len())?; + let haystack = first_arg.to_array(array_needle.len())?; let array = array_has_inner_for_array(&haystack, array_needle)?; Ok(ColumnarValue::Array(array)) } @@ -140,11 +141,11 @@ impl ScalarUDFImpl for ArrayHas { } // since the needle is a scalar, convert it to an array of size 1 - let haystack = args[0].to_array(1)?; + let haystack = first_arg.to_array(1)?; let needle = scalar_needle.to_array_of_size(1)?; let needle = Scalar::new(needle); let array = array_has_inner_for_scalar(&haystack, &needle)?; - if let ColumnarValue::Scalar(_) = &args[0] { + 
if let ColumnarValue::Scalar(_) = &first_arg { // If both inputs are scalar, keeps output as scalar let scalar_value = ScalarValue::try_from_array(&array, 0)?; Ok(ColumnarValue::Scalar(scalar_value)) @@ -332,12 +333,11 @@ impl ScalarUDFImpl for ArrayHasAll { Ok(DataType::Boolean) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_has_all_inner)(args) + make_scalar_function(array_has_all_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -407,12 +407,11 @@ impl ScalarUDFImpl for ArrayHasAny { Ok(DataType::Boolean) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_has_any_inner)(args) + make_scalar_function(array_has_any_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/cardinality.rs b/datafusion/functions-nested/src/cardinality.rs index 886709779917..f2f23841586c 100644 --- a/datafusion/functions-nested/src/cardinality.rs +++ b/datafusion/functions-nested/src/cardinality.rs @@ -112,12 +112,11 @@ impl ScalarUDFImpl for Cardinality { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(cardinality_inner)(args) + make_scalar_function(cardinality_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/concat.rs b/datafusion/functions-nested/src/concat.rs index 17fb1a3731df..f4b9208e5c83 100644 --- a/datafusion/functions-nested/src/concat.rs +++ b/datafusion/functions-nested/src/concat.rs @@ -109,12 +109,11 @@ impl ScalarUDFImpl for ArrayAppend { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_append_inner)(args) + make_scalar_function(array_append_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -205,12 +204,11 @@ impl ScalarUDFImpl for ArrayPrepend { Ok(arg_types[1].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_prepend_inner)(args) + make_scalar_function(array_prepend_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -324,12 +322,11 @@ impl ScalarUDFImpl for ArrayConcat { Ok(expr_type) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_concat_inner)(args) + make_scalar_function(array_concat_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/dimension.rs b/datafusion/functions-nested/src/dimension.rs index dc1547b7b437..a7d033641413 100644 --- a/datafusion/functions-nested/src/dimension.rs +++ b/datafusion/functions-nested/src/dimension.rs @@ -106,12 +106,11 @@ impl ScalarUDFImpl for ArrayDims { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_dims_inner)(args) + make_scalar_function(array_dims_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ 
-184,12 +183,11 @@ impl ScalarUDFImpl for ArrayNdims { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_ndims_inner)(args) + make_scalar_function(array_ndims_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/distance.rs b/datafusion/functions-nested/src/distance.rs index fc33828078c0..cfc7fccdd70c 100644 --- a/datafusion/functions-nested/src/distance.rs +++ b/datafusion/functions-nested/src/distance.rs @@ -124,12 +124,11 @@ impl ScalarUDFImpl for ArrayDistance { Ok(result) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_distance_inner)(args) + make_scalar_function(array_distance_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/empty.rs b/datafusion/functions-nested/src/empty.rs index 07e5d41b8023..dcefd583e937 100644 --- a/datafusion/functions-nested/src/empty.rs +++ b/datafusion/functions-nested/src/empty.rs @@ -98,12 +98,11 @@ impl ScalarUDFImpl for ArrayEmpty { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_empty_inner)(args) + make_scalar_function(array_empty_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/except.rs b/datafusion/functions-nested/src/except.rs index f7958caa6379..2385f6d12d43 100644 --- a/datafusion/functions-nested/src/except.rs +++ b/datafusion/functions-nested/src/except.rs @@ -106,12 +106,11 @@ impl ScalarUDFImpl for ArrayExcept { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_except_inner)(args) + make_scalar_function(array_except_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 6bf4d16db636..422b1b612850 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -172,12 +172,11 @@ impl ScalarUDFImpl for ArrayElement { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_element_inner)(args) + make_scalar_function(array_element_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -395,12 +394,11 @@ impl ScalarUDFImpl for ArraySlice { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_slice_inner)(args) + make_scalar_function(array_slice_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -704,12 +702,11 @@ impl ScalarUDFImpl for ArrayPopFront { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_pop_front_inner)(args) + make_scalar_function(array_pop_front_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -812,12 +809,11 @@ impl 
ScalarUDFImpl for ArrayPopBack { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_pop_back_inner)(args) + make_scalar_function(array_pop_back_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -918,13 +914,13 @@ impl ScalarUDFImpl for ArrayAnyValue { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_any_value_inner)(args) + make_scalar_function(array_any_value_inner)(&args.args) } + fn aliases(&self) -> &[String] { &self.aliases } diff --git a/datafusion/functions-nested/src/flatten.rs b/datafusion/functions-nested/src/flatten.rs index 0003db38e0e4..f288035948dc 100644 --- a/datafusion/functions-nested/src/flatten.rs +++ b/datafusion/functions-nested/src/flatten.rs @@ -124,12 +124,11 @@ impl ScalarUDFImpl for Flatten { Ok(data_type) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(flatten_inner)(args) + make_scalar_function(flatten_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/length.rs b/datafusion/functions-nested/src/length.rs index a8190a610844..3c3a42da0d69 100644 --- a/datafusion/functions-nested/src/length.rs +++ b/datafusion/functions-nested/src/length.rs @@ -103,12 +103,11 @@ impl ScalarUDFImpl for ArrayLength { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_length_inner)(args) + make_scalar_function(array_length_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index 7c41238af5c4..4daaafc5a888 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -117,12 +117,11 @@ impl ScalarUDFImpl for MakeArray { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(make_array_inner)(args) + make_scalar_function(make_array_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/map.rs b/datafusion/functions-nested/src/map.rs index 67ff9182517e..828f2e244112 100644 --- a/datafusion/functions-nested/src/map.rs +++ b/datafusion/functions-nested/src/map.rs @@ -273,12 +273,11 @@ impl ScalarUDFImpl for MapFunc { )) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_map_batch(args) + make_map_batch(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions-nested/src/map_extract.rs b/datafusion/functions-nested/src/map_extract.rs index ddc12482e380..55ab8447c54f 100644 --- a/datafusion/functions-nested/src/map_extract.rs +++ b/datafusion/functions-nested/src/map_extract.rs @@ -110,12 +110,11 @@ impl ScalarUDFImpl for MapExtract { )))) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> 
Result { - make_scalar_function(map_extract_inner)(args) + make_scalar_function(map_extract_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/map_keys.rs b/datafusion/functions-nested/src/map_keys.rs index c58624e12c60..0f15c06d86d1 100644 --- a/datafusion/functions-nested/src/map_keys.rs +++ b/datafusion/functions-nested/src/map_keys.rs @@ -100,12 +100,11 @@ impl ScalarUDFImpl for MapKeysFunc { )))) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(map_keys_inner)(args) + make_scalar_function(map_keys_inner)(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions-nested/src/map_values.rs b/datafusion/functions-nested/src/map_values.rs index d4a67b7f67a7..f82e4bfa1a89 100644 --- a/datafusion/functions-nested/src/map_values.rs +++ b/datafusion/functions-nested/src/map_values.rs @@ -100,12 +100,11 @@ impl ScalarUDFImpl for MapValuesFunc { )))) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(map_values_inner)(args) + make_scalar_function(map_values_inner)(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs index 9adb174c4f2f..b186b65407c3 100644 --- a/datafusion/functions-nested/src/position.rs +++ b/datafusion/functions-nested/src/position.rs @@ -120,12 +120,11 @@ impl ScalarUDFImpl for ArrayPosition { Ok(UInt64) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_position_inner)(args) + make_scalar_function(array_position_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -274,12 +273,11 @@ impl ScalarUDFImpl for ArrayPositions { Ok(List(Arc::new(Field::new_list_field(UInt64, true)))) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_positions_inner)(args) + make_scalar_function(array_positions_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index dcf5f33ea2c2..637a78d158ab 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -155,11 +155,12 @@ impl ScalarUDFImpl for Range { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; + if args.iter().any(|arg| arg.data_type().is_null()) { return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); } @@ -278,11 +279,12 @@ impl ScalarUDFImpl for GenSeries { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; + if args.iter().any(|arg| arg.data_type().is_null()) { return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); } diff --git a/datafusion/functions-nested/src/remove.rs b/datafusion/functions-nested/src/remove.rs index f9539dbc1621..7f5baa18e769 100644 --- 
a/datafusion/functions-nested/src/remove.rs +++ b/datafusion/functions-nested/src/remove.rs @@ -101,12 +101,11 @@ impl ScalarUDFImpl for ArrayRemove { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_remove_inner)(args) + make_scalar_function(array_remove_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -180,12 +179,11 @@ impl ScalarUDFImpl for ArrayRemoveN { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_remove_n_inner)(args) + make_scalar_function(array_remove_n_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -258,12 +256,11 @@ impl ScalarUDFImpl for ArrayRemoveAll { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_remove_all_inner)(args) + make_scalar_function(array_remove_all_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/repeat.rs b/datafusion/functions-nested/src/repeat.rs index 16d7c1912f6d..26d67ad3113f 100644 --- a/datafusion/functions-nested/src/repeat.rs +++ b/datafusion/functions-nested/src/repeat.rs @@ -115,12 +115,11 @@ impl ScalarUDFImpl for ArrayRepeat { )))) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_repeat_inner)(args) + make_scalar_function(array_repeat_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs index 6d84e64cba4d..71bfedb72d1c 100644 --- a/datafusion/functions-nested/src/replace.rs +++ b/datafusion/functions-nested/src/replace.rs @@ -128,12 +128,11 @@ impl ScalarUDFImpl for ArrayReplace { Ok(args[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_replace_inner)(args) + make_scalar_function(array_replace_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -210,12 +209,11 @@ impl ScalarUDFImpl for ArrayReplaceN { Ok(args[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_replace_n_inner)(args) + make_scalar_function(array_replace_n_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -290,12 +288,11 @@ impl ScalarUDFImpl for ArrayReplaceAll { Ok(args[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_replace_all_inner)(args) + make_scalar_function(array_replace_all_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/resize.rs b/datafusion/functions-nested/src/resize.rs index 83d04ec1b7ed..6c0b91a678e7 100644 --- a/datafusion/functions-nested/src/resize.rs +++ b/datafusion/functions-nested/src/resize.rs @@ -112,12 +112,11 @@ impl ScalarUDFImpl for ArrayResize { } } - fn 
invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_resize_inner)(args) + make_scalar_function(array_resize_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs index b7c4274ca436..140cd19aeff9 100644 --- a/datafusion/functions-nested/src/reverse.rs +++ b/datafusion/functions-nested/src/reverse.rs @@ -96,12 +96,11 @@ impl ScalarUDFImpl for ArrayReverse { Ok(arg_types[0].clone()) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_reverse_inner)(args) + make_scalar_function(array_reverse_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/set_ops.rs b/datafusion/functions-nested/src/set_ops.rs index 97ccc035a046..a67945b1f1e1 100644 --- a/datafusion/functions-nested/src/set_ops.rs +++ b/datafusion/functions-nested/src/set_ops.rs @@ -131,12 +131,11 @@ impl ScalarUDFImpl for ArrayUnion { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_union_inner)(args) + make_scalar_function(array_union_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -211,12 +210,11 @@ impl ScalarUDFImpl for ArrayIntersect { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_intersect_inner)(args) + make_scalar_function(array_intersect_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -288,12 +286,11 @@ impl ScalarUDFImpl for ArrayDistinct { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_distinct_inner)(args) + make_scalar_function(array_distinct_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs index 163ca0c05bad..7dbf9f2b211e 100644 --- a/datafusion/functions-nested/src/sort.rs +++ b/datafusion/functions-nested/src/sort.rs @@ -121,12 +121,11 @@ impl ScalarUDFImpl for ArraySort { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_sort_inner)(args) + make_scalar_function(array_sort_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 9b5e0c952a3b..99af3e95c804 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -192,12 +192,11 @@ impl ScalarUDFImpl for ArrayToString { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_to_string_inner)(args) + make_scalar_function(array_to_string_inner)(&args.args) } fn aliases(&self) -> &[String] { @@ -286,11 +285,11 @@ impl ScalarUDFImpl for StringToArray { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: 
&[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { Utf8 | Utf8View => make_scalar_function(string_to_array_inner::)(args), LargeUtf8 => make_scalar_function(string_to_array_inner::)(args), From ed986906bddb26b3d94c67113cbe4f5bc72c1ded Mon Sep 17 00:00:00 2001 From: irenjj Date: Tue, 18 Feb 2025 21:37:54 +0800 Subject: [PATCH 09/71] chore: Migrate Regex function to invoke_with_args (#14728) * chore" Migrate Regex function to invoke_with_args * fix * fix issues --- datafusion/functions/src/regex/regexpcount.rs | 134 +++++++++--------- datafusion/functions/src/regex/regexplike.rs | 7 +- datafusion/functions/src/regex/regexpmatch.rs | 7 +- .../functions/src/regex/regexpreplace.rs | 8 +- 4 files changed, 81 insertions(+), 75 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index a81133713360..8cb1a4ff3d60 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -108,11 +108,12 @@ impl ScalarUDFImpl for RegexpCountFunc { Ok(Int64) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; + let len = args .iter() .fold(Option::::None, |acc, arg| match arg { @@ -618,6 +619,7 @@ fn count_matches( mod tests { use super::*; use arrow::array::{GenericStringArray, StringViewArray}; + use datafusion_expr::ScalarFunctionArgs; #[test] fn test_regexp_count() { @@ -655,11 +657,11 @@ mod tests { let v_sv = ScalarValue::Utf8(Some(v.to_string())); let regex_sv = ScalarValue::Utf8(Some(regex.to_string())); let expected = expected.get(pos).cloned(); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], - 1, - ); + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], + number_rows: 2, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -670,11 +672,11 @@ mod tests { // largeutf8 let v_sv = ScalarValue::LargeUtf8(Some(v.to_string())); let regex_sv = ScalarValue::LargeUtf8(Some(regex.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], - 1, - ); + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], + number_rows: 2, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -685,11 +687,11 @@ mod tests { // utf8view let v_sv = ScalarValue::Utf8View(Some(v.to_string())); let regex_sv = ScalarValue::Utf8View(Some(regex.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], - 1, - ); + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv)], + number_rows: 2, + return_type: &Int64, + }); match re { 
Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -711,15 +713,15 @@ mod tests { let regex_sv = ScalarValue::Utf8(Some(regex.to_string())); let start_sv = ScalarValue::Int64(Some(start)); let expected = expected.get(pos).cloned(); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ], - 1, - ); + number_rows: 3, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -730,15 +732,15 @@ mod tests { // largeutf8 let v_sv = ScalarValue::LargeUtf8(Some(v.to_string())); let regex_sv = ScalarValue::LargeUtf8(Some(regex.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ], - 1, - ); + number_rows: 3, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -749,15 +751,15 @@ mod tests { // utf8view let v_sv = ScalarValue::Utf8View(Some(v.to_string())); let regex_sv = ScalarValue::Utf8View(Some(regex.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), - ColumnarValue::Scalar(start_sv), + ColumnarValue::Scalar(start_sv.clone()), ], - 1, - ); + number_rows: 3, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -781,16 +783,16 @@ mod tests { let start_sv = ScalarValue::Int64(Some(start)); let flags_sv = ScalarValue::Utf8(Some(flags.to_string())); let expected = expected.get(pos).cloned(); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -802,16 +804,16 @@ mod tests { let v_sv = ScalarValue::LargeUtf8(Some(v.to_string())); let regex_sv = ScalarValue::LargeUtf8(Some(regex.to_string())); let flags_sv = ScalarValue::LargeUtf8(Some(flags.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, 
"regexp_count scalar test failed"); @@ -823,16 +825,16 @@ mod tests { let v_sv = ScalarValue::Utf8View(Some(v.to_string())); let regex_sv = ScalarValue::Utf8View(Some(regex.to_string())); let flags_sv = ScalarValue::Utf8View(Some(flags.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), - ColumnarValue::Scalar(start_sv), + ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -905,16 +907,16 @@ mod tests { let start_sv = ScalarValue::Int64(Some(start)); let flags_sv = ScalarValue::Utf8(flags.get(pos).map(|f| f.to_string())); let expected = expected.get(pos).cloned(); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -926,16 +928,16 @@ mod tests { let v_sv = ScalarValue::LargeUtf8(Some(v.to_string())); let regex_sv = ScalarValue::LargeUtf8(regex.get(pos).map(|s| s.to_string())); let flags_sv = ScalarValue::LargeUtf8(flags.get(pos).map(|f| f.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); @@ -947,16 +949,16 @@ mod tests { let v_sv = ScalarValue::Utf8View(Some(v.to_string())); let regex_sv = ScalarValue::Utf8View(regex.get(pos).map(|s| s.to_string())); let flags_sv = ScalarValue::Utf8View(flags.get(pos).map(|f| f.to_string())); - #[allow(deprecated)] // TODO: migrate to invoke_with_args - let re = RegexpCountFunc::new().invoke_batch( - &[ + let re = RegexpCountFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ ColumnarValue::Scalar(v_sv), ColumnarValue::Scalar(regex_sv), - ColumnarValue::Scalar(start_sv), + ColumnarValue::Scalar(start_sv.clone()), ColumnarValue::Scalar(flags_sv.clone()), ], - 1, - ); + number_rows: 4, + return_type: &Int64, + }); match re { Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => { assert_eq!(v, expected, "regexp_count scalar test failed"); diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 296ec339a623..6006309306d5 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -110,11 +110,12 @@ impl ScalarUDFImpl for RegexpLikeFunc { }) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: 
datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; + let len = args .iter() .fold(Option::::None, |acc, arg| match arg { diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 57207ecfdacd..1119e66398d1 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -118,11 +118,12 @@ impl ScalarUDFImpl for RegexpMatchFunc { other => DataType::List(Arc::new(Field::new_list_field(other.clone(), true))), }) } - fn invoke_batch( + + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; let len = args .iter() .fold(Option::::None, |acc, arg| match arg { diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 130c66caeecd..3a83564ff11f 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -147,11 +147,13 @@ impl ScalarUDFImpl for RegexpReplaceFunc { } }) } - fn invoke_batch( + + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; + let len = args .iter() .fold(Option::::None, |acc, arg| match arg { From 81b12d3bdb0a6125d9735c4573700e3ad55ac470 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Wed, 19 Feb 2025 00:49:38 +0800 Subject: [PATCH 10/71] bug: Fix memory reservation and allocation problems for SortExec (#14644) * Fix double memory allocation caused by collecting the merged batches; Fix batch memory consumption growth after sorting; Reserve memory more aggressively to compensate for memory needed for merging. 
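The heart of the change is the insert path: rather than sorting in place and hoping enough memory was freed, the sorter now sorts or spills the buffered batches whenever the reservation for an incoming batch cannot grow, and then retries the reservation. Below is a minimal, self-contained sketch of that reserve-or-spill control flow. The `Pool`, `Sorter`, and `Batch` types here are made up for illustration and are not the real DataFusion `MemoryReservation` or `ExternalSorter` API; they only model the pattern described in this commit message.

#[derive(Debug)]
struct Batch {
    bytes: usize, // memory needed to keep this batch buffered
}

struct Pool {
    limit: usize,
    used: usize,
}

impl Pool {
    fn try_grow(&mut self, bytes: usize) -> Result<(), String> {
        if self.used + bytes > self.limit {
            return Err(format!("cannot reserve {bytes} more bytes"));
        }
        self.used += bytes;
        Ok(())
    }

    fn free(&mut self, bytes: usize) {
        self.used = self.used.saturating_sub(bytes);
    }
}

#[derive(Default)]
struct Sorter {
    buffered: Vec<Batch>,
    buffered_bytes: usize,
    spill_count: usize,
}

impl Sorter {
    /// Buffer `batch`; if the reservation cannot grow, spill the buffered
    /// batches (releasing their reservation) and retry the reservation once.
    fn insert(&mut self, pool: &mut Pool, batch: Batch) -> Result<(), String> {
        let size = batch.bytes;
        if pool.try_grow(size).is_err() {
            self.spill(pool);
            // The buffered reservation was just released, so this retry only
            // needs room for the incoming batch.
            pool.try_grow(size)?;
        }
        self.buffered_bytes += size;
        self.buffered.push(batch);
        Ok(())
    }

    fn spill(&mut self, pool: &mut Pool) {
        // Stand-in for "sort the buffered batches and write them to disk".
        self.spill_count += 1;
        self.buffered.clear();
        pool.free(self.buffered_bytes);
        self.buffered_bytes = 0;
    }
}

fn main() -> Result<(), String> {
    let mut pool = Pool { limit: 1024, used: 0 };
    let mut sorter = Sorter::default();
    for _ in 0..10 {
        sorter.insert(&mut pool, Batch { bytes: 300 })?;
    }
    println!(
        "spills: {}, batches still buffered: {}",
        sorter.spill_count,
        sorter.buffered.len()
    );
    Ok(())
}

Because spilling releases the whole buffered reservation, the retry only needs room for the new batch; the real implementation in this patch additionally keeps a separate reservation for the merge phase, which it frees before merging and re-reserves afterwards.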
--- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 214 ++++++++++++-- datafusion/core/tests/memory_limit/mod.rs | 8 +- datafusion/physical-plan/src/sorts/cursor.rs | 22 +- datafusion/physical-plan/src/sorts/sort.rs | 262 ++++++++++++++---- datafusion/physical-plan/src/sorts/stream.rs | 18 +- .../src/sorts/streaming_merge.rs | 3 +- datafusion/physical-plan/src/spill.rs | 10 +- datafusion/physical-plan/src/test.rs | 35 ++- 8 files changed, 487 insertions(+), 85 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index ecc077261acc..51a5bc87efd9 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use arrow::{ - array::{ArrayRef, Int32Array}, + array::{as_string_array, ArrayRef, Int32Array, StringArray}, compute::SortOptions, record_batch::RecordBatch, }; @@ -29,6 +29,7 @@ use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::{collect, ExecutionPlan}; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::cast::as_int32_array; use datafusion_execution::memory_pool::GreedyMemoryPool; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -42,42 +43,139 @@ const KB: usize = 1 << 10; #[cfg_attr(tarpaulin, ignore)] async fn test_sort_10k_mem() { for (batch_size, should_spill) in [(5, false), (20000, true), (500000, true)] { - SortTest::new() + let (input, collected) = SortTest::new() .with_int32_batches(batch_size) + .with_sort_columns(vec!["x"]) .with_pool_size(10 * KB) .with_should_spill(should_spill) .run() .await; + + let expected = partitions_to_sorted_vec(&input); + let actual = batches_to_vec(&collected); + assert_eq!(expected, actual, "failure in @ batch_size {batch_size:?}"); } } #[tokio::test] #[cfg_attr(tarpaulin, ignore)] async fn test_sort_100k_mem() { - for (batch_size, should_spill) in [(5, false), (20000, false), (1000000, true)] { - SortTest::new() + for (batch_size, should_spill) in + [(5, false), (10000, false), (20000, true), (1000000, true)] + { + let (input, collected) = SortTest::new() .with_int32_batches(batch_size) + .with_sort_columns(vec!["x"]) .with_pool_size(100 * KB) .with_should_spill(should_spill) .run() .await; + + let expected = partitions_to_sorted_vec(&input); + let actual = batches_to_vec(&collected); + assert_eq!(expected, actual, "failure in @ batch_size {batch_size:?}"); + } +} + +#[tokio::test] +#[cfg_attr(tarpaulin, ignore)] +async fn test_sort_strings_100k_mem() { + for (batch_size, should_spill) in + [(5, false), (1000, false), (10000, true), (20000, true)] + { + let (input, collected) = SortTest::new() + .with_utf8_batches(batch_size) + .with_sort_columns(vec!["x"]) + .with_pool_size(100 * KB) + .with_should_spill(should_spill) + .run() + .await; + + let mut input = input + .iter() + .flat_map(|p| p.iter()) + .flat_map(|b| { + let array = b.column(0); + as_string_array(array) + .iter() + .map(|s| s.unwrap().to_string()) + }) + .collect::>(); + input.sort_unstable(); + let actual = collected + .iter() + .flat_map(|b| { + let array = b.column(0); + as_string_array(array) + .iter() + .map(|s| s.unwrap().to_string()) + }) + .collect::>(); + assert_eq!(input, actual); + } +} + +#[tokio::test] +#[cfg_attr(tarpaulin, ignore)] +async fn test_sort_multi_columns_100k_mem() { + for (batch_size, should_spill) in + [(5, false), 
(1000, false), (10000, true), (20000, true)] + { + let (input, collected) = SortTest::new() + .with_int32_utf8_batches(batch_size) + .with_sort_columns(vec!["x", "y"]) + .with_pool_size(100 * KB) + .with_should_spill(should_spill) + .run() + .await; + + fn record_batch_to_vec(b: &RecordBatch) -> Vec<(i32, String)> { + let mut rows: Vec<_> = Vec::new(); + let i32_array = as_int32_array(b.column(0)).unwrap(); + let string_array = as_string_array(b.column(1)); + for i in 0..b.num_rows() { + let str = string_array.value(i).to_string(); + let i32 = i32_array.value(i); + rows.push((i32, str)); + } + rows + } + let mut input = input + .iter() + .flat_map(|p| p.iter()) + .flat_map(record_batch_to_vec) + .collect::>(); + input.sort_unstable(); + let actual = collected + .iter() + .flat_map(record_batch_to_vec) + .collect::>(); + assert_eq!(input, actual); } } #[tokio::test] async fn test_sort_unlimited_mem() { for (batch_size, should_spill) in [(5, false), (20000, false), (1000000, false)] { - SortTest::new() + let (input, collected) = SortTest::new() .with_int32_batches(batch_size) + .with_sort_columns(vec!["x"]) .with_pool_size(usize::MAX) .with_should_spill(should_spill) .run() .await; + + let expected = partitions_to_sorted_vec(&input); + let actual = batches_to_vec(&collected); + assert_eq!(expected, actual, "failure in @ batch_size {batch_size:?}"); } } + #[derive(Debug, Default)] struct SortTest { input: Vec>, + /// The names of the columns to sort by + sort_columns: Vec, /// GreedyMemoryPool size, if specified pool_size: Option, /// If true, expect the sort to spill @@ -89,12 +187,29 @@ impl SortTest { Default::default() } + fn with_sort_columns(mut self, sort_columns: Vec<&str>) -> Self { + self.sort_columns = sort_columns.iter().map(|s| s.to_string()).collect(); + self + } + /// Create batches of int32 values of rows fn with_int32_batches(mut self, rows: usize) -> Self { self.input = vec![make_staggered_i32_batches(rows)]; self } + /// Create batches of utf8 values of rows + fn with_utf8_batches(mut self, rows: usize) -> Self { + self.input = vec![make_staggered_utf8_batches(rows)]; + self + } + + /// Create batches of int32 and utf8 values of rows + fn with_int32_utf8_batches(mut self, rows: usize) -> Self { + self.input = vec![make_staggered_i32_utf8_batches(rows)]; + self + } + /// specify that this test should use a memory pool of the specified size fn with_pool_size(mut self, pool_size: usize) -> Self { self.pool_size = Some(pool_size); @@ -108,7 +223,7 @@ impl SortTest { /// Sort the input using SortExec and ensure the results are /// correct according to `Vec::sort` both with and without spilling - async fn run(&self) { + async fn run(&self) -> (Vec>, Vec) { let input = self.input.clone(); let first_batch = input .iter() @@ -117,16 +232,21 @@ impl SortTest { .expect("at least one batch"); let schema = first_batch.schema(); - let sort = LexOrdering::new(vec![PhysicalSortExpr { - expr: col("x", &schema).unwrap(), - options: SortOptions { - descending: false, - nulls_first: true, - }, - }]); + let sort_ordering = LexOrdering::new( + self.sort_columns + .iter() + .map(|c| PhysicalSortExpr { + expr: col(c, &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: true, + }, + }) + .collect(), + ); let exec = MemorySourceConfig::try_new_exec(&input, schema, None).unwrap(); - let sort = Arc::new(SortExec::new(sort, exec)); + let sort = Arc::new(SortExec::new(sort_ordering, exec)); let session_config = SessionConfig::new(); let session_ctx = if let Some(pool_size) = 
self.pool_size { @@ -151,9 +271,6 @@ impl SortTest { let task_ctx = session_ctx.task_ctx(); let collected = collect(sort.clone(), task_ctx).await.unwrap(); - let expected = partitions_to_sorted_vec(&input); - let actual = batches_to_vec(&collected); - if self.should_spill { assert_ne!( sort.metrics().unwrap().spill_count().unwrap(), @@ -173,7 +290,8 @@ impl SortTest { 0, "The sort should have returned all memory used back to the memory pool" ); - assert_eq!(expected, actual, "failure in @ pool_size {self:?}"); + + (input, collected) } } @@ -201,3 +319,63 @@ fn make_staggered_i32_batches(len: usize) -> Vec { } batches } + +/// Return randomly sized record batches in a field named 'x' of type `Utf8` +/// with randomized content +fn make_staggered_utf8_batches(len: usize) -> Vec { + let mut rng = rand::thread_rng(); + let max_batch = 1024; + + let mut batches = vec![]; + let mut remaining = len; + while remaining != 0 { + let to_read = rng.gen_range(0..=remaining.min(max_batch)); + remaining -= to_read; + + batches.push( + RecordBatch::try_from_iter(vec![( + "x", + Arc::new(StringArray::from_iter_values( + (0..to_read).map(|_| format!("test_string_{}", rng.gen::())), + )) as ArrayRef, + )]) + .unwrap(), + ) + } + batches +} + +/// Return randomly sized record batches in a field named 'x' of type `Int32` +/// with randomized i32 content and a field named 'y' of type `Utf8` +/// with randomized content +fn make_staggered_i32_utf8_batches(len: usize) -> Vec { + let mut rng = rand::thread_rng(); + let max_batch = 1024; + + let mut batches = vec![]; + let mut remaining = len; + while remaining != 0 { + let to_read = rng.gen_range(0..=remaining.min(max_batch)); + remaining -= to_read; + + batches.push( + RecordBatch::try_from_iter(vec![ + ( + "x", + Arc::new(Int32Array::from_iter_values( + (0..to_read).map(|_| rng.gen()), + )) as ArrayRef, + ), + ( + "y", + Arc::new(StringArray::from_iter_values( + (0..to_read).map(|_| format!("test_string_{}", rng.gen::())), + )) as ArrayRef, + ), + ]) + .unwrap(), + ) + } + + batches +} diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 719faed4e454..669294d38af1 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -69,7 +69,7 @@ async fn oom_sort() { .with_expected_errors(vec![ "Resources exhausted: Memory Exhausted while Sorting (DiskManager is disabled)", ]) - .with_memory_limit(200_000) + .with_memory_limit(500_000) .run() .await } @@ -271,7 +271,8 @@ async fn sort_spill_reservation() { // Merge operation needs extra memory to do row conversion, so make the // memory limit larger. - let mem_limit = partition_size * 2; + let mem_limit = + ((partition_size * 2 + 1024) as f64 / MEMORY_FRACTION).ceil() as usize; let test = TestCase::new() // This query uses a different order than the input table to // force a sort. 
It also needs to have multiple columns to @@ -308,7 +309,8 @@ async fn sort_spill_reservation() { test.clone() .with_expected_errors(vec![ - "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as: ExternalSorterMerge", + "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:", + "bytes for ExternalSorterMerge", ]) .with_config(config) .run() diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index e6986b86046c..8ea7c43d2613 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -291,6 +291,10 @@ pub struct ArrayValues { // Otherwise, the first null index null_threshold: usize, options: SortOptions, + + /// Tracks the memory used by the values array, + /// freed on drop. + _reservation: MemoryReservation, } impl ArrayValues { @@ -298,7 +302,11 @@ impl ArrayValues { /// to `options`. /// /// Panics if the array is empty - pub fn new>(options: SortOptions, array: &A) -> Self { + pub fn new>( + options: SortOptions, + array: &A, + reservation: MemoryReservation, + ) -> Self { assert!(array.len() > 0, "Empty array passed to FieldCursor"); let null_threshold = match options.nulls_first { true => array.null_count(), @@ -309,6 +317,7 @@ impl ArrayValues { values: array.values(), null_threshold, options, + _reservation: reservation, } } @@ -360,6 +369,12 @@ impl CursorValues for ArrayValues { #[cfg(test)] mod tests { + use std::sync::Arc; + + use datafusion_execution::memory_pool::{ + GreedyMemoryPool, MemoryConsumer, MemoryPool, + }; + use super::*; fn new_primitive( @@ -372,10 +387,15 @@ mod tests { false => values.len() - null_count, }; + let memory_pool: Arc = Arc::new(GreedyMemoryPool::new(10000)); + let consumer = MemoryConsumer::new("test"); + let reservation = consumer.register(&memory_pool); + let values = ArrayValues { values: PrimitiveValues(values), null_threshold, options, + _reservation: reservation, }; Cursor::new(values) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 9f7e82f026bd..649468260e56 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -24,7 +24,7 @@ use std::fmt; use std::fmt::{Debug, Formatter}; use std::sync::Arc; -use crate::common::spawn_buffered; +use crate::common::{spawn_buffered, IPCWriter}; use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; @@ -279,7 +279,7 @@ impl ExternalSorter { Self { schema, in_mem_batches: vec![], - in_mem_batches_sorted: true, + in_mem_batches_sorted: false, spills: vec![], expr: expr.into(), metrics, @@ -302,27 +302,13 @@ impl ExternalSorter { } self.reserve_memory_for_merge()?; - let size = get_record_batch_memory_size(&input); - + let size = get_reserved_byte_for_record_batch(&input); if self.reservation.try_grow(size).is_err() { - let before = self.reservation.size(); - self.in_mem_sort().await?; - - // Sorting may have freed memory, especially if fetch is `Some` - // - // As such we check again, and if the memory usage has dropped by - // a factor of 2, and we can allocate the necessary capacity, - // we don't spill - // - // The factor of 2 aims to avoid a degenerate case where the - // memory required for `fetch` is just under the memory available, - // causing repeated re-sorting of data - if self.reservation.size() > 
before / 2 - || self.reservation.try_grow(size).is_err() - { - self.spill().await?; - self.reservation.try_grow(size)? - } + self.sort_or_spill_in_mem_batches().await?; + // We've already freed more than half of reserved memory, + // so we can grow the reservation again. There's nothing we can do + // if this try_grow fails. + self.reservation.try_grow(size)?; } self.in_mem_batches.push(input); @@ -344,6 +330,11 @@ impl ExternalSorter { /// 2. A combined streaming merge incorporating both in-memory /// batches and data from spill files on disk. fn sort(&mut self) -> Result { + // Release the memory reserved for merge back to the pool so + // there is some left when `in_mem_sort_stream` requests an + // allocation. + self.merge_reservation.free(); + if self.spilled_before() { let mut streams = vec![]; if !self.in_mem_batches.is_empty() { @@ -369,7 +360,7 @@ impl ExternalSorter { .with_metrics(self.metrics.baseline.clone()) .with_batch_size(self.batch_size) .with_fetch(self.fetch) - .with_reservation(self.reservation.new_empty()) + .with_reservation(self.merge_reservation.new_empty()) .build() } else { self.in_mem_sort_stream(self.metrics.baseline.clone()) @@ -408,50 +399,102 @@ impl ExternalSorter { debug!("Spilling sort data of ExternalSorter to disk whilst inserting"); - self.in_mem_sort().await?; - let spill_file = self.runtime.disk_manager.create_tmp_file("Sorting")?; let batches = std::mem::take(&mut self.in_mem_batches); - let spilled_rows = spill_record_batches( + let (spilled_rows, spilled_bytes) = spill_record_batches( batches, spill_file.path().into(), Arc::clone(&self.schema), )?; let used = self.reservation.free(); self.metrics.spill_count.add(1); - self.metrics.spilled_bytes.add(used); + self.metrics.spilled_bytes.add(spilled_bytes); self.metrics.spilled_rows.add(spilled_rows); self.spills.push(spill_file); Ok(used) } /// Sorts the in_mem_batches in place - async fn in_mem_sort(&mut self) -> Result<()> { - if self.in_mem_batches_sorted { - return Ok(()); - } - + /// + /// Sorting may have freed memory, especially if fetch is `Some`. If + /// the memory usage has dropped by a factor of 2, then we don't have + /// to spill. Otherwise, we spill to free up memory for inserting + /// more batches. + /// + /// The factor of 2 aims to avoid a degenerate case where the + /// memory required for `fetch` is just under the memory available, + // causing repeated re-sorting of data + async fn sort_or_spill_in_mem_batches(&mut self) -> Result<()> { // Release the memory reserved for merge back to the pool so - // there is some left when `in_memo_sort_stream` requests an - // allocation. + // there is some left when `in_mem_sort_stream` requests an + // allocation. At the end of this function, memory will be + // reserved again for the next spill. self.merge_reservation.free(); - self.in_mem_batches = self - .in_mem_sort_stream(self.metrics.baseline.intermediate())? - .try_collect() - .await?; + let before = self.reservation.size(); + + let mut sorted_stream = + self.in_mem_sort_stream(self.metrics.baseline.intermediate())?; + + // `self.in_mem_batches` is already taken away by the sort_stream, now it is empty. + // We'll gradually collect the sorted stream into self.in_mem_batches, or directly + // write sorted batches to disk when the memory is insufficient. 
+ let mut spill_writer: Option = None; + while let Some(batch) = sorted_stream.next().await { + let batch = batch?; + match &mut spill_writer { + None => { + let sorted_size = get_reserved_byte_for_record_batch(&batch); + if self.reservation.try_grow(sorted_size).is_err() { + // Directly write in_mem_batches as well as all the remaining batches in + // sorted_stream to disk. Further batches fetched from `sorted_stream` will + // be handled by the `Some(writer)` matching arm. + let spill_file = + self.runtime.disk_manager.create_tmp_file("Sorting")?; + let mut writer = IPCWriter::new(spill_file.path(), &self.schema)?; + // Flush everything in memory to the spill file + for batch in self.in_mem_batches.drain(..) { + writer.write(&batch)?; + } + // as well as the newly sorted batch + writer.write(&batch)?; + spill_writer = Some(writer); + self.reservation.free(); + self.spills.push(spill_file); + } else { + self.in_mem_batches.push(batch); + self.in_mem_batches_sorted = true; + } + } + Some(writer) => { + writer.write(&batch)?; + } + } + } + + // Drop early to free up memory reserved by the sorted stream, otherwise the + // upcoming `self.reserve_memory_for_merge()` may fail due to insufficient memory. + drop(sorted_stream); - let size: usize = self - .in_mem_batches - .iter() - .map(get_record_batch_memory_size) - .sum(); + if let Some(writer) = &mut spill_writer { + writer.finish()?; + self.metrics.spill_count.add(1); + self.metrics.spilled_rows.add(writer.num_rows); + self.metrics.spilled_bytes.add(writer.num_bytes); + } + + // Sorting may free up some memory especially when fetch is `Some`. If we have + // not freed more than 50% of the memory, then we have to spill to free up more + // memory for inserting more batches. + if spill_writer.is_none() && self.reservation.size() > before / 2 { + // We have not freed more than 50% of the memory, so we have to spill to + // free up more memory + self.spill().await?; + } // Reserve headroom for next sort/merge self.reserve_memory_for_merge()?; - self.reservation.try_resize(size)?; - self.in_mem_batches_sorted = true; Ok(()) } @@ -528,6 +571,12 @@ impl ExternalSorter { let elapsed_compute = metrics.elapsed_compute().clone(); let _timer = elapsed_compute.timer(); + // Please pay attention that any operation inside of `in_mem_sort_stream` will + // not perform any memory reservation. This is for avoiding the need of handling + // reservation failure and spilling in the middle of the sort/merge. The memory + // space for batches produced by the resulting stream will be reserved by the + // consumer of the stream. 
+ if self.in_mem_batches.len() == 1 { let batch = self.in_mem_batches.swap_remove(0); let reservation = self.reservation.take(); @@ -540,7 +589,7 @@ impl ExternalSorter { let batch = concat_batches(&self.schema, &self.in_mem_batches)?; self.in_mem_batches.clear(); self.reservation - .try_resize(get_record_batch_memory_size(&batch))?; + .try_resize(get_reserved_byte_for_record_batch(&batch))?; let reservation = self.reservation.take(); return self.sort_batch_stream(batch, metrics, reservation); } @@ -549,8 +598,9 @@ impl ExternalSorter { .into_iter() .map(|batch| { let metrics = self.metrics.baseline.intermediate(); - let reservation = - self.reservation.split(get_record_batch_memory_size(&batch)); + let reservation = self + .reservation + .split(get_reserved_byte_for_record_batch(&batch)); let input = self.sort_batch_stream(batch, metrics, reservation)?; Ok(spawn_buffered(input, 1)) }) @@ -579,7 +629,10 @@ impl ExternalSorter { metrics: BaselineMetrics, reservation: MemoryReservation, ) -> Result { - assert_eq!(get_record_batch_memory_size(&batch), reservation.size()); + assert_eq!( + get_reserved_byte_for_record_batch(&batch), + reservation.size() + ); let schema = batch.schema(); let fetch = self.fetch; @@ -612,6 +665,20 @@ impl ExternalSorter { } } +/// Estimate how much memory is needed to sort a `RecordBatch`. +/// +/// This is used to pre-reserve memory for the sort/merge. The sort/merge process involves +/// creating sorted copies of sorted columns in record batches for speeding up comparison +/// in sorting and merging. The sorted copies are in either row format or array format. +/// Please refer to cursor.rs and stream.rs for more details. No matter what format the +/// sorted copies are, they will use more memory than the original record batch. +fn get_reserved_byte_for_record_batch(batch: &RecordBatch) -> usize { + // 2x may not be enough for some cases, but it's a good start. + // If 2x is not enough, user can set a larger value for `sort_spill_reservation_bytes` + // to compensate for the extra memory needed. + get_record_batch_memory_size(batch) * 2 +} + impl Debug for ExternalSorter { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_struct("ExternalSorter") @@ -641,7 +708,15 @@ pub fn sort_batch( lexsort_to_indices(&sort_columns, fetch)? }; - let columns = take_arrays(batch.columns(), &indices, None)?; + let mut columns = take_arrays(batch.columns(), &indices, None)?; + + // The columns may be larger than the unsorted columns in `batch` especially for variable length + // data types due to exponential growth when building the sort columns. We shrink the columns + // to prevent memory reservation failures, as well as excessive memory allocation when running + // merges in `SortPreservingMergeStream`. + columns.iter_mut().for_each(|c| { + c.shrink_to_fit(); + }); let options = RecordBatchOptions::new().with_row_count(Some(indices.len())); Ok(RecordBatch::try_new_with_options( @@ -1246,6 +1321,9 @@ mod tests { .with_runtime(runtime), ); + // The input has 100 partitions, each partition has a batch containing 100 rows. + // Each row has a single Int32 column with values 0..100. The total size of the + // input is roughly 40000 bytes. 
let partitions = 100; let input = test::scan_partitioned(partitions); let schema = input.schema(); @@ -1271,9 +1349,16 @@ mod tests { assert_eq!(metrics.output_rows().unwrap(), 10000); assert!(metrics.elapsed_compute().unwrap() > 0); - assert_eq!(metrics.spill_count().unwrap(), 3); - assert_eq!(metrics.spilled_bytes().unwrap(), 36000); - assert_eq!(metrics.spilled_rows().unwrap(), 9000); + + let spill_count = metrics.spill_count().unwrap(); + let spilled_rows = metrics.spilled_rows().unwrap(); + let spilled_bytes = metrics.spilled_bytes().unwrap(); + // Processing 40000 bytes of data using 12288 bytes of memory requires 3 spills + // unless we do something really clever. It will spill roughly 9000+ rows and 36000 + // bytes. We leave a little wiggle room for the actual numbers. + assert!((3..=10).contains(&spill_count)); + assert!((9000..=10000).contains(&spilled_rows)); + assert!((36000..=40000).contains(&spilled_bytes)); let columns = result[0].columns(); @@ -1290,6 +1375,77 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_sort_spill_utf8_strings() -> Result<()> { + let session_config = SessionConfig::new() + .with_batch_size(100) + .with_sort_in_place_threshold_bytes(20 * 1024) + .with_sort_spill_reservation_bytes(100 * 1024); + let runtime = RuntimeEnvBuilder::new() + .with_memory_limit(500 * 1024, 1.0) + .build_arc()?; + let task_ctx = Arc::new( + TaskContext::default() + .with_session_config(session_config) + .with_runtime(runtime), + ); + + // The input has 200 partitions, each partition has a batch containing 100 rows. + // Each row has a single Utf8 column, the Utf8 string values are roughly 42 bytes. + // The total size of the input is roughly 8.4 KB. + let input = test::scan_partitioned_utf8(200); + let schema = input.schema(); + + let sort_exec = Arc::new(SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr { + expr: col("i", &schema)?, + options: SortOptions::default(), + }]), + Arc::new(CoalescePartitionsExec::new(input)), + )); + + let result = collect( + Arc::clone(&sort_exec) as Arc, + Arc::clone(&task_ctx), + ) + .await?; + + let num_rows = result.iter().map(|batch| batch.num_rows()).sum::(); + assert_eq!(num_rows, 20000); + + // Now, validate metrics + let metrics = sort_exec.metrics().unwrap(); + + assert_eq!(metrics.output_rows().unwrap(), 20000); + assert!(metrics.elapsed_compute().unwrap() > 0); + + let spill_count = metrics.spill_count().unwrap(); + let spilled_rows = metrics.spilled_rows().unwrap(); + let spilled_bytes = metrics.spilled_bytes().unwrap(); + // Processing 840 KB of data using 400 KB of memory requires at least 2 spills + // It will spill roughly 18000 rows and 800 KBytes. + // We leave a little wiggle room for the actual numbers. + assert!((2..=10).contains(&spill_count)); + assert!((15000..=20000).contains(&spilled_rows)); + assert!((700000..=900000).contains(&spilled_bytes)); + + // Verify that the result is sorted + let concated_result = concat_batches(&schema, &result)?; + let columns = concated_result.columns(); + let string_array = as_string_array(&columns[0]); + for i in 0..string_array.len() - 1 { + assert!(string_array.value(i) <= string_array.value(i + 1)); + } + + assert_eq!( + task_ctx.runtime_env().memory_pool.reserved(), + 0, + "The sort should have returned all memory used back to the memory manager" + ); + + Ok(()) + } + #[tokio::test] async fn test_sort_fetch_memory_calculation() -> Result<()> { // This test mirrors down the size from the example above. 
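For illustration, here is a minimal sketch (not part of this patch) of how the spill-related knobs exercised by the new tests -- `sort_spill_reservation_bytes`, `sort_in_place_threshold_bytes`, and a bounded memory pool -- can be wired up through the public API. The query and the concrete sizes below are illustrative assumptions only, and spilling relies on the default disk manager being enabled:

    use datafusion::error::Result;
    use datafusion::execution::runtime_env::RuntimeEnvBuilder;
    use datafusion::prelude::{SessionConfig, SessionContext};

    #[tokio::main]
    async fn main() -> Result<()> {
        let config = SessionConfig::new()
            .with_batch_size(100)
            // switch away from in-place sorting once batches grow past ~20 KiB
            .with_sort_in_place_threshold_bytes(20 * 1024)
            // headroom pre-reserved for the merge phase; raise this if the 2x
            // estimate in `get_reserved_byte_for_record_batch` proves too small
            .with_sort_spill_reservation_bytes(100 * 1024);
        // cap the memory pool so a large ORDER BY is forced to spill to disk
        let runtime = RuntimeEnvBuilder::new()
            .with_memory_limit(500 * 1024, 1.0)
            .build_arc()?;
        let ctx = SessionContext::new_with_config_rt(config, runtime);
        let df = ctx
            .sql("SELECT * FROM generate_series(1, 100000) ORDER BY 1")
            .await?;
        df.show().await?;
        Ok(())
    }
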
diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index ab8054be59a8..e029c60b285b 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ b/datafusion/physical-plan/src/sorts/stream.rs @@ -159,6 +159,8 @@ pub struct FieldCursorStream { sort: PhysicalSortExpr, /// Input streams streams: FusedStreams, + /// Create new reservations for each array + reservation: MemoryReservation, phantom: PhantomData T>, } @@ -171,11 +173,16 @@ impl std::fmt::Debug for FieldCursorStream { } impl FieldCursorStream { - pub fn new(sort: PhysicalSortExpr, streams: Vec) -> Self { + pub fn new( + sort: PhysicalSortExpr, + streams: Vec, + reservation: MemoryReservation, + ) -> Self { let streams = streams.into_iter().map(|s| s.fuse()).collect(); Self { sort, streams: FusedStreams(streams), + reservation, phantom: Default::default(), } } @@ -183,8 +190,15 @@ impl FieldCursorStream { fn convert_batch(&mut self, batch: &RecordBatch) -> Result> { let value = self.sort.expr.evaluate(batch)?; let array = value.into_array(batch.num_rows())?; + let size_in_mem = array.get_buffer_memory_size(); let array = array.as_any().downcast_ref::().expect("field values"); - Ok(ArrayValues::new(self.sort.options, array)) + let mut array_reservation = self.reservation.new_empty(); + array_reservation.try_grow(size_in_mem)?; + Ok(ArrayValues::new( + self.sort.options, + array, + array_reservation, + )) } } diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 909b5875c8c5..a541f79dc717 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -38,7 +38,8 @@ macro_rules! primitive_merge_helper { macro_rules! 
merge_helper { ($t:ty, $sort:ident, $streams:ident, $schema:ident, $tracking_metrics:ident, $batch_size:ident, $fetch:ident, $reservation:ident, $enable_round_robin_tie_breaker:ident) => {{ - let streams = FieldCursorStream::<$t>::new($sort, $streams); + let streams = + FieldCursorStream::<$t>::new($sort, $streams, $reservation.new_empty()); return Ok(Box::pin(SortPreservingMergeStream::new( Box::new(streams), $schema, diff --git a/datafusion/physical-plan/src/spill.rs b/datafusion/physical-plan/src/spill.rs index dbcc46baf8ca..b45353ae13f0 100644 --- a/datafusion/physical-plan/src/spill.rs +++ b/datafusion/physical-plan/src/spill.rs @@ -62,7 +62,7 @@ pub(crate) fn spill_record_batches( batches: Vec, path: PathBuf, schema: SchemaRef, -) -> Result { +) -> Result<(usize, usize)> { let mut writer = IPCWriter::new(path.as_ref(), schema.as_ref())?; for batch in batches { writer.write(&batch)?; @@ -74,7 +74,7 @@ pub(crate) fn spill_record_batches( writer.num_rows, human_readable_size(writer.num_bytes), ); - Ok(writer.num_rows) + Ok((writer.num_rows, writer.num_bytes)) } fn read_spill(sender: Sender>, path: &Path) -> Result<()> { @@ -213,12 +213,12 @@ mod tests { let spill_file = disk_manager.create_tmp_file("Test Spill")?; let schema = batch1.schema(); let num_rows = batch1.num_rows() + batch2.num_rows(); - let cnt = spill_record_batches( + let (spilled_rows, _) = spill_record_batches( vec![batch1, batch2], spill_file.path().into(), Arc::clone(&schema), - ); - assert_eq!(cnt.unwrap(), num_rows); + )?; + assert_eq!(spilled_rows, num_rows); let file = BufReader::new(File::open(spill_file.path())?); let reader = FileReader::try_new(file, None)?; diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs index e73d6d97e986..ad0e43503b2b 100644 --- a/datafusion/physical-plan/src/test.rs +++ b/datafusion/physical-plan/src/test.rs @@ -21,8 +21,8 @@ use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; -use arrow::array::{ArrayRef, Int32Array, RecordBatch}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::array::{Array, ArrayRef, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use futures::{Future, FutureExt}; @@ -132,11 +132,30 @@ pub fn make_partition(sz: i32) -> RecordBatch { RecordBatch::try_new(schema, vec![arr]).unwrap() } +pub fn make_partition_utf8(sz: i32) -> RecordBatch { + let seq_start = 0; + let seq_end = sz; + let values = (seq_start..seq_end) + .map(|i| format!("test_long_string_that_is_roughly_42_bytes_{}", i)) + .collect::>(); + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Utf8, true)])); + let mut string_array = arrow::array::StringArray::from(values); + string_array.shrink_to_fit(); + let arr = Arc::new(string_array); + let arr = arr as ArrayRef; + + RecordBatch::try_new(schema, vec![arr]).unwrap() +} + /// Returns a `DataSourceExec` that scans `partitions` of 100 batches each pub fn scan_partitioned(partitions: usize) -> Arc { Arc::new(mem_exec(partitions)) } +pub fn scan_partitioned_utf8(partitions: usize) -> Arc { + Arc::new(mem_exec_utf8(partitions)) +} + /// Returns a `DataSourceExec` that scans `partitions` of 100 batches each pub fn mem_exec(partitions: usize) -> DataSourceExec { let data: Vec> = (0..partitions).map(|_| vec![make_partition(100)]).collect(); @@ -148,6 +167,18 @@ pub fn mem_exec(partitions: usize) -> DataSourceExec { )) } +pub fn mem_exec_utf8(partitions: usize) -> 
DataSourceExec { + let data: Vec> = (0..partitions) + .map(|_| vec![make_partition_utf8(100)]) + .collect(); + + let schema = data[0][0].schema(); + let projection = None; + DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&data, schema, projection).unwrap(), + )) +} + // Construct a stream partition for test purposes #[derive(Debug)] pub struct TestPartitionStream { From ed517efcf329de55e7a9d881d6d76d885bc3598d Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 18 Feb 2025 09:27:07 -0800 Subject: [PATCH 11/71] Skip target in taplo checks (#14747) `target` can contain arbitrary files. Contents depend on tools used and tests run, and may even include `Cargo.toml` files that are not properly formatted. The taplo check should never complain on files that are not meant to be part of the repository, and `target` contents clearly falls into that category. `trybuild` is an example of a tool that creates cargo files inside target directory. --- taplo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/taplo.toml b/taplo.toml index b7089c501680..47b33161c37e 100644 --- a/taplo.toml +++ b/taplo.toml @@ -18,6 +18,7 @@ ## https://taplo.tamasfe.dev/configuration/file.html include = ["**/Cargo.toml"] +exclude = ["target/*"] [formatting] # Align consecutive entries vertically. From ecd43ec06fc8130b704f9eb41ad178ac219442e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:33:02 -0800 Subject: [PATCH 12/71] chore(deps): bump uuid from 1.13.1 to 1.13.2 (#14739) Bumps [uuid](https://github.com/uuid-rs/uuid) from 1.13.1 to 1.13.2. - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/1.13.1...v1.13.2) --- updated-dependencies: - dependency-name: uuid dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- datafusion-examples/Cargo.toml | 2 +- datafusion/core/Cargo.toml | 2 +- datafusion/functions/Cargo.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e110789bfda..25a223e0db4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6502,9 +6502,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" dependencies = [ "getrandom 0.3.1", "js-sys", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index ec6e0ab71d50..feafa48b3954 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -74,7 +74,7 @@ test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tonic = "0.12.1" url = { workspace = true } -uuid = "1.7" +uuid = "1.13" [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 784b2a89aae9..8a706ca19f4d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -126,7 +126,7 @@ sqlparser = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } -uuid = { version = "1.7", features = ["v4", "js"] } +uuid = { version = "1.13", features = ["v4", "js"] } xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index c77e58f0c022..97c9e0dbd475 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -85,7 +85,7 @@ rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.7", features = ["v4"], optional = true } +uuid = { version = "1.13", features = ["v4"], optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } From 740759a697226ca68a7f38544a7f153eed1f5e19 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:33:24 -0800 Subject: [PATCH 13/71] chore(deps): bump blake3 from 1.5.5 to 1.6.0 (#14741) Bumps [blake3](https://github.com/BLAKE3-team/BLAKE3) from 1.5.5 to 1.6.0. - [Release notes](https://github.com/BLAKE3-team/BLAKE3/releases) - [Commits](https://github.com/BLAKE3-team/BLAKE3/compare/1.5.5...1.6.0) --- updated-dependencies: - dependency-name: blake3 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 14 ++++++++++++-- datafusion/functions/Cargo.toml | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25a223e0db4c..4c47b23c4233 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1035,15 +1035,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "1230237285e3e10cde447185e8975408ae24deaa67205ce684805c25bc0c7937" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", + "memmap2", ] [[package]] @@ -3750,6 +3751,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.9.1" diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 97c9e0dbd475..7455c177086c 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -69,7 +69,7 @@ arrow = { workspace = true } arrow-buffer = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } -blake3 = { version = "1.0", optional = true } +blake3 = { version = "1.6", optional = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-doc = { workspace = true } From b1df89e7b126157dee254b25ef939a4789352514 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:33:45 -0800 Subject: [PATCH 14/71] chore(deps): bump tempfile from 3.17.0 to 3.17.1 (#14742) Bumps [tempfile](https://github.com/Stebalien/tempfile) from 3.17.0 to 3.17.1. - [Changelog](https://github.com/Stebalien/tempfile/blob/master/CHANGELOG.md) - [Commits](https://github.com/Stebalien/tempfile/compare/v3.17.0...v3.17.1) --- updated-dependencies: - dependency-name: tempfile dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c47b23c4233..33e1910e3249 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5860,9 +5860,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.17.0" +version = "3.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a40f762a77d2afa88c2d919489e390a12bdd261ed568e60cfa7e48d4e20f0d33" +checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" dependencies = [ "cfg-if", "fastrand", From 0ef8984c981efa30a3600562194f1facf0447039 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:34:16 -0800 Subject: [PATCH 15/71] chore(deps): bump clap from 4.5.29 to 4.5.30 (#14743) Bumps [clap](https://github.com/clap-rs/clap) from 4.5.29 to 4.5.30. 
- [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.29...clap_complete-v4.5.30) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 16 ++++++++-------- datafusion-cli/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 33e1910e3249..419853f0ac7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1346,9 +1346,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.29" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" +checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d" dependencies = [ "clap_builder", "clap_derive", @@ -1356,9 +1356,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.29" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" +checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c" dependencies = [ "anstream", "anstyle", @@ -1550,7 +1550,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.29", + "clap 4.5.30", "criterion-plot", "futures", "is-terminal", @@ -1851,7 +1851,7 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", - "clap 4.5.29", + "clap 4.5.30", "ctor", "datafusion", "dirs", @@ -2344,7 +2344,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.29", + "clap 4.5.30", "datafusion", "env_logger", "futures", @@ -3677,7 +3677,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.29", + "clap 4.5.30", "escape8259", ] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index cef32279371e..d88f8fccb928 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -32,7 +32,7 @@ arrow = { workspace = true } async-trait = { workspace = true } aws-config = "1.5.16" aws-credential-types = "1.2.0" -clap = { version = "4.5.29", features = ["derive", "cargo"] } +clap = { version = "4.5.30", features = ["derive", "cargo"] } datafusion = { workspace = true, features = [ "avro", "crypto_expressions", diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index ddafcfbd3d9f..f1d37c7202d6 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -40,7 +40,7 @@ async-trait = { workspace = true } bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } -clap = { version = "4.5.29", features = ["derive", "env"] } +clap = { version = "4.5.30", features = ["derive", "env"] } datafusion = { workspace = true, default-features = true, features = ["avro"] } futures = { workspace = true } half = { workspace = true, default-features = true } From 481515e1a093f363252c2d790c6301a5e26e9a69 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:34:59 -0800 Subject: [PATCH 16/71] chore(deps): bump parquet 
from 54.1.0 to 54.2.0 (#14744) Bumps [parquet](https://github.com/apache/arrow-rs) from 54.1.0 to 54.2.0. - [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/main/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/54.1.0...54.2.0) --- updated-dependencies: - dependency-name: parquet dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 419853f0ac7a..88102bb3fdc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4053,9 +4053,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb" dependencies = [ "ahash 0.8.11", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index 3aacf47508da..cc12d6b2e429 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -134,7 +134,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "54.1.0", default-features = false, features = [ +parquet = { version = "54.2.0", default-features = false, features = [ "arrow", "async", "object_store", From 6a036ae1662acf125346a856ec091a6cd4a8b144 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Wed, 19 Feb 2025 01:35:51 +0100 Subject: [PATCH 17/71] Speed up `chr` UDF (~4x faster) (#14700) * add chr bench * speed up chr * 1 byte assumption --- datafusion/functions/Cargo.toml | 5 +++ datafusion/functions/benches/chr.rs | 52 +++++++++++++++++++++++++ datafusion/functions/src/string/chr.rs | 53 ++++++++++++++++---------- 3 files changed, 90 insertions(+), 20 deletions(-) create mode 100644 datafusion/functions/benches/chr.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 7455c177086c..c00997853bb3 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -108,6 +108,11 @@ harness = false name = "encoding" required-features = ["encoding_expressions"] +[[bench]] +harness = false +name = "chr" +required-features = ["string_expressions"] + [[bench]] harness = false name = "uuid" diff --git a/datafusion/functions/benches/chr.rs b/datafusion/functions/benches/chr.rs new file mode 100644 index 000000000000..58c5ee3d68f6 --- /dev/null +++ b/datafusion/functions/benches/chr.rs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::{array::PrimitiveArray, datatypes::Int64Type, util::test_util::seedable_rng}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::string::chr; +use rand::Rng; + +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let cot_fn = chr(); + let size = 1024; + let input: PrimitiveArray = { + let null_density = 0.2; + let mut rng = seedable_rng(); + (0..size) + .map(|_| { + if rng.gen::() < null_density { + None + } else { + Some(rng.gen_range::(1i64..10_000)) + } + }) + .collect() + }; + let input = Arc::new(input); + let args = vec![ColumnarValue::Array(input)]; + c.bench_function("chr", |b| { + b.iter(|| black_box(cot_fn.invoke_batch(&args, size).unwrap())) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index 58aa7ede74c4..a811de7fccf0 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::ArrayRef; -use arrow::array::StringArray; +use arrow::array::GenericStringBuilder; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::datatypes::DataType::Utf8; @@ -36,26 +36,39 @@ use datafusion_macros::user_doc; pub fn chr(args: &[ArrayRef]) -> Result { let integer_array = as_int64_array(&args[0])?; - // first map is the iterator, second is for the `Option<_>` - let result = integer_array - .iter() - .map(|integer: Option| { - integer - .map(|integer| { - if integer == 0 { - exec_err!("null character not permitted.") - } else { - match core::char::from_u32(integer as u32) { - Some(integer) => Ok(integer.to_string()), - None => { - exec_err!("requested character too large for encoding.") - } + let mut builder = GenericStringBuilder::::with_capacity( + integer_array.len(), + // 1 byte per character, assuming that is the common case + integer_array.len(), + ); + + let mut buf = [0u8; 4]; + + for integer in integer_array { + match integer { + Some(integer) => { + if integer == 0 { + return exec_err!("null character not permitted."); + } else { + match core::char::from_u32(integer as u32) { + Some(c) => { + builder.append_value(c.encode_utf8(&mut buf)); + } + None => { + return exec_err!( + "requested character too large for encoding." + ); } } - }) - .transpose() - }) - .collect::>()?; + } + } + None => { + builder.append_null(); + } + } + } + + let result = builder.finish(); Ok(Arc::new(result) as ArrayRef) } @@ -70,7 +83,7 @@ pub fn chr(args: &[ArrayRef]) -> Result { | chr(Int64(128640)) | +--------------------+ | 🚀 | -+--------------------+ ++--------------------+ ```"#, standard_argument(name = "expression", prefix = "String"), related_udf(name = "ascii") From c176533d185b76bf4728c21d3b83ca00c633614f Mon Sep 17 00:00:00 2001 From: Georgi Krastev Date: Wed, 19 Feb 2025 01:41:05 +0100 Subject: [PATCH 18/71] Support aliases in ConstEvaluator (#14734) Not sure why they are not supported. It seems that if we're not careful, some transformations can introduce aliases nested inside other expressions. 
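For illustration, a small sketch (not part of this patch) of the kind of expression this change lets the constant evaluator fold, mirroring the `test_const_evaluator_alias` case added below; the expected result in the comment is taken from that test:

    use datafusion_expr::{col, lit};

    fn main() {
        // Aliases nested inside a larger expression by earlier rewrites.
        // With this change the constant evaluator folds through them:
        //   c AS a = (Int32(1) AS b + Int32(2) AS c)  -->  c AS a = Int32(3)
        let expr = col("c").alias("a").eq(lit(1).alias("b") + lit(2).alias("c"));
        println!("{expr}");
    }
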
--- .../core/tests/expr_api/simplification.rs | 27 ++++++++++++ .../simplify_expressions/expr_simplifier.rs | 41 ++++++++----------- .../sqllogictest/test_files/subquery.slt | 6 +-- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/datafusion/core/tests/expr_api/simplification.rs b/datafusion/core/tests/expr_api/simplification.rs index 83e96bffdf48..7bb21725ef40 100644 --- a/datafusion/core/tests/expr_api/simplification.rs +++ b/datafusion/core/tests/expr_api/simplification.rs @@ -365,6 +365,33 @@ fn test_const_evaluator() { ); } +#[test] +fn test_const_evaluator_alias() { + // true --> true + test_evaluate(lit(true).alias("a"), lit(true)); + // true or true --> true + test_evaluate(lit(true).alias("a").or(lit(true).alias("b")), lit(true)); + // "foo" == "foo" --> true + test_evaluate(lit("foo").alias("a").eq(lit("foo").alias("b")), lit(true)); + // c = 1 + 2 --> c + 3 + test_evaluate( + col("c") + .alias("a") + .eq(lit(1).alias("b") + lit(2).alias("c")), + col("c").alias("a").eq(lit(3)), + ); + // (foo != foo) OR (c = 1) --> false OR (c = 1) + test_evaluate( + lit("foo") + .alias("a") + .not_eq(lit("foo").alias("b")) + .alias("c") + .or(col("c").alias("d").eq(lit(1).alias("e"))) + .alias("f"), + col("c").alias("d").eq(lit(1)).alias("f"), + ); +} + #[test] fn test_const_evaluator_scalar_functions() { // concat("foo", "bar") --> "foobar" diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 29f3d7cbda39..e43e2e704080 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -44,6 +44,8 @@ use datafusion_expr::{ }; use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps}; +use super::inlist_simplifier::ShortenInListSimplifier; +use super::utils::*; use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; @@ -51,9 +53,6 @@ use crate::simplify_expressions::SimplifyInfo; use indexmap::IndexSet; use regex::Regex; -use super::inlist_simplifier::ShortenInListSimplifier; -use super::utils::*; - /// This structure handles API for expression simplification /// /// Provides simplification information based on DFSchema and @@ -515,30 +514,27 @@ impl TreeNodeRewriter for ConstEvaluator<'_> { // NB: do not short circuit recursion even if we find a non // evaluatable node (so we can fold other children, args to - // functions, etc) + // functions, etc.) Ok(Transformed::no(expr)) } fn f_up(&mut self, expr: Expr) -> Result> { match self.can_evaluate.pop() { - // Certain expressions such as `CASE` and `COALESCE` are short circuiting - // and may not evaluate all their sub expressions. Thus if - // if any error is countered during simplification, return the original + // Certain expressions such as `CASE` and `COALESCE` are short-circuiting + // and may not evaluate all their sub expressions. 
Thus, if + // any error is countered during simplification, return the original // so that normal evaluation can occur - Some(true) => { - let result = self.evaluate_to_scalar(expr); - match result { - ConstSimplifyResult::Simplified(s) => { - Ok(Transformed::yes(Expr::Literal(s))) - } - ConstSimplifyResult::NotSimplified(s) => { - Ok(Transformed::no(Expr::Literal(s))) - } - ConstSimplifyResult::SimplifyRuntimeError(_, expr) => { - Ok(Transformed::yes(expr)) - } + Some(true) => match self.evaluate_to_scalar(expr) { + ConstSimplifyResult::Simplified(s) => { + Ok(Transformed::yes(Expr::Literal(s))) } - } + ConstSimplifyResult::NotSimplified(s) => { + Ok(Transformed::no(Expr::Literal(s))) + } + ConstSimplifyResult::SimplifyRuntimeError(_, expr) => { + Ok(Transformed::yes(expr)) + } + }, Some(false) => Ok(Transformed::no(expr)), _ => internal_err!("Failed to pop can_evaluate"), } @@ -586,9 +582,7 @@ impl<'a> ConstEvaluator<'a> { // added they can be checked for their ability to be evaluated // at plan time match expr { - // Has no runtime cost, but needed during planning - Expr::Alias(..) - | Expr::AggregateFunction { .. } + Expr::AggregateFunction { .. } | Expr::ScalarVariable(_, _) | Expr::Column(_) | Expr::OuterReferenceColumn(_, _) @@ -603,6 +597,7 @@ impl<'a> ConstEvaluator<'a> { Self::volatility_ok(func.signature().volatility) } Expr::Literal(_) + | Expr::Alias(..) | Expr::Unnest(_) | Expr::BinaryExpr { .. } | Expr::Not(_) diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index b0c9ad93e155..264392fc1017 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -834,7 +834,7 @@ query TT explain SELECT t1_id, (SELECT count(*) as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) as cnt from t1 ---- logical_plan -01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) AS _cnt ELSE __scalar_sq_1._cnt END AS cnt +01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1._cnt END AS cnt 02)--Left Join: t1.t1_int = __scalar_sq_1.t2_int 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 @@ -855,7 +855,7 @@ query TT explain SELECT t1_id, (SELECT count(*) + 2 as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) from t1 ---- logical_plan -01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) AS _cnt ELSE __scalar_sq_1._cnt END AS _cnt +01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) ELSE __scalar_sq_1._cnt END AS _cnt 02)--Left Join: t1.t1_int = __scalar_sq_1.t2_int 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 @@ -922,7 +922,7 @@ query TT explain SELECT t1_id, (SELECT count(*) + 2 as cnt_plus_2 FROM t2 WHERE t2.t2_int = t1.t1_int having count(*) = 0) from t1 ---- logical_plan -01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) AS cnt_plus_2 WHEN __scalar_sq_1.count(*) != Int64(0) THEN NULL ELSE __scalar_sq_1.cnt_plus_2 END AS cnt_plus_2 +01)Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) WHEN __scalar_sq_1.count(*) != Int64(0) THEN NULL ELSE __scalar_sq_1.cnt_plus_2 END AS cnt_plus_2 02)--Left Join: t1.t1_int = __scalar_sq_1.t2_int 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 From 8503ecea70bc161c27d833c76339574443f92c52 Mon Sep 17 00:00:00 2001 From: oznur-synnada Date: Wed, 19 
Feb 2025 14:46:10 +0300 Subject: [PATCH 19/71] Create gsoc_project_ideas.md (#14774) * Create gsoc_project_ideas.md Create new page under Contributor Guide for GSoC 2025 * fmt * Update gsoc_project_ideas.md --------- Co-authored-by: berkaysynnada --- .../contributor-guide/gsoc_project_ideas.md | 112 ++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 113 insertions(+) create mode 100644 docs/source/contributor-guide/gsoc_project_ideas.md diff --git a/docs/source/contributor-guide/gsoc_project_ideas.md b/docs/source/contributor-guide/gsoc_project_ideas.md new file mode 100644 index 000000000000..3feaba559a48 --- /dev/null +++ b/docs/source/contributor-guide/gsoc_project_ideas.md @@ -0,0 +1,112 @@ +# GSoC Project Ideas + +## Introduction + +Welcome to the Apache DataFusion Google Summer of Code (GSoC) 2025 project ideas list. Below you can find information about the projects. Please refer to [this page](https://datafusion.apache.org/contributor-guide/gsoc_application_guidelines.html) for application guidelines. + +## Projects + +### [Implement Continuous Monitoring of DataFusion Performance](https://github.com/apache/datafusion/issues/5504) + +- **Description and Outcomes:** DataFusion lacks continuous monitoring of how performance evolves over time -- we do this somewhat manually today. Even though performance has been one of our top priorities for a while now, we didn't build a continuous monitoring system yet. This linked issue contains a summary of all the previous efforts that made us inch closer to having such a system, but a functioning system needs to built on top of that progress. A student successfully completing this project would gain experience in building an end-to-end monitoring system that integrates with GitHub, scheduling/running benchmarks on some sort of a cloud infrastructure, and building a versatile web UI to expose the results. The outcome of this project will benefit Apache DataFusion on an ongoing basis in its quest for ever-more performance. +- **Category:** Tooling +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [alamb](https://github.com/alamb) and [mertak-synnada](https://github.com/mertak-synnada) +- **Skills:** DevOps, Cloud Computing, Web Development, Integrations +- **Expected Project Size:** 175 to 350 hours\* + +### [Supporting Correlated Subqueries](https://github.com/apache/datafusion/issues/5483) + +- **Description and Outcomes:** Correlated subqueries are an important SQL feature that enables some users to express their business logic more intuitively without thinking about "joins". Even though DataFusion has decent join support, it doesn't fully support correlated subqueries. The linked epic contains bite-size pieces of the steps necessary to achieve full support. For students interested in internals of data systems and databases, this project is a good opportunity to apply and/or improve their computer science knowledge. The experience of adding such a feature to a widely-used foundational query engine can also serve as a good opportunity to kickstart a career in the area of databases and data systems. +- **Category:** Core +- **Difficulty:** Advanced +- **Possible Mentor(s) and/or Helper(s):** [jayzhan-synnada](https://github.com/jayzhan-synnada) and [xudong963](https://github.com/xudong963) +- **Skills:** Databases, Algorithms, Data Structures, Testing Techniques +- **Expected Project Size:** 350 hours + +### Improving DataFusion DX (e.g. 
[1](https://github.com/apache/datafusion/issues/9371) and [2](https://github.com/apache/datafusion/issues/14429)) + +- **Description and Outcomes:** While performance, extensibility and customizability is DataFusion's strong aspects, we have much work to do in terms of user-friendliness and ease of debug-ability. This project aims to make strides in these areas by improving terminal visualizations of query plans and increasing the "deployment" of the newly-added diagnostics framework. This project is a potential high-impact project with high output visibility, and reduce the barrier to entry to new users. +- **Category:** DX +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [eliaperantoni](https://github.com/eliaperantoni) and [mkarbo](https://github.com/mkarbo) +- **Skills:** Software Engineering, Terminal Visualizations +- **Expected Project Size:** 175 to 350 hours\* + +### [Robust WASM Support](https://github.com/apache/datafusion/issues/13815) + +- **Description and Outcomes:** DataFusion can be compiled today to WASM with some care. However, it is somewhat tricky and brittle. Having robust WASM support improves the _embeddability_ aspect of DataFusion, and can enable many practical use cases. A good conclusion of this project would be the addition of a live demo sub-page to the DataFusion homepage. +- **Category:** Build +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [alamb](https://github.com/alamb) and [waynexia](https://github.com/waynexia) +- **Skills:** WASM, Advanced Rust, Web Development, Software Engineering +- **Expected Project Size:** 175 to 350 hours\* + +### [High Performance Aggregations](https://github.com/apache/datafusion/issues/7000) + +- **Description and Outcomes:** An aggregation is one of the most fundamental operations within a query engine. Practical performance in many use cases, and results in many well-known benchmarks (e.g. [ClickBench](https://benchmark.clickhouse.com/)), depend heavily on aggregation performance. DataFusion community has been working on improving aggregation performance for a while now, but there is still work to do. A student working on this project will get the chance to hone their skills on high-performance, low(ish) level coding, intricacies of measuring performance, data structures and others. +- **Category:** Core +- **Difficulty:** Advanced +- **Possible Mentor(s) and/or Helper(s):** [jayzhan-synnada](https://github.com/jayzhan-synnada) and [Rachelint](https://github.com/Rachelint) +- **Skills:** Algorithms, Data Structures, Advanced Rust, Databases, Benchmarking Techniques +- **Expected Project Size:** 350 hours + +### [Improving Python Bindings](https://github.com/apache/datafusion-python) + +- **Description and Outcomes:** DataFusion offers Python bindings that enable users to build data systems using Python. However, the Python bindings are still relatively low-level, and do not expose all APIs libraries like [Pandas](https://pandas.pydata.org/) and [Polars](https://pola.rs/) with a end-user focus offer. This project aims to improve DataFusion's Python bindings to make progress towards moving it closer to such libraries in terms of built-in APIs and functionality. 
+- **Category:** Python Bindings +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [timsaucer](https://github.com/timsaucer) +- **Skills:** APIs, FFIs, DataFrame Libraries +- **Expected Project Size:** 175 to 350 hours\* + +### [Optimizing DataFusion Binary Size](https://github.com/apache/datafusion/issues/13816) + +- **Description and Outcomes:** DataFusion is a foundational library with a large feature set. Even though we try to avoid adding too many dependencies and implement many low-level functionalities inside the codebase, the fast moving nature of the project results in an accumulation of dependencies over time. This inflates DataFusion's binary size over time, which reduces portability and embeddability. This project involves a study of the codebase, using compiler tooling, to understand where code bloat comes from, simplifying/reducing the number of dependencies by efficient in-house implementations, and avoiding code duplications. +- **Category:** Core/Build +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [comphead](https://github.com/comphead) and [alamb](https://github.com/alamb) +- **Skills:** Software Engineering, Refactoring, Dependency Management, Compilers +- **Expected Project Size:** 175 to 350 hours\* + +### [Ergonomic SQL Features](https://github.com/apache/datafusion/issues/14514) + +- **Description and Outcomes:** [DuckDB](https://duckdb.org/) has many innovative features that significantly improve the SQL UX. Even though some of those features are already implemented in DataFusion, there are many others we can implement (and get inspiration from). [This page](https://duckdb.org/docs/sql/dialect/friendly_sql.html) contains a good summary of such features. Each such feature will serve as a bite-size, achievable milestone for a cool GSoC project that will have user-facing impact improving the UX on a broad basis. The project will start with a survey of what is already implemented, what is missing, and kick off with a prioritization proposal/implementation plan. +- **Category:** SQL FE +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [berkaysynnada](https://github.com/berkaysynnada) +- **Skills:** SQL, Planning, Parsing, Software Engineering +- **Expected Project Size:** 350 hours + +### [Advanced Interval Analysis](https://github.com/apache/datafusion/issues/14515) + +- **Description and Outcomes:** DataFusion implements interval arithmetic and utilizes it for range estimations, which enables use cases in data pruning, optimizations and statistics. However, the current implementation only works efficiently for forward evaluation; i.e. calculating the output range of an expression given input ranges (ranges of columns). When propagating constraints using the same graph, the current approach requires multiple bottom-up and top-down traversals to narrow column bounds fully. This project aims to fix this deficiency by utilizing a better algorithmic approach. Note that this is a _very advanced_ project for students with a deep interest in computational methods, expression graphs, and constraint solvers. 
+- **Category:** Core +- **Difficulty:** Advanced +- **Possible Mentor(s) and/or Helper(s):** [ozankabak](https://github.com/ozankabak) and [berkaysynnada](https://github.com/berkaysynnada) +- **Skills:** Algorithms, Data Structures, Applied Mathematics, Software Engineering +- **Expected Project Size:** 350 hours + +### [Spark-Compatible Functions Crate](https://github.com/apache/datafusion/issues/5600) + +- **Description and Outcomes:** In general, DataFusion aims to be compatible with PostgreSQL in terms of functions and behaviors. However, there are many users (and downstream projects, such as [DataFusion Comet](https://datafusion.apache.org/comet/)) that desire compatibility with [Apache Spark](https://spark.apache.org/). This project aims to collect Spark-compatible functions into a separate crate to help such users and/or projects. The project will be an exercise in creating the right APIs, explaining how to use them, and then telling the world about them (e.g. via creating a compatibility-tracking page cataloging such functions, writing blog posts etc.). +- **Category:** Extensions +- **Difficulty:** Medium +- **Possible Mentor(s) and/or Helper(s):** [alamb](https://github.com/alamb) and [andygrove](https://github.com/andygrove) +- **Skills:** SQL, Spark, Software Engineering +- **Expected Project Size:** 175 to 350 hours\* + +### [SQL Fuzzing Framework in Rust](https://github.com/apache/datafusion/issues/14535) + +- **Description and Outcomes:** Fuzz testing is a very important technique we utilize often in DataFusion. Having SQL-level fuzz testing enables us to battle-test DataFusion in an end-to-end fashion. Initial version of our fuzzing framework is Java-based, but the time has come to migrate to Rust-native solution. This will simplify the overall implementation (by avoiding things like JDBC), enable us to implement more advanced algorithms for query generation, and attract more contributors over time. This project is a good blend of software engineering, algorithms and testing techniques (i.e. fuzzing techniques). +- **Category:** Extensions +- **Difficulty:** Advanced +- **Possible Mentor(s) and/or Helper(s):** [2010YOUY01](https://github.com/2010YOUY01) +- **Skills:** SQL, Testing Techniques, Advanced Rust, Software Engineering +- **Expected Project Size:** 175 to 350 hours\* + +\*_There is enough material to make this a 350-hour project, but it is granular enough to make it a 175-hour project as well._ + +## Contact Us + +You can join our [mailing list](mailto:dev%40datafusion.apache.org) and [Discord](https://discord.gg/Q9eh6S2T) to introduce yourself and ask questions. diff --git a/docs/source/index.rst b/docs/source/index.rst index 45c4ffafe7f2..d9b0c126ab12 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -150,6 +150,7 @@ To get started, see contributor-guide/inviting contributor-guide/specification/index contributor-guide/gsoc_application_guidelines + contributor-guide/gsoc_project_ideas .. 
_toc.subprojects: From c3fdb1263eff51809072195bcad19db73d79e9e5 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Wed, 19 Feb 2025 20:25:20 +0800 Subject: [PATCH 20/71] `AggregateUDFImpl::window_function_schema_name` and `AggregateUDFImpl::window_function_display_name` for window aggregate function (#14750) * window params * window display * doc * fix --- datafusion-examples/examples/advanced_udwf.rs | 14 +- datafusion/core/src/physical_planner.rs | 31 ++- datafusion/expr/src/expr.rs | 189 +++++++++++------- datafusion/expr/src/expr_fn.rs | 24 ++- datafusion/expr/src/expr_schema.rs | 7 +- datafusion/expr/src/logical_plan/plan.rs | 5 +- datafusion/expr/src/tree_node.rs | 25 ++- datafusion/expr/src/udaf.rs | 138 +++++++++++-- datafusion/expr/src/utils.rs | 4 +- .../src/analyzer/count_wildcard_rule.rs | 4 +- .../optimizer/src/analyzer/type_coercion.rs | 13 +- datafusion/optimizer/src/push_down_filter.rs | 3 +- datafusion/proto/src/logical_plan/to_proto.rs | 15 +- datafusion/sql/src/unparser/expr.rs | 55 ++--- datafusion/sql/src/utils.rs | 16 +- .../substrait/src/logical_plan/consumer.rs | 21 +- .../substrait/src/logical_plan/producer.rs | 15 +- 17 files changed, 394 insertions(+), 185 deletions(-) diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/advanced_udwf.rs index ac326be9cb04..8330e783319d 100644 --- a/datafusion-examples/examples/advanced_udwf.rs +++ b/datafusion-examples/examples/advanced_udwf.rs @@ -26,7 +26,7 @@ use arrow::{ use datafusion::common::ScalarValue; use datafusion::error::Result; use datafusion::functions_aggregate::average::avg_udaf; -use datafusion::logical_expr::expr::WindowFunction; +use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams}; use datafusion::logical_expr::function::{ PartitionEvaluatorArgs, WindowFunctionSimplification, WindowUDFFieldArgs, }; @@ -192,11 +192,13 @@ impl WindowUDFImpl for SimplifySmoothItUdf { let simplify = |window_function: WindowFunction, _: &dyn SimplifyInfo| { Ok(Expr::WindowFunction(WindowFunction { fun: WindowFunctionDefinition::AggregateUDF(avg_udaf()), - args: window_function.args, - partition_by: window_function.partition_by, - order_by: window_function.order_by, - window_frame: window_function.window_frame, - null_treatment: window_function.null_treatment, + params: WindowFunctionParams { + args: window_function.params.args, + partition_by: window_function.params.partition_by, + order_by: window_function.params.order_by, + window_frame: window_function.params.window_frame, + null_treatment: window_function.params.null_treatment, + }, })) }; diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index bce1aab16e5e..d73b7d81536a 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -71,7 +71,7 @@ use datafusion_common::{ use datafusion_expr::dml::{CopyTo, InsertOp}; use datafusion_expr::expr::{ physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, - WindowFunction, + WindowFunction, WindowFunctionParams, }; use datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; @@ -569,16 +569,24 @@ impl DefaultPhysicalPlanner { let get_sort_keys = |expr: &Expr| match expr { Expr::WindowFunction(WindowFunction { - ref partition_by, - ref order_by, + params: + WindowFunctionParams { + ref partition_by, + ref order_by, + .. + }, .. 
}) => generate_sort_key(partition_by, order_by), Expr::Alias(Alias { expr, .. }) => { // Convert &Box to &T match &**expr { Expr::WindowFunction(WindowFunction { - ref partition_by, - ref order_by, + params: + WindowFunctionParams { + ref partition_by, + ref order_by, + .. + }, .. }) => generate_sort_key(partition_by, order_by), _ => unreachable!(), @@ -1509,11 +1517,14 @@ pub fn create_window_expr_with_name( match e { Expr::WindowFunction(WindowFunction { fun, - args, - partition_by, - order_by, - window_frame, - null_treatment, + params: + WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + }, }) => { let physical_args = create_physical_exprs(args, logical_schema, execution_props)?; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 84ff36a9317d..df79b3568ce6 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -819,6 +819,11 @@ impl From> for WindowFunctionDefinition { pub struct WindowFunction { /// Name of the function pub fun: WindowFunctionDefinition, + pub params: WindowFunctionParams, +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +pub struct WindowFunctionParams { /// List of expressions to feed to the functions as arguments pub args: Vec, /// List of partition by expressions @@ -837,11 +842,13 @@ impl WindowFunction { pub fn new(fun: impl Into, args: Vec) -> Self { Self { fun: fun.into(), - args, - partition_by: Vec::default(), - order_by: Vec::default(), - window_frame: WindowFrame::new(None), - null_treatment: None, + params: WindowFunctionParams { + args, + partition_by: Vec::default(), + order_by: Vec::default(), + window_frame: WindowFrame::new(None), + null_treatment: None, + }, } } } @@ -1922,21 +1929,30 @@ impl NormalizeEq for Expr { ( Expr::WindowFunction(WindowFunction { fun: self_fun, - args: self_args, - partition_by: self_partition_by, - order_by: self_order_by, - window_frame: self_window_frame, - null_treatment: self_null_treatment, + params: self_params, }), Expr::WindowFunction(WindowFunction { fun: other_fun, - args: other_args, - partition_by: other_partition_by, - order_by: other_order_by, - window_frame: other_window_frame, - null_treatment: other_null_treatment, + params: other_params, }), ) => { + let ( + WindowFunctionParams { + args: self_args, + window_frame: self_window_frame, + partition_by: self_partition_by, + order_by: self_order_by, + null_treatment: self_null_treatment, + }, + WindowFunctionParams { + args: other_args, + window_frame: other_window_frame, + partition_by: other_partition_by, + order_by: other_order_by, + null_treatment: other_null_treatment, + }, + ) = (self_params, other_params); + self_fun.name() == other_fun.name() && self_window_frame == other_window_frame && self_null_treatment == other_null_treatment @@ -2179,14 +2195,14 @@ impl HashNode for Expr { distinct.hash(state); null_treatment.hash(state); } - Expr::WindowFunction(WindowFunction { - fun, - args: _args, - partition_by: _partition_by, - order_by: _order_by, - window_frame, - null_treatment, - }) => { + Expr::WindowFunction(WindowFunction { fun, params }) => { + let WindowFunctionParams { + args: _args, + partition_by: _, + order_by: _, + window_frame, + null_treatment, + } = params; fun.hash(state); window_frame.hash(state); null_treatment.hash(state); @@ -2467,39 +2483,52 @@ impl Display for SchemaDisplay<'_> { Ok(()) } - Expr::WindowFunction(WindowFunction { - fun, - args, - partition_by, - order_by, - window_frame, - null_treatment, - }) => { - write!( - f, - 
"{}({})", - fun, - schema_name_from_exprs_comma_separated_without_space(args)? - )?; - - if let Some(null_treatment) = null_treatment { - write!(f, " {}", null_treatment)?; + Expr::WindowFunction(WindowFunction { fun, params }) => match fun { + WindowFunctionDefinition::AggregateUDF(fun) => { + match fun.window_function_schema_name(params) { + Ok(name) => { + write!(f, "{name}") + } + Err(e) => { + write!(f, "got error from window_function_schema_name {}", e) + } + } } + _ => { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; - if !partition_by.is_empty() { write!( f, - " PARTITION BY [{}]", - schema_name_from_exprs(partition_by)? + "{}({})", + fun, + schema_name_from_exprs_comma_separated_without_space(args)? )?; - } - if !order_by.is_empty() { - write!(f, " ORDER BY [{}]", schema_name_from_sorts(order_by)?)?; - }; + if let Some(null_treatment) = null_treatment { + write!(f, " {}", null_treatment)?; + } - write!(f, " {window_frame}") - } + if !partition_by.is_empty() { + write!( + f, + " PARTITION BY [{}]", + schema_name_from_exprs(partition_by)? + )?; + } + + if !order_by.is_empty() { + write!(f, " ORDER BY [{}]", schema_name_from_sorts(order_by)?)?; + }; + + write!(f, " {window_frame}") + } + }, } } } @@ -2621,33 +2650,47 @@ impl Display for Expr { // Expr::ScalarFunction(ScalarFunction { func, args }) => { // write!(f, "{}", func.display_name(args).unwrap()) // } - Expr::WindowFunction(WindowFunction { - fun, - args, - partition_by, - order_by, - window_frame, - null_treatment, - }) => { - fmt_function(f, &fun.to_string(), false, args, true)?; - - if let Some(nt) = null_treatment { - write!(f, "{}", nt)?; + Expr::WindowFunction(WindowFunction { fun, params }) => match fun { + WindowFunctionDefinition::AggregateUDF(fun) => { + match fun.window_function_display_name(params) { + Ok(name) => { + write!(f, "{}", name) + } + Err(e) => { + write!(f, "got error from window_function_display_name {}", e) + } + } } + WindowFunctionDefinition::WindowUDF(fun) => { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; - if !partition_by.is_empty() { - write!(f, " PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; - } - if !order_by.is_empty() { - write!(f, " ORDER BY [{}]", expr_vec_fmt!(order_by))?; + fmt_function(f, &fun.to_string(), false, args, true)?; + + if let Some(nt) = null_treatment { + write!(f, "{}", nt)?; + } + + if !partition_by.is_empty() { + write!(f, " PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; + } + if !order_by.is_empty() { + write!(f, " ORDER BY [{}]", expr_vec_fmt!(order_by))?; + } + write!( + f, + " {} BETWEEN {} AND {}", + window_frame.units, + window_frame.start_bound, + window_frame.end_bound + ) } - write!( - f, - " {} BETWEEN {} AND {}", - window_frame.units, window_frame.start_bound, window_frame.end_bound - )?; - Ok(()) - } + }, Expr::AggregateFunction(AggregateFunction { func, params }) => { match func.display_name(params) { Ok(name) => { diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index a0425bf847f7..f47de4a8178f 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -19,7 +19,7 @@ use crate::expr::{ AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery, - Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, + Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, WindowFunctionParams, }; use crate::function::{ AccumulatorArgs, 
AccumulatorFactoryFunction, PartitionEvaluatorFactory, @@ -832,14 +832,22 @@ impl ExprFuncBuilder { udaf.params.null_treatment = null_treatment; Expr::AggregateFunction(udaf) } - ExprFuncKind::Window(mut udwf) => { + ExprFuncKind::Window(WindowFunction { + fun, + params: WindowFunctionParams { args, .. }, + }) => { let has_order_by = order_by.as_ref().map(|o| !o.is_empty()); - udwf.order_by = order_by.unwrap_or_default(); - udwf.partition_by = partition_by.unwrap_or_default(); - udwf.window_frame = - window_frame.unwrap_or(WindowFrame::new(has_order_by)); - udwf.null_treatment = null_treatment; - Expr::WindowFunction(udwf) + Expr::WindowFunction(WindowFunction { + fun, + params: WindowFunctionParams { + args, + partition_by: partition_by.unwrap_or_default(), + order_by: order_by.unwrap_or_default(), + window_frame: window_frame + .unwrap_or(WindowFrame::new(has_order_by)), + null_treatment, + }, + }) } }; diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index becb7c14397d..ce1dd2f34c05 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -19,6 +19,7 @@ use super::{Between, Expr, Like}; use crate::expr::{ AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, + WindowFunctionParams, }; use crate::type_coercion::functions::{ data_types_with_aggregate_udf, data_types_with_scalar_udf, data_types_with_window_udf, @@ -510,7 +511,11 @@ impl Expr { schema: &dyn ExprSchema, window_function: &WindowFunction, ) -> Result<(DataType, bool)> { - let WindowFunction { fun, args, .. } = window_function; + let WindowFunction { + fun, + params: WindowFunctionParams { args, .. }, + .. + } = window_function; let data_types = args .iter() diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index a07da8adde78..870b0751c923 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -30,7 +30,7 @@ use super::invariants::{ }; use super::DdlStatement; use crate::builder::{change_redundant_column, unnest_with_options}; -use crate::expr::{Placeholder, Sort as SortExpr, WindowFunction}; +use crate::expr::{Placeholder, Sort as SortExpr, WindowFunction, WindowFunctionParams}; use crate::expr_rewriter::{ create_col_from_scalar_expr, normalize_cols, normalize_sorts, NamePreserver, }; @@ -2429,8 +2429,7 @@ impl Window { .filter_map(|(idx, expr)| { if let Expr::WindowFunction(WindowFunction { fun: WindowFunctionDefinition::WindowUDF(udwf), - partition_by, - .. + params: WindowFunctionParams { partition_by, .. }, }) = expr { // When there is no PARTITION BY, row number will be unique diff --git a/datafusion/expr/src/tree_node.rs b/datafusion/expr/src/tree_node.rs index 7801d564135e..50af62060346 100644 --- a/datafusion/expr/src/tree_node.rs +++ b/datafusion/expr/src/tree_node.rs @@ -20,7 +20,7 @@ use crate::expr::{ AggregateFunction, AggregateFunctionParams, Alias, Between, BinaryExpr, Case, Cast, GroupingSet, InList, InSubquery, Like, Placeholder, ScalarFunction, TryCast, Unnest, - WindowFunction, + WindowFunction, WindowFunctionParams, }; use crate::{Expr, ExprFunctionExt}; @@ -91,11 +91,11 @@ impl TreeNode for Expr { Expr::AggregateFunction(AggregateFunction { params: AggregateFunctionParams { args, filter, order_by, ..}, .. }) => (args, filter, order_by).apply_ref_elements(f), Expr::WindowFunction(WindowFunction { - args, - partition_by, - order_by, - .. 
- }) => { + params : WindowFunctionParams { + args, + partition_by, + order_by, + ..}, ..}) => { (args, partition_by, order_by).apply_ref_elements(f) } Expr::InList(InList { expr, list, .. }) => { @@ -224,12 +224,15 @@ impl TreeNode for Expr { })? } Expr::WindowFunction(WindowFunction { - args, fun, - partition_by, - order_by, - window_frame, - null_treatment, + params: + WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + }, }) => (args, partition_by, order_by).map_elements(f)?.update_data( |(new_args, new_partition_by, new_order_by)| { Expr::WindowFunction(WindowFunction::new(fun, new_args)) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index bf8f34f949e0..2b9e2bddd184 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -30,8 +30,9 @@ use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue, Statistics} use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use crate::expr::{ - schema_name_from_exprs_comma_separated_without_space, schema_name_from_sorts, - AggregateFunction, AggregateFunctionParams, + schema_name_from_exprs, schema_name_from_exprs_comma_separated_without_space, + schema_name_from_sorts, AggregateFunction, AggregateFunctionParams, + WindowFunctionParams, }; use crate::function::{ AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs, @@ -39,7 +40,7 @@ use crate::function::{ use crate::groups_accumulator::GroupsAccumulator; use crate::utils::format_state_name; use crate::utils::AggregateOrderSensitivity; -use crate::{Accumulator, Expr}; +use crate::{expr_vec_fmt, Accumulator, Expr}; use crate::{Documentation, Signature}; /// Logical representation of a user-defined [aggregate function] (UDAF). @@ -173,11 +174,25 @@ impl AggregateUDF { self.inner.schema_name(params) } + pub fn window_function_schema_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + self.inner.window_function_schema_name(params) + } + /// See [`AggregateUDFImpl::display_name`] for more details. pub fn display_name(&self, params: &AggregateFunctionParams) -> Result { self.inner.display_name(params) } + pub fn window_function_display_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + self.inner.window_function_display_name(params) + } + pub fn is_nullable(&self) -> bool { self.inner.is_nullable() } @@ -436,6 +451,55 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { Ok(schema_name) } + /// Returns the name of the column this expression would create + /// + /// See [`Expr::schema_name`] for details + /// + /// Different from `schema_name` in that it is used for window aggregate function + /// + /// Example of schema_name: count(DISTINCT column1) FILTER (WHERE column2 > 10) [PARTITION BY [..]] [ORDER BY [..]] + fn window_function_schema_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; + + let mut schema_name = String::new(); + schema_name.write_fmt(format_args!( + "{}({})", + self.name(), + schema_name_from_exprs(args)? + ))?; + + if let Some(null_treatment) = null_treatment { + schema_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if !partition_by.is_empty() { + schema_name.write_fmt(format_args!( + " PARTITION BY [{}]", + schema_name_from_exprs(partition_by)? + ))?; + } + + if !order_by.is_empty() { + schema_name.write_fmt(format_args!( + " ORDER BY [{}]", + schema_name_from_sorts(order_by)? 
+ ))?; + }; + + schema_name.write_fmt(format_args!(" {window_frame}"))?; + + Ok(schema_name) + } + /// Returns the user-defined display name of function, given the arguments /// /// This can be used to customize the output column name generated by this @@ -457,10 +521,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { "{}({}{})", self.name(), if *distinct { "DISTINCT " } else { "" }, - args.iter() - .map(|arg| format!("{arg}")) - .collect::>() - .join(", ") + expr_vec_fmt!(args) ))?; if let Some(nt) = null_treatment { @@ -469,19 +530,66 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { if let Some(fe) = filter { schema_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?; } - if let Some(ob) = order_by { - schema_name.write_fmt(format_args!( - " ORDER BY [{}]", - ob.iter() - .map(|o| format!("{o}")) - .collect::>() - .join(", ") - ))?; + if let Some(order_by) = order_by { + schema_name + .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?; } Ok(schema_name) } + /// Returns the user-defined display name of function, given the arguments + /// + /// This can be used to customize the output column name generated by this + /// function. + /// + /// Different from `display_name` in that it is used for window aggregate function + /// + /// Defaults to `function_name([DISTINCT] column1, column2, ..) [null_treatment] [partition by [..]] [order_by [..]]` + fn window_function_display_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; + + let mut display_name = String::new(); + + display_name.write_fmt(format_args!( + "{}({})", + self.name(), + expr_vec_fmt!(args) + ))?; + + if let Some(null_treatment) = null_treatment { + display_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if !partition_by.is_empty() { + display_name.write_fmt(format_args!( + " PARTITION BY [{}]", + expr_vec_fmt!(partition_by) + ))?; + } + + if !order_by.is_empty() { + display_name + .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?; + }; + + display_name.write_fmt(format_args!( + " {} BETWEEN {} AND {}", + window_frame.units, window_frame.start_bound, window_frame.end_bound + ))?; + + Ok(display_name) + } + /// Returns the function's [`Signature`] for information about what input /// types are accepted and the function's Volatility. fn signature(&self) -> &Signature; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 049926fb0bcd..86c0f9ad637c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -22,7 +22,7 @@ use std::collections::{BTreeSet, HashSet}; use std::ops::Deref; use std::sync::Arc; -use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction}; +use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction, WindowFunctionParams}; use crate::expr_rewriter::strip_outer_reference; use crate::{ and, BinaryExpr, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator, @@ -588,7 +588,7 @@ pub fn group_window_expr_by_sort_keys( ) -> Result)>> { let mut result = vec![]; window_expr.into_iter().try_for_each(|expr| match &expr { - Expr::WindowFunction( WindowFunction{ partition_by, order_by, .. }) => { + Expr::WindowFunction( WindowFunction{ params: WindowFunctionParams { partition_by, order_by, ..}, .. 
}) => { let sort_key = generate_sort_key(partition_by, order_by)?; if let Some((_, values)) = result.iter_mut().find( |group: &&mut (WindowSortKey, Vec)| matches!(group, (key, _) if *key == sort_key), diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index 7e73474cf6f5..f517761b1e33 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -60,7 +60,7 @@ fn is_count_star_aggregate(aggregate_function: &AggregateFunction) -> bool { } fn is_count_star_window_aggregate(window_function: &WindowFunction) -> bool { - let args = &window_function.args; + let args = &window_function.params.args; matches!(window_function.fun, WindowFunctionDefinition::AggregateUDF(ref udaf) if udaf.name() == "count" && (args.len() == 1 && is_wildcard(&args[0]) || args.is_empty())) @@ -74,7 +74,7 @@ fn analyze_internal(plan: LogicalPlan) -> Result> { Expr::WindowFunction(mut window_function) if is_count_star_window_aggregate(&window_function) => { - window_function.args = vec![lit(COUNT_STAR_EXPANSION)]; + window_function.params.args = vec![lit(COUNT_STAR_EXPANSION)]; Ok(Transformed::yes(Expr::WindowFunction(window_function))) } Expr::AggregateFunction(mut aggregate_function) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index c7c84dc3d873..d1d491cc7a64 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -533,11 +533,14 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { } Expr::WindowFunction(WindowFunction { fun, - args, - partition_by, - order_by, - window_frame, - null_treatment, + params: + expr::WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + }, }) => { let window_frame = coerce_window_frame(window_frame, self.schema, &order_by)?; diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 1dda1c4c0ea1..c38dd35abd36 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -1003,7 +1003,8 @@ impl OptimizerRule for PushDownFilter { // Therefore, we need to ensure that any potential partition key returned is used in // ALL window functions. Otherwise, filters cannot be pushed by through that column. 
let extract_partition_keys = |func: &WindowFunction| { - func.partition_by + func.params + .partition_by .iter() .map(|c| Column::from_qualified_name(c.schema_name().to_string())) .collect::>() diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 228437271694..5785bc0c4966 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -300,12 +300,15 @@ pub fn serialize_expr( } Expr::WindowFunction(expr::WindowFunction { ref fun, - ref args, - ref partition_by, - ref order_by, - ref window_frame, - // TODO: support null treatment in proto - null_treatment: _, + params: + expr::WindowFunctionParams { + ref args, + ref partition_by, + ref order_by, + ref window_frame, + // TODO: support null treatment in proto + null_treatment: _, + }, }) => { let (window_function, fun_definition) = match fun { WindowFunctionDefinition::AggregateUDF(aggr_udf) => { diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 90630cf82f8e..7c56969d47cd 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::expr::{AggregateFunctionParams, Unnest}; +use datafusion_expr::expr::{AggregateFunctionParams, Unnest, WindowFunctionParams}; use sqlparser::ast::Value::SingleQuotedString; use sqlparser::ast::{ self, Array, BinaryOperator, Expr as AstExpr, Function, Ident, Interval, ObjectName, @@ -189,11 +189,14 @@ impl Unparser<'_> { Expr::Alias(Alias { expr, name: _, .. }) => self.expr_to_sql_inner(expr), Expr::WindowFunction(WindowFunction { fun, - args, - partition_by, - order_by, - window_frame, - null_treatment: _, + params: + WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + .. 
+ }, }) => { let func_name = fun.name(); @@ -1929,30 +1932,34 @@ mod tests { ( Expr::WindowFunction(WindowFunction { fun: WindowFunctionDefinition::WindowUDF(row_number_udwf()), - args: vec![col("col")], - partition_by: vec![], - order_by: vec![], - window_frame: WindowFrame::new(None), - null_treatment: None, + params: WindowFunctionParams { + args: vec![col("col")], + partition_by: vec![], + order_by: vec![], + window_frame: WindowFrame::new(None), + null_treatment: None, + }, }), r#"row_number(col) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)"#, ), ( Expr::WindowFunction(WindowFunction { fun: WindowFunctionDefinition::AggregateUDF(count_udaf()), - args: vec![wildcard()], - partition_by: vec![], - order_by: vec![Sort::new(col("a"), false, true)], - window_frame: WindowFrame::new_bounds( - datafusion_expr::WindowFrameUnits::Range, - datafusion_expr::WindowFrameBound::Preceding( - ScalarValue::UInt32(Some(6)), - ), - datafusion_expr::WindowFrameBound::Following( - ScalarValue::UInt32(Some(2)), + params: WindowFunctionParams { + args: vec![wildcard()], + partition_by: vec![], + order_by: vec![Sort::new(col("a"), false, true)], + window_frame: WindowFrame::new_bounds( + datafusion_expr::WindowFrameUnits::Range, + datafusion_expr::WindowFrameBound::Preceding( + ScalarValue::UInt32(Some(6)), + ), + datafusion_expr::WindowFrameBound::Following( + ScalarValue::UInt32(Some(2)), + ), ), - ), - null_treatment: None, + null_treatment: None, + }, }), r#"count(*) OVER (ORDER BY a DESC NULLS FIRST RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING)"#, ), @@ -2785,7 +2792,7 @@ mod tests { let unparser = Unparser::new(dialect.as_ref()); let func = WindowFunctionDefinition::WindowUDF(rank_udwf()); let mut window_func = WindowFunction::new(func, vec![]); - window_func.order_by = vec![Sort::new(col("a"), true, true)]; + window_func.params.order_by = vec![Sort::new(col("a"), true, true)]; let expr = Expr::WindowFunction(window_func); let ast = unparser.expr_to_sql(&expr)?; diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index a9838ee68c44..3f093afaf26a 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -30,7 +30,9 @@ use datafusion_common::{ HashMap, Result, ScalarValue, }; use datafusion_expr::builder::get_struct_unnested_columns; -use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; +use datafusion_expr::expr::{ + Alias, GroupingSet, Unnest, WindowFunction, WindowFunctionParams, +}; use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs}; use datafusion_expr::{ col, expr_vec_fmt, ColumnUnnestList, Expr, ExprSchemable, LogicalPlan, @@ -240,11 +242,15 @@ pub fn window_expr_common_partition_keys(window_exprs: &[Expr]) -> Result<&[Expr let all_partition_keys = window_exprs .iter() .map(|expr| match expr { - Expr::WindowFunction(WindowFunction { partition_by, .. }) => Ok(partition_by), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { partition_by, .. }, + .. + }) => Ok(partition_by), Expr::Alias(Alias { expr, .. }) => match expr.as_ref() { - Expr::WindowFunction(WindowFunction { partition_by, .. }) => { - Ok(partition_by) - } + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { partition_by, .. }, + .. 
+ }) => Ok(partition_by), expr => exec_err!("Impossibly got non-window expr {expr:?}"), }, expr => exec_err!("Impossibly got non-window expr {expr:?}"), diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 89112e3fe84e..da8613781d69 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -27,7 +27,7 @@ use datafusion::common::{ substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef, TableReference, }; use datafusion::datasource::provider_as_source; -use datafusion::logical_expr::expr::{Exists, InSubquery, Sort}; +use datafusion::logical_expr::expr::{Exists, InSubquery, Sort, WindowFunctionParams}; use datafusion::logical_expr::{ Aggregate, BinaryExpr, Case, Cast, EmptyRelation, Expr, ExprSchemable, Extension, @@ -2223,12 +2223,19 @@ pub async fn from_window_function( Ok(Expr::WindowFunction(expr::WindowFunction { fun, - args: from_substrait_func_args(consumer, &window.arguments, input_schema).await?, - partition_by: from_substrait_rex_vec(consumer, &window.partitions, input_schema) + params: WindowFunctionParams { + args: from_substrait_func_args(consumer, &window.arguments, input_schema) + .await?, + partition_by: from_substrait_rex_vec( + consumer, + &window.partitions, + input_schema, + ) .await?, - order_by, - window_frame, - null_treatment: None, + order_by, + window_frame, + null_treatment: None, + }, })) } @@ -3361,7 +3368,7 @@ mod test { match from_substrait_rex(&consumer, &substrait, &DFSchema::empty()).await? { Expr::WindowFunction(window_function) => { - assert_eq!(window_function.order_by.len(), 1) + assert_eq!(window_function.params.order_by.len(), 1) } _ => panic!("expr was not a WindowFunction"), }; diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index d7cc25d7bf65..36e89b8205ea 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -53,7 +53,7 @@ use datafusion::execution::registry::SerializerRegistry; use datafusion::execution::SessionState; use datafusion::logical_expr::expr::{ AggregateFunctionParams, Alias, BinaryExpr, Case, Cast, GroupingSet, InList, - InSubquery, WindowFunction, + InSubquery, WindowFunction, WindowFunctionParams, }; use datafusion::logical_expr::{expr, Between, JoinConstraint, LogicalPlan, Operator}; use datafusion::prelude::Expr; @@ -1616,11 +1616,14 @@ pub fn from_window_function( ) -> Result { let WindowFunction { fun, - args, - partition_by, - order_by, - window_frame, - null_treatment: _, + params: + WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment: _, + }, } = window_fn; // function reference let function_anchor = producer.register_function(fun.to_string()); From a6a1be21e0c2afd3bbfeb5f9bd947b152323e06e Mon Sep 17 00:00:00 2001 From: Ian Lai <108986288+Chen-Yuan-Lai@users.noreply.github.com> Date: Wed, 19 Feb 2025 21:41:30 +0800 Subject: [PATCH 21/71] chore: migrate crypto functions to invoke_with_args (#14764) * chore: migrate crypto functions to invoke_with_args * fix: fmt --------- Co-authored-by: Cheng-Yuan-Lai Co-authored-by: Ian Lai --- datafusion/functions/src/crypto/digest.rs | 11 ++++------- datafusion/functions/src/crypto/md5.rs | 11 ++++------- datafusion/functions/src/crypto/sha224.rs | 11 ++++------- datafusion/functions/src/crypto/sha256.rs | 11 ++++------- datafusion/functions/src/crypto/sha384.rs | 11 ++++------- 
datafusion/functions/src/crypto/sha512.rs | 11 ++++------- 6 files changed, 24 insertions(+), 42 deletions(-) diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index cc52f32614fd..4f9d4605fe07 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -20,7 +20,8 @@ use super::basic::{digest, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature::*, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature::*, Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -94,12 +95,8 @@ impl ScalarUDFImpl for DigestFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - digest(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + digest(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 636ca65735c9..18ad0d6a7ded 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -20,7 +20,8 @@ use crate::crypto::basic::md5; use arrow::datatypes::DataType; use datafusion_common::{plan_err, Result}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -98,12 +99,8 @@ impl ScalarUDFImpl for Md5Func { } }) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - md5(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + md5(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 341b3495f9c6..24fe5e119df3 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -20,7 +20,8 @@ use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -80,12 +81,8 @@ impl ScalarUDFImpl for SHA224Func { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - sha224(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + sha224(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index f40dd99c59fe..c48dda19cbc5 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -20,7 +20,8 @@ use super::basic::{sha256, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, 
ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -78,12 +79,8 @@ impl ScalarUDFImpl for SHA256Func { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - sha256(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + sha256(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs index e38a755826f8..11d1d130e929 100644 --- a/datafusion/functions/src/crypto/sha384.rs +++ b/datafusion/functions/src/crypto/sha384.rs @@ -20,7 +20,8 @@ use super::basic::{sha384, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -78,12 +79,8 @@ impl ScalarUDFImpl for SHA384Func { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - sha384(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + sha384(&args.args) } fn documentation(&self) -> Option<&Documentation> { diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index 7fe2a26ebbce..26fa85a5da3a 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -20,7 +20,8 @@ use super::basic::{sha512, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -78,12 +79,8 @@ impl ScalarUDFImpl for SHA512Func { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } - fn invoke_batch( - &self, - args: &[ColumnarValue], - _number_rows: usize, - ) -> Result { - sha512(args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + sha512(&args.args) } fn documentation(&self) -> Option<&Documentation> { From ee2d2a4f9615605644d8c8bea1d0e8c84d04bbc7 Mon Sep 17 00:00:00 2001 From: Christian <9384305+ctsk@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:18:18 +0100 Subject: [PATCH 22/71] remove custom extract_ok! macro (#14733) --- .../physical-plan/src/aggregates/row_hash.rs | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index a0251857c272..05122d5a5403 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -632,16 +632,6 @@ pub(crate) fn create_group_accumulator( } } -/// Extracts a successful Ok(_) or returns Poll::Ready(Some(Err(e))) with errors -macro_rules! 
extract_ok { - ($RES: expr) => {{ - match $RES { - Ok(v) => v, - Err(e) => return Poll::Ready(Some(Err(e))), - } - }}; -} - impl Stream for GroupedHashAggregateStream { type Item = Result; @@ -661,7 +651,7 @@ impl Stream for GroupedHashAggregateStream { let input_rows = batch.num_rows(); // Do the grouping - extract_ok!(self.group_aggregate_batch(batch)); + self.group_aggregate_batch(batch)?; self.update_skip_aggregation_probe(input_rows); @@ -673,16 +663,14 @@ impl Stream for GroupedHashAggregateStream { // emit all groups and switch to producing output if self.hit_soft_group_limit() { timer.done(); - extract_ok!(self.set_input_done_and_produce_output()); + self.set_input_done_and_produce_output()?; // make sure the exec_state just set is not overwritten below break 'reading_input; } if let Some(to_emit) = self.group_ordering.emit_to() { timer.done(); - if let Some(batch) = - extract_ok!(self.emit(to_emit, false)) - { + if let Some(batch) = self.emit(to_emit, false)? { self.exec_state = ExecutionState::ProducingOutput(batch); }; @@ -690,9 +678,9 @@ impl Stream for GroupedHashAggregateStream { break 'reading_input; } - extract_ok!(self.emit_early_if_necessary()); + self.emit_early_if_necessary()?; - extract_ok!(self.switch_to_skip_aggregation()); + self.switch_to_skip_aggregation()?; timer.done(); } @@ -703,10 +691,10 @@ impl Stream for GroupedHashAggregateStream { let timer = elapsed_compute.timer(); // Make sure we have enough capacity for `batch`, otherwise spill - extract_ok!(self.spill_previous_if_necessary(&batch)); + self.spill_previous_if_necessary(&batch)?; // Do the grouping - extract_ok!(self.group_aggregate_batch(batch)); + self.group_aggregate_batch(batch)?; // If we can begin emitting rows, do so, // otherwise keep consuming input @@ -716,16 +704,14 @@ impl Stream for GroupedHashAggregateStream { // emit all groups and switch to producing output if self.hit_soft_group_limit() { timer.done(); - extract_ok!(self.set_input_done_and_produce_output()); + self.set_input_done_and_produce_output()?; // make sure the exec_state just set is not overwritten below break 'reading_input; } if let Some(to_emit) = self.group_ordering.emit_to() { timer.done(); - if let Some(batch) = - extract_ok!(self.emit(to_emit, false)) - { + if let Some(batch) = self.emit(to_emit, false)? { self.exec_state = ExecutionState::ProducingOutput(batch); }; @@ -745,7 +731,7 @@ impl Stream for GroupedHashAggregateStream { // Found end from input stream None => { // inner is done, emit all rows and switch to producing output - extract_ok!(self.set_input_done_and_produce_output()); + self.set_input_done_and_produce_output()?; } } } From 8ab0661a39bd69783b31b949e7a768fb518629e7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 19 Feb 2025 13:17:05 -0500 Subject: [PATCH 23/71] feat: Add ScalarUDF support in FFI crate (#14579) * initial commit for scalar udf in ffi crate * Add utility functions for converting back and forth to RResult * License text * There is no need to repeat the trait doc strings * Resolve clippy warning * Add unit tests for ffi scalar udfs * Add license text * Switch over ffi modules to use the new macros for conversion back and forth between result and rresult * Attempting to fix CI based on recommendation to try running clean, but this shouldn't be necessary * Revert "Attempting to fix CI based on recommendation to try running clean, but this shouldn't be necessary" This reverts commit 10248c27f6df3ff741a68824d85c729aa52ddf49. 
* arrow_schema was removed during rebase * Switch from trying to expose the entire type signature to using the user_defined type * Call function to get valid types for scalar udf * Adding documentation * Resolve doctest failure --- datafusion/ffi/src/arrow_wrappers.rs | 13 +- datafusion/ffi/src/execution_plan.rs | 25 +- datafusion/ffi/src/lib.rs | 3 + datafusion/ffi/src/plan_properties.rs | 116 +++--- datafusion/ffi/src/record_batch_stream.rs | 16 +- datafusion/ffi/src/table_provider.rs | 82 ++-- datafusion/ffi/src/tests/mod.rs | 26 +- datafusion/ffi/src/tests/udf_udaf_udwf.rs | 27 ++ datafusion/ffi/src/udf.rs | 351 ++++++++++++++++++ datafusion/ffi/src/util.rs | 135 +++++++ datafusion/ffi/src/volatility.rs | 48 +++ .../{table_provider.rs => ffi_integration.rs} | 77 +++- 12 files changed, 741 insertions(+), 178 deletions(-) create mode 100644 datafusion/ffi/src/tests/udf_udaf_udwf.rs create mode 100644 datafusion/ffi/src/udf.rs create mode 100644 datafusion/ffi/src/util.rs create mode 100644 datafusion/ffi/src/volatility.rs rename datafusion/ffi/tests/{table_provider.rs => ffi_integration.rs} (68%) diff --git a/datafusion/ffi/src/arrow_wrappers.rs b/datafusion/ffi/src/arrow_wrappers.rs index c5add8782c51..a18e6df59bf1 100644 --- a/datafusion/ffi/src/arrow_wrappers.rs +++ b/datafusion/ffi/src/arrow_wrappers.rs @@ -19,8 +19,9 @@ use std::sync::Arc; use abi_stable::StableAbi; use arrow::{ + array::{make_array, ArrayRef}, datatypes::{Schema, SchemaRef}, - ffi::{FFI_ArrowArray, FFI_ArrowSchema}, + ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, }; use log::error; @@ -68,3 +69,13 @@ pub struct WrappedArray { pub schema: WrappedSchema, } + +impl TryFrom for ArrayRef { + type Error = arrow::error::ArrowError; + + fn try_from(value: WrappedArray) -> Result { + let data = unsafe { from_ffi(value.array, &value.schema.0)? }; + + Ok(make_array(data)) + } +} diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 6c5db1218563..8087acfa33c8 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -30,7 +30,8 @@ use datafusion::{ use tokio::runtime::Handle; use crate::{ - plan_properties::FFI_PlanProperties, record_batch_stream::FFI_RecordBatchStream, + df_result, plan_properties::FFI_PlanProperties, + record_batch_stream::FFI_RecordBatchStream, rresult, }; /// A stable struct for sharing a [`ExecutionPlan`] across FFI boundaries. @@ -112,13 +113,11 @@ unsafe extern "C" fn execute_fn_wrapper( let ctx = &(*private_data).context; let runtime = (*private_data).runtime.clone(); - match plan.execute(partition, Arc::clone(ctx)) { - Ok(rbs) => RResult::ROk(FFI_RecordBatchStream::new(rbs, runtime)), - Err(e) => RResult::RErr( - format!("Error occurred during FFI_ExecutionPlan execute: {}", e).into(), - ), - } + rresult!(plan + .execute(partition, Arc::clone(ctx)) + .map(|rbs| FFI_RecordBatchStream::new(rbs, runtime))) } + unsafe extern "C" fn name_fn_wrapper(plan: &FFI_ExecutionPlan) -> RString { let private_data = plan.private_data as *const ExecutionPlanPrivateData; let plan = &(*private_data).plan; @@ -274,16 +273,8 @@ impl ExecutionPlan for ForeignExecutionPlan { _context: Arc, ) -> Result { unsafe { - match (self.plan.execute)(&self.plan, partition) { - RResult::ROk(stream) => { - let stream = Pin::new(Box::new(stream)) as SendableRecordBatchStream; - Ok(stream) - } - RResult::RErr(e) => Err(DataFusionError::Execution(format!( - "Error occurred during FFI call to FFI_ExecutionPlan execute. 
{}", - e - ))), - } + df_result!((self.plan.execute)(&self.plan, partition)) + .map(|stream| Pin::new(Box::new(stream)) as SendableRecordBatchStream) } } } diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index b25528234773..bbcdd85ff80a 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -26,6 +26,9 @@ pub mod record_batch_stream; pub mod session_config; pub mod table_provider; pub mod table_source; +pub mod udf; +pub mod util; +pub mod volatility; #[cfg(feature = "integration-tests")] pub mod tests; diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 3c7bc886aede..3592c16b8fab 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -19,8 +19,8 @@ use std::{ffi::c_void, sync::Arc}; use abi_stable::{ std_types::{ - RResult::{self, RErr, ROk}, - RStr, RVec, + RResult::{self, ROk}, + RString, RVec, }, StableAbi, }; @@ -44,7 +44,7 @@ use datafusion_proto::{ }; use prost::Message; -use crate::arrow_wrappers::WrappedSchema; +use crate::{arrow_wrappers::WrappedSchema, df_result, rresult_return}; /// A stable struct for sharing [`PlanProperties`] across FFI boundaries. #[repr(C)] @@ -54,7 +54,7 @@ pub struct FFI_PlanProperties { /// The output partitioning is a [`Partitioning`] protobuf message serialized /// into bytes to pass across the FFI boundary. pub output_partitioning: - unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, + unsafe extern "C" fn(plan: &Self) -> RResult, RString>, /// Return the emission type of the plan. pub emission_type: unsafe extern "C" fn(plan: &Self) -> FFI_EmissionType, @@ -64,8 +64,7 @@ pub struct FFI_PlanProperties { /// The output ordering is a [`PhysicalSortExprNodeCollection`] protobuf message /// serialized into bytes to pass across the FFI boundary. - pub output_ordering: - unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, + pub output_ordering: unsafe extern "C" fn(plan: &Self) -> RResult, RString>, /// Return the schema of the plan. 
pub schema: unsafe extern "C" fn(plan: &Self) -> WrappedSchema, @@ -84,21 +83,13 @@ struct PlanPropertiesPrivateData { unsafe extern "C" fn output_partitioning_fn_wrapper( properties: &FFI_PlanProperties, -) -> RResult, RStr<'static>> { +) -> RResult, RString> { let private_data = properties.private_data as *const PlanPropertiesPrivateData; let props = &(*private_data).props; let codec = DefaultPhysicalExtensionCodec {}; let partitioning_data = - match serialize_partitioning(props.output_partitioning(), &codec) { - Ok(p) => p, - Err(_) => { - return RErr( - "unable to serialize output_partitioning in FFI_PlanProperties" - .into(), - ) - } - }; + rresult_return!(serialize_partitioning(props.output_partitioning(), &codec)); let output_partitioning = partitioning_data.encode_to_vec(); ROk(output_partitioning.into()) @@ -122,31 +113,24 @@ unsafe extern "C" fn boundedness_fn_wrapper( unsafe extern "C" fn output_ordering_fn_wrapper( properties: &FFI_PlanProperties, -) -> RResult, RStr<'static>> { +) -> RResult, RString> { let private_data = properties.private_data as *const PlanPropertiesPrivateData; let props = &(*private_data).props; let codec = DefaultPhysicalExtensionCodec {}; - let output_ordering = - match props.output_ordering() { - Some(ordering) => { - let physical_sort_expr_nodes = - match serialize_physical_sort_exprs(ordering.to_owned(), &codec) { - Ok(v) => v, - Err(_) => return RErr( - "unable to serialize output_ordering in FFI_PlanProperties" - .into(), - ), - }; - - let ordering_data = PhysicalSortExprNodeCollection { - physical_sort_expr_nodes, - }; - - ordering_data.encode_to_vec() - } - None => Vec::default(), - }; + let output_ordering = match props.output_ordering() { + Some(ordering) => { + let physical_sort_expr_nodes = rresult_return!( + serialize_physical_sort_exprs(ordering.to_owned(), &codec) + ); + let ordering_data = PhysicalSortExprNodeCollection { + physical_sort_expr_nodes, + }; + + ordering_data.encode_to_vec() + } + None => Vec::default(), + }; ROk(output_ordering.into()) } @@ -200,40 +184,32 @@ impl TryFrom for PlanProperties { let codex = DefaultPhysicalExtensionCodec {}; let ffi_orderings = unsafe { (ffi_props.output_ordering)(&ffi_props) }; - let orderings = match ffi_orderings { - ROk(ordering_vec) => { - let proto_output_ordering = - PhysicalSortExprNodeCollection::decode(ordering_vec.as_ref()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - Some(parse_physical_sort_exprs( - &proto_output_ordering.physical_sort_expr_nodes, - &default_ctx, - &schema, - &codex, - )?) - } - RErr(e) => return Err(DataFusionError::Plan(e.to_string())), - }; - let ffi_partitioning = unsafe { (ffi_props.output_partitioning)(&ffi_props) }; - let partitioning = match ffi_partitioning { - ROk(partitioning_vec) => { - let proto_output_partitioning = - Partitioning::decode(partitioning_vec.as_ref()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - parse_protobuf_partitioning( - Some(&proto_output_partitioning), - &default_ctx, - &schema, - &codex, - )? 
- .ok_or(DataFusionError::Plan( - "Unable to deserialize partitioning protobuf in FFI_PlanProperties" - .to_string(), - )) - } - RErr(e) => Err(DataFusionError::Plan(e.to_string())), - }?; + let proto_output_ordering = + PhysicalSortExprNodeCollection::decode(df_result!(ffi_orderings)?.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let orderings = Some(parse_physical_sort_exprs( + &proto_output_ordering.physical_sort_expr_nodes, + &default_ctx, + &schema, + &codex, + )?); + + let partitioning_vec = + unsafe { df_result!((ffi_props.output_partitioning)(&ffi_props))? }; + let proto_output_partitioning = + Partitioning::decode(partitioning_vec.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let partitioning = parse_protobuf_partitioning( + Some(&proto_output_partitioning), + &default_ctx, + &schema, + &codex, + )? + .ok_or(DataFusionError::Plan( + "Unable to deserialize partitioning protobuf in FFI_PlanProperties" + .to_string(), + ))?; let eq_properties = match orderings { Some(ordering) => { diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index 466ce247678a..939c4050028c 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -35,7 +35,10 @@ use datafusion::{ use futures::{Stream, TryStreamExt}; use tokio::runtime::Handle; -use crate::arrow_wrappers::{WrappedArray, WrappedSchema}; +use crate::{ + arrow_wrappers::{WrappedArray, WrappedSchema}, + rresult, +}; /// A stable struct for sharing [`RecordBatchStream`] across FFI boundaries. /// We use the async-ffi crate for handling async calls across libraries. @@ -97,13 +100,12 @@ fn record_batch_to_wrapped_array( record_batch: RecordBatch, ) -> RResult { let struct_array = StructArray::from(record_batch); - match to_ffi(&struct_array.to_data()) { - Ok((array, schema)) => RResult::ROk(WrappedArray { + rresult!( + to_ffi(&struct_array.to_data()).map(|(array, schema)| WrappedArray { array, - schema: WrappedSchema(schema), - }), - Err(e) => RResult::RErr(e.to_string().into()), - } + schema: WrappedSchema(schema) + }) + ) } // probably want to use pub unsafe fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result { diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 978ac10206bd..0b4080abcb55 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -44,6 +44,7 @@ use tokio::runtime::Handle; use crate::{ arrow_wrappers::WrappedSchema, + df_result, rresult_return, session_config::ForeignSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}, }; @@ -233,10 +234,7 @@ unsafe extern "C" fn scan_fn_wrapper( let runtime = &(*private_data).runtime; async move { - let config = match ForeignSessionConfig::try_from(&session_config) { - Ok(c) => c, - Err(e) => return RResult::RErr(e.to_string().into()), - }; + let config = rresult_return!(ForeignSessionConfig::try_from(&session_config)); let session = SessionStateBuilder::new() .with_default_features() .with_config(config.0) @@ -250,15 +248,13 @@ unsafe extern "C" fn scan_fn_wrapper( let codec = DefaultLogicalExtensionCodec {}; let proto_filters = - match LogicalExprList::decode(filters_serialized.as_ref()) { - Ok(f) => f, - Err(e) => return RResult::RErr(e.to_string().into()), - }; - - match parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec) { - Ok(f) => f, - Err(e) => return RResult::RErr(e.to_string().into()), - } + 
rresult_return!(LogicalExprList::decode(filters_serialized.as_ref())); + + rresult_return!(parse_exprs( + proto_filters.expr.iter(), + &default_ctx, + &codec + )) } }; @@ -268,13 +264,11 @@ unsafe extern "C" fn scan_fn_wrapper( false => Some(&projections), }; - let plan = match internal_provider - .scan(&ctx.state(), maybe_projections, &filters, limit.into()) - .await - { - Ok(p) => p, - Err(e) => return RResult::RErr(e.to_string().into()), - }; + let plan = rresult_return!( + internal_provider + .scan(&ctx.state(), maybe_projections, &filters, limit.into()) + .await + ); RResult::ROk(FFI_ExecutionPlan::new( plan, @@ -298,30 +292,22 @@ unsafe extern "C" fn insert_into_fn_wrapper( let runtime = &(*private_data).runtime; async move { - let config = match ForeignSessionConfig::try_from(&session_config) { - Ok(c) => c, - Err(e) => return RResult::RErr(e.to_string().into()), - }; + let config = rresult_return!(ForeignSessionConfig::try_from(&session_config)); let session = SessionStateBuilder::new() .with_default_features() .with_config(config.0) .build(); let ctx = SessionContext::new_with_state(session); - let input = match ForeignExecutionPlan::try_from(&input) { - Ok(input) => Arc::new(input), - Err(e) => return RResult::RErr(e.to_string().into()), - }; + let input = rresult_return!(ForeignExecutionPlan::try_from(&input).map(Arc::new)); let insert_op = InsertOp::from(insert_op); - let plan = match internal_provider - .insert_into(&ctx.state(), input, insert_op) - .await - { - Ok(p) => p, - Err(e) => return RResult::RErr(e.to_string().into()), - }; + let plan = rresult_return!( + internal_provider + .insert_into(&ctx.state(), input, insert_op) + .await + ); RResult::ROk(FFI_ExecutionPlan::new( plan, @@ -456,14 +442,7 @@ impl TableProvider for ForeignTableProvider { ) .await; - match maybe_plan { - RResult::ROk(p) => ForeignExecutionPlan::try_from(&p)?, - RResult::RErr(_) => { - return Err(DataFusionError::Internal( - "Unable to perform scan via FFI".to_string(), - )) - } - } + ForeignExecutionPlan::try_from(&df_result!(maybe_plan)?)? }; Ok(Arc::new(plan)) @@ -493,12 +472,9 @@ impl TableProvider for ForeignTableProvider { }; let serialized_filters = expr_list.encode_to_vec(); - let pushdowns = pushdown_fn(&self.0, serialized_filters.into()); + let pushdowns = df_result!(pushdown_fn(&self.0, serialized_filters.into()))?; - match pushdowns { - RResult::ROk(p) => Ok(p.iter().map(|v| v.into()).collect()), - RResult::RErr(e) => Err(DataFusionError::Plan(e.to_string())), - } + Ok(pushdowns.iter().map(|v| v.into()).collect()) } } @@ -519,15 +495,7 @@ impl TableProvider for ForeignTableProvider { let maybe_plan = (self.0.insert_into)(&self.0, &session_config, &input, insert_op).await; - match maybe_plan { - RResult::ROk(p) => ForeignExecutionPlan::try_from(&p)?, - RResult::RErr(e) => { - return Err(DataFusionError::Internal(format!( - "Unable to perform insert_into via FFI: {}", - e - ))) - } - } + ForeignExecutionPlan::try_from(&df_result!(maybe_plan)?)? 
}; Ok(Arc::new(plan)) diff --git a/datafusion/ffi/src/tests/mod.rs b/datafusion/ffi/src/tests/mod.rs index a5fc74b840d1..5a471cb8fe43 100644 --- a/datafusion/ffi/src/tests/mod.rs +++ b/datafusion/ffi/src/tests/mod.rs @@ -26,7 +26,7 @@ use abi_stable::{ StableAbi, }; -use super::table_provider::FFI_TableProvider; +use super::{table_provider::FFI_TableProvider, udf::FFI_ScalarUDF}; use arrow::array::RecordBatch; use async_provider::create_async_table_provider; use datafusion::{ @@ -34,27 +34,30 @@ use datafusion::{ common::record_batch, }; use sync_provider::create_sync_table_provider; +use udf_udaf_udwf::create_ffi_abs_func; mod async_provider; mod sync_provider; +mod udf_udaf_udwf; #[repr(C)] #[derive(StableAbi)] -#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))] +#[sabi(kind(Prefix(prefix_ref = ForeignLibraryModuleRef)))] /// This struct defines the module interfaces. It is to be shared by /// both the module loading program and library that implements the -/// module. It is possible to move this definition into the loading -/// program and reference it in the modules, but this example shows -/// how a user may wish to separate these concerns. -pub struct TableProviderModule { +/// module. +pub struct ForeignLibraryModule { /// Constructs the table provider pub create_table: extern "C" fn(synchronous: bool) -> FFI_TableProvider, + /// Create a scalar UDF + pub create_scalar_udf: extern "C" fn() -> FFI_ScalarUDF, + pub version: extern "C" fn() -> u64, } -impl RootModule for TableProviderModuleRef { - declare_root_module_statics! {TableProviderModuleRef} +impl RootModule for ForeignLibraryModuleRef { + declare_root_module_statics! {ForeignLibraryModuleRef} const BASE_NAME: &'static str = "datafusion_ffi"; const NAME: &'static str = "datafusion_ffi"; const VERSION_STRINGS: VersionStrings = package_version_strings!(); @@ -64,7 +67,7 @@ impl RootModule for TableProviderModuleRef { } } -fn create_test_schema() -> Arc { +pub fn create_test_schema() -> Arc { Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Float64, true), @@ -90,9 +93,10 @@ extern "C" fn construct_table_provider(synchronous: bool) -> FFI_TableProvider { #[export_root_module] /// This defines the entry point for using the module. -pub fn get_simple_memory_table() -> TableProviderModuleRef { - TableProviderModule { +pub fn get_foreign_library_module() -> ForeignLibraryModuleRef { + ForeignLibraryModule { create_table: construct_table_provider, + create_scalar_udf: create_ffi_abs_func, version: super::version, } .leak_into_prefix() diff --git a/datafusion/ffi/src/tests/udf_udaf_udwf.rs b/datafusion/ffi/src/tests/udf_udaf_udwf.rs new file mode 100644 index 000000000000..e8a13aac1308 --- /dev/null +++ b/datafusion/ffi/src/tests/udf_udaf_udwf.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::udf::FFI_ScalarUDF; +use datafusion::{functions::math::abs::AbsFunc, logical_expr::ScalarUDF}; + +use std::sync::Arc; + +pub(crate) extern "C" fn create_ffi_abs_func() -> FFI_ScalarUDF { + let udf: Arc = Arc::new(AbsFunc::new().into()); + + udf.into() +} diff --git a/datafusion/ffi/src/udf.rs b/datafusion/ffi/src/udf.rs new file mode 100644 index 000000000000..bbc9cf936cee --- /dev/null +++ b/datafusion/ffi/src/udf.rs @@ -0,0 +1,351 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ffi::c_void, sync::Arc}; + +use abi_stable::{ + std_types::{RResult, RString, RVec}, + StableAbi, +}; +use arrow::datatypes::DataType; +use arrow::{ + array::ArrayRef, + error::ArrowError, + ffi::{from_ffi, to_ffi, FFI_ArrowSchema}, +}; +use datafusion::{ + error::DataFusionError, + logical_expr::type_coercion::functions::data_types_with_scalar_udf, +}; +use datafusion::{ + error::Result, + logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, + }, +}; + +use crate::{ + arrow_wrappers::{WrappedArray, WrappedSchema}, + df_result, rresult, rresult_return, + util::{rvec_wrapped_to_vec_datatype, vec_datatype_to_rvec_wrapped}, + volatility::FFI_Volatility, +}; + +/// A stable struct for sharing a [`ScalarUDF`] across FFI boundaries. +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(non_camel_case_types)] +pub struct FFI_ScalarUDF { + /// FFI equivalent to the `name` of a [`ScalarUDF`] + pub name: RString, + + /// FFI equivalent to the `aliases` of a [`ScalarUDF`] + pub aliases: RVec, + + /// FFI equivalent to the `volatility` of a [`ScalarUDF`] + pub volatility: FFI_Volatility, + + /// Determines the return type of the underlying [`ScalarUDF`] based on the + /// argument types. + pub return_type: unsafe extern "C" fn( + udf: &Self, + arg_types: RVec, + ) -> RResult, + + /// Execute the underlying [`ScalarUDF`] and return the result as a `FFI_ArrowArray` + /// within an AbiStable wrapper. + pub invoke_with_args: unsafe extern "C" fn( + udf: &Self, + args: RVec, + num_rows: usize, + return_type: WrappedSchema, + ) -> RResult, + + /// See [`ScalarUDFImpl`] for details on short_circuits + pub short_circuits: bool, + + /// Performs type coersion. To simply this interface, all UDFs are treated as having + /// user defined signatures, which will in turn call coerce_types to be called. This + /// call should be transparent to most users as the internal function performs the + /// appropriate calls on the underlying [`ScalarUDF`] + pub coerce_types: unsafe extern "C" fn( + udf: &Self, + arg_types: RVec, + ) -> RResult, RString>, + + /// Used to create a clone on the provider of the udf. 
This should + /// only need to be called by the receiver of the udf. + pub clone: unsafe extern "C" fn(udf: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. + pub release: unsafe extern "C" fn(udf: &mut Self), + + /// Internal data. This is only to be accessed by the provider of the udf. + /// A [`ForeignScalarUDF`] should never attempt to access this data. + pub private_data: *mut c_void, +} + +unsafe impl Send for FFI_ScalarUDF {} +unsafe impl Sync for FFI_ScalarUDF {} + +pub struct ScalarUDFPrivateData { + pub udf: Arc, +} + +unsafe extern "C" fn return_type_fn_wrapper( + udf: &FFI_ScalarUDF, + arg_types: RVec, +) -> RResult { + let private_data = udf.private_data as *const ScalarUDFPrivateData; + let udf = &(*private_data).udf; + + let arg_types = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types)); + + let return_type = udf + .return_type(&arg_types) + .and_then(|v| FFI_ArrowSchema::try_from(v).map_err(DataFusionError::from)) + .map(WrappedSchema); + + rresult!(return_type) +} + +unsafe extern "C" fn coerce_types_fn_wrapper( + udf: &FFI_ScalarUDF, + arg_types: RVec, +) -> RResult, RString> { + let private_data = udf.private_data as *const ScalarUDFPrivateData; + let udf = &(*private_data).udf; + + let arg_types = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types)); + + let return_types = rresult_return!(data_types_with_scalar_udf(&arg_types, udf)); + + rresult!(vec_datatype_to_rvec_wrapped(&return_types)) +} + +unsafe extern "C" fn invoke_with_args_fn_wrapper( + udf: &FFI_ScalarUDF, + args: RVec, + number_rows: usize, + return_type: WrappedSchema, +) -> RResult { + let private_data = udf.private_data as *const ScalarUDFPrivateData; + let udf = &(*private_data).udf; + + let args = args + .into_iter() + .map(|arr| { + from_ffi(arr.array, &arr.schema.0) + .map(|v| ColumnarValue::Array(arrow::array::make_array(v))) + }) + .collect::>(); + + let args = rresult_return!(args); + let return_type = rresult_return!(DataType::try_from(&return_type.0)); + + let args = ScalarFunctionArgs { + args, + number_rows, + return_type: &return_type, + }; + + let result = rresult_return!(udf + .invoke_with_args(args) + .and_then(|r| r.to_array(number_rows))); + + let (result_array, result_schema) = rresult_return!(to_ffi(&result.to_data())); + + RResult::ROk(WrappedArray { + array: result_array, + schema: WrappedSchema(result_schema), + }) +} + +unsafe extern "C" fn release_fn_wrapper(udf: &mut FFI_ScalarUDF) { + let private_data = Box::from_raw(udf.private_data as *mut ScalarUDFPrivateData); + drop(private_data); +} + +unsafe extern "C" fn clone_fn_wrapper(udf: &FFI_ScalarUDF) -> FFI_ScalarUDF { + let private_data = udf.private_data as *const ScalarUDFPrivateData; + let udf_data = &(*private_data); + + Arc::clone(&udf_data.udf).into() +} + +impl Clone for FFI_ScalarUDF { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } + } +} + +impl From> for FFI_ScalarUDF { + fn from(udf: Arc) -> Self { + let name = udf.name().into(); + let aliases = udf.aliases().iter().map(|a| a.to_owned().into()).collect(); + let volatility = udf.signature().volatility.into(); + let short_circuits = udf.short_circuits(); + + let private_data = Box::new(ScalarUDFPrivateData { udf }); + + Self { + name, + aliases, + volatility, + short_circuits, + invoke_with_args: invoke_with_args_fn_wrapper, + return_type: return_type_fn_wrapper, + coerce_types: coerce_types_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + private_data: 
Box::into_raw(private_data) as *mut c_void, + } + } +} + +impl Drop for FFI_ScalarUDF { + fn drop(&mut self) { + unsafe { (self.release)(self) } + } +} + +/// This struct is used to access an UDF provided by a foreign +/// library across a FFI boundary. +/// +/// The ForeignScalarUDF is to be used by the caller of the UDF, so it has +/// no knowledge or access to the private data. All interaction with the UDF +/// must occur through the functions defined in FFI_ScalarUDF. +#[derive(Debug)] +pub struct ForeignScalarUDF { + name: String, + aliases: Vec, + udf: FFI_ScalarUDF, + signature: Signature, +} + +unsafe impl Send for ForeignScalarUDF {} +unsafe impl Sync for ForeignScalarUDF {} + +impl TryFrom<&FFI_ScalarUDF> for ForeignScalarUDF { + type Error = DataFusionError; + + fn try_from(udf: &FFI_ScalarUDF) -> Result { + let name = udf.name.to_owned().into(); + let signature = Signature::user_defined((&udf.volatility).into()); + + let aliases = udf.aliases.iter().map(|s| s.to_string()).collect(); + + Ok(Self { + name, + udf: udf.clone(), + aliases, + signature, + }) + } +} + +impl ScalarUDFImpl for ForeignScalarUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let arg_types = vec_datatype_to_rvec_wrapped(arg_types)?; + + let result = unsafe { (self.udf.return_type)(&self.udf, arg_types) }; + + let result = df_result!(result); + + result.and_then(|r| (&r.0).try_into().map_err(DataFusionError::from)) + } + + fn invoke_with_args(&self, invoke_args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { + args, + number_rows, + return_type, + } = invoke_args; + + let args = args + .into_iter() + .map(|v| v.to_array(number_rows)) + .collect::>>()? + .into_iter() + .map(|v| { + to_ffi(&v.to_data()).map(|(ffi_array, ffi_schema)| WrappedArray { + array: ffi_array, + schema: WrappedSchema(ffi_schema), + }) + }) + .collect::, ArrowError>>()? + .into(); + + let return_type = WrappedSchema(FFI_ArrowSchema::try_from(return_type)?); + + let result = unsafe { + (self.udf.invoke_with_args)(&self.udf, args, number_rows, return_type) + }; + + let result = df_result!(result)?; + let result_array: ArrayRef = result.try_into()?; + + Ok(ColumnarValue::Array(result_array)) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn short_circuits(&self) -> bool { + self.udf.short_circuits + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + unsafe { + let arg_types = vec_datatype_to_rvec_wrapped(arg_types)?; + let result_types = df_result!((self.udf.coerce_types)(&self.udf, arg_types))?; + Ok(rvec_wrapped_to_vec_datatype(&result_types)?) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_round_trip_scalar_udf() -> Result<()> { + let original_udf = datafusion::functions::math::abs::AbsFunc::new(); + let original_udf = Arc::new(ScalarUDF::from(original_udf)); + + let local_udf: FFI_ScalarUDF = Arc::clone(&original_udf).into(); + + let foreign_udf: ForeignScalarUDF = (&local_udf).try_into()?; + + assert!(original_udf.name() == foreign_udf.name()); + + Ok(()) + } +} diff --git a/datafusion/ffi/src/util.rs b/datafusion/ffi/src/util.rs new file mode 100644 index 000000000000..9d5f2aefe324 --- /dev/null +++ b/datafusion/ffi/src/util.rs @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::std_types::RVec;
+use arrow::{datatypes::DataType, ffi::FFI_ArrowSchema};
+
+use crate::arrow_wrappers::WrappedSchema;
+
+/// This macro is a helpful conversion utility to convert from an abi_stable::RResult to a
+/// DataFusion result.
+#[macro_export]
+macro_rules! df_result {
+    ( $x:expr ) => {
+        match $x {
+            abi_stable::std_types::RResult::ROk(v) => Ok(v),
+            abi_stable::std_types::RResult::RErr(e) => {
+                Err(datafusion::error::DataFusionError::Execution(e.to_string()))
+            }
+        }
+    };
+}
+
+/// This macro is a helpful conversion utility to convert from a DataFusion Result to an abi_stable::RResult
+#[macro_export]
+macro_rules! rresult {
+    ( $x:expr ) => {
+        match $x {
+            Ok(v) => abi_stable::std_types::RResult::ROk(v),
+            Err(e) => abi_stable::std_types::RResult::RErr(
+                abi_stable::std_types::RString::from(e.to_string()),
+            ),
+        }
+    };
+}
+
+/// This macro is a helpful conversion utility to convert from a DataFusion Result to an abi_stable::RResult
+/// and to also return early when it is an error. Since you cannot use `?` on an RResult, this is designed
+/// to mimic the pattern.
+#[macro_export]
+macro_rules! rresult_return {
+    ( $x:expr ) => {
+        match $x {
+            Ok(v) => v,
+            Err(e) => {
+                return abi_stable::std_types::RResult::RErr(
+                    abi_stable::std_types::RString::from(e.to_string()),
+                )
+            }
+        }
+    };
+}
+
+/// This is a utility function to convert a slice of [`DataType`] to its equivalent
+/// FFI friendly counterpart, [`WrappedSchema`]
+pub fn vec_datatype_to_rvec_wrapped(
+    data_types: &[DataType],
+) -> Result<RVec<WrappedSchema>, arrow::error::ArrowError> {
+    Ok(data_types
+        .iter()
+        .map(FFI_ArrowSchema::try_from)
+        .collect::<Result<Vec<_>, arrow::error::ArrowError>>()?
+        .into_iter()
+        .map(WrappedSchema)
+        .collect())
+}
+
+/// This is a utility function to convert an FFI friendly vector of [`WrappedSchema`]
+/// to their equivalent [`DataType`].
+pub fn rvec_wrapped_to_vec_datatype( + data_types: &RVec, +) -> Result, arrow::error::ArrowError> { + data_types + .iter() + .map(|d| DataType::try_from(&d.0)) + .collect() +} + +#[cfg(test)] +mod tests { + use abi_stable::std_types::{RResult, RString}; + use datafusion::error::DataFusionError; + + fn wrap_result(result: Result) -> RResult { + RResult::ROk(rresult_return!(result)) + } + + #[test] + fn test_conversion() { + const VALID_VALUE: &str = "valid_value"; + const ERROR_VALUE: &str = "error_value"; + + let ok_r_result: RResult = + RResult::ROk(VALID_VALUE.to_string().into()); + let err_r_result: RResult = + RResult::RErr(ERROR_VALUE.to_string().into()); + + let returned_ok_result = df_result!(ok_r_result); + assert!(returned_ok_result.is_ok()); + assert!(returned_ok_result.unwrap().to_string() == VALID_VALUE); + + let returned_err_result = df_result!(err_r_result); + assert!(returned_err_result.is_err()); + assert!( + returned_err_result.unwrap_err().to_string() + == format!("Execution error: {}", ERROR_VALUE) + ); + + let ok_result: Result = Ok(VALID_VALUE.to_string()); + let err_result: Result = + Err(DataFusionError::Execution(ERROR_VALUE.to_string())); + + let returned_ok_r_result = wrap_result(ok_result); + assert!(returned_ok_r_result == RResult::ROk(VALID_VALUE.into())); + + let returned_err_r_result = wrap_result(err_result); + assert!( + returned_err_r_result + == RResult::RErr(format!("Execution error: {}", ERROR_VALUE).into()) + ); + } +} diff --git a/datafusion/ffi/src/volatility.rs b/datafusion/ffi/src/volatility.rs new file mode 100644 index 000000000000..8b565b91b76d --- /dev/null +++ b/datafusion/ffi/src/volatility.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
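As a rough illustration (not part of the committed patch), the `rresult_return!` macro defined above stands in for `?` inside an `extern "C"` function that must return an abi_stable `RResult`; the `parse_count` helper and function name below are assumptions made only for this sketch.

use abi_stable::std_types::{RResult, RString};
use datafusion_ffi::rresult_return;

// Hypothetical fallible helper, used only for this sketch.
fn parse_count(input: &str) -> Result<u64, std::num::ParseIntError> {
    input.parse::<u64>()
}

extern "C" fn count_plus_one(input: &RString) -> RResult<u64, RString> {
    // `?` cannot be used here because the return type is an RResult, so the
    // macro converts the error into an RString and returns early instead.
    let count = rresult_return!(parse_count(input.as_str()));
    RResult::ROk(count + 1)
}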
+ +use abi_stable::StableAbi; +use datafusion::logical_expr::Volatility; + +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(non_camel_case_types)] +pub enum FFI_Volatility { + Immutable, + Stable, + Volatile, +} + +impl From for FFI_Volatility { + fn from(value: Volatility) -> Self { + match value { + Volatility::Immutable => Self::Immutable, + Volatility::Stable => Self::Stable, + Volatility::Volatile => Self::Volatile, + } + } +} + +impl From<&FFI_Volatility> for Volatility { + fn from(value: &FFI_Volatility) -> Self { + match value { + FFI_Volatility::Immutable => Self::Immutable, + FFI_Volatility::Stable => Self::Stable, + FFI_Volatility::Volatile => Self::Volatile, + } + } +} diff --git a/datafusion/ffi/tests/table_provider.rs b/datafusion/ffi/tests/ffi_integration.rs similarity index 68% rename from datafusion/ffi/tests/table_provider.rs rename to datafusion/ffi/tests/ffi_integration.rs index 9169c9f4221c..84e120df4299 100644 --- a/datafusion/ffi/tests/table_provider.rs +++ b/datafusion/ffi/tests/ffi_integration.rs @@ -21,10 +21,13 @@ mod tests { use abi_stable::library::RootModule; + use datafusion::common::record_batch; use datafusion::error::{DataFusionError, Result}; - use datafusion::prelude::SessionContext; + use datafusion::logical_expr::ScalarUDF; + use datafusion::prelude::{col, SessionContext}; use datafusion_ffi::table_provider::ForeignTableProvider; - use datafusion_ffi::tests::TableProviderModuleRef; + use datafusion_ffi::tests::{create_record_batch, ForeignLibraryModuleRef}; + use datafusion_ffi::udf::ForeignScalarUDF; use std::path::Path; use std::sync::Arc; @@ -61,11 +64,7 @@ mod tests { Ok(best_path) } - /// It is important that this test is in the `tests` directory and not in the - /// library directory so we can verify we are building a dynamic library and - /// testing it via a different executable. - #[cfg(feature = "integration-tests")] - async fn test_table_provider(synchronous: bool) -> Result<()> { + fn get_module() -> Result { let expected_version = datafusion_ffi::version(); let crate_root = Path::new(env!("CARGO_MANIFEST_DIR")); @@ -80,22 +79,30 @@ mod tests { // so you will need to change the approach here based on your use case. // let target: &std::path::Path = "../../../../target/".as_ref(); let library_path = - compute_library_path::(target_dir.as_path()) + compute_library_path::(target_dir.as_path()) .map_err(|e| DataFusionError::External(Box::new(e)))? .join("deps"); // Load the module - let table_provider_module = - TableProviderModuleRef::load_from_directory(&library_path) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let module = ForeignLibraryModuleRef::load_from_directory(&library_path) + .map_err(|e| DataFusionError::External(Box::new(e)))?; assert_eq!( - table_provider_module + module .version() .expect("Unable to call version on FFI module")(), expected_version ); + Ok(module) + } + + /// It is important that this test is in the `tests` directory and not in the + /// library directory so we can verify we are building a dynamic library and + /// testing it via a different executable. + async fn test_table_provider(synchronous: bool) -> Result<()> { + let table_provider_module = get_module()?; + // By calling the code below, the table provided will be created within // the module's code. 
let ffi_table_provider = table_provider_module.create_table().ok_or( @@ -116,9 +123,9 @@ mod tests { let results = df.collect().await?; assert_eq!(results.len(), 3); - assert_eq!(results[0], datafusion_ffi::tests::create_record_batch(1, 5)); - assert_eq!(results[1], datafusion_ffi::tests::create_record_batch(6, 1)); - assert_eq!(results[2], datafusion_ffi::tests::create_record_batch(7, 5)); + assert_eq!(results[0], create_record_batch(1, 5)); + assert_eq!(results[1], create_record_batch(6, 1)); + assert_eq!(results[2], create_record_batch(7, 5)); Ok(()) } @@ -132,4 +139,44 @@ mod tests { async fn sync_test_table_provider() -> Result<()> { test_table_provider(true).await } + + /// This test validates that we can load an external module and use a scalar + /// udf defined in it via the foreign function interface. In this case we are + /// using the abs() function as our scalar UDF. + #[tokio::test] + async fn test_scalar_udf() -> Result<()> { + let module = get_module()?; + + let ffi_abs_func = + module + .create_scalar_udf() + .ok_or(DataFusionError::NotImplemented( + "External table provider failed to implement create_scalar_udf" + .to_string(), + ))?(); + let foreign_abs_func: ForeignScalarUDF = (&ffi_abs_func).try_into()?; + + let udf: ScalarUDF = foreign_abs_func.into(); + + let ctx = SessionContext::default(); + let df = ctx.read_batch(create_record_batch(-5, 5))?; + + let df = df + .with_column("abs_a", udf.call(vec![col("a")]))? + .with_column("abs_b", udf.call(vec![col("b")]))?; + + let result = df.collect().await?; + + let expected = record_batch!( + ("a", Int32, vec![-5, -4, -3, -2, -1]), + ("b", Float64, vec![-5., -4., -3., -2., -1.]), + ("abs_a", Int32, vec![5, 4, 3, 2, 1]), + ("abs_b", Float64, vec![5., 4., 3., 2., 1.]) + )?; + + assert!(result.len() == 1); + assert!(result[0] == expected); + + Ok(()) + } } From 2f40f6c2ef352904d6661653ffe18ba4c3144b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Wed, 19 Feb 2025 23:21:21 +0300 Subject: [PATCH 24/71] Minor: Further Clean-up in Enforce Sorting (#14732) * Update mod.rs * Update mod.rs * Review --------- Co-authored-by: Mehmet Ozan Kabak --- .../src/enforce_sorting/mod.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index 420c080f09c2..11f1d8751d83 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -385,9 +385,6 @@ pub fn parallelize_sorts( pub fn ensure_sorting( mut requirements: PlanWithCorrespondingSort, ) -> Result> { - // Before starting, making requirements' children's ExecutionPlan be same as the requirements' plan's children's ExecutionPlan. - // It should be guaranteed by previous code, but we need to make sure to avoid any potential missing. 
- requirements = requirements.update_plan_from_children()?; requirements = update_sort_ctx_children_data(requirements, false)?; // Perform naive analysis at the beginning -- remove already-satisfied sorts: @@ -419,7 +416,6 @@ pub fn ensure_sorting( child = update_child_to_remove_unnecessary_sort(idx, child, plan)?; } child = add_sort_above(child, required, None); - child = child.update_plan_from_children()?; child = update_sort_ctx_children_data(child, true)?; } } else if physical_ordering.is_none() @@ -433,25 +429,24 @@ pub fn ensure_sorting( updated_children.push(child); } requirements.children = updated_children; + requirements = requirements.update_plan_from_children()?; // For window expressions, we can remove some sorts when we can // calculate the result in reverse: let child_node = &requirements.children[0]; - if is_window(plan) && child_node.data { + if is_window(&requirements.plan) && child_node.data { return adjust_window_sort_removal(requirements).map(Transformed::yes); - } else if is_sort_preserving_merge(plan) + } else if is_sort_preserving_merge(&requirements.plan) && child_node.plan.output_partitioning().partition_count() <= 1 { // This `SortPreservingMergeExec` is unnecessary, input already has a // single partition and no fetch is required. let mut child_node = requirements.children.swap_remove(0); - if let Some(fetch) = plan.fetch() { - // Add the limit exec if the spm has a fetch + if let Some(fetch) = requirements.plan.fetch() { + // Add the limit exec if the original SPM had a fetch: child_node.plan = Arc::new(LocalLimitExec::new(Arc::clone(&child_node.plan), fetch)); } return Ok(Transformed::yes(child_node)); - } else { - requirements = requirements.update_plan_from_children()?; } update_sort_ctx_children_data(requirements, false).map(Transformed::yes) } @@ -712,7 +707,6 @@ fn remove_corresponding_sort_from_sub_plan( Arc::new(CoalescePartitionsExec::new(plan)) as _ }; node = PlanWithCorrespondingSort::new(plan, false, vec![node]); - node = node.update_plan_from_children()?; node = update_sort_ctx_children_data(node, false)?; } Ok(node) From 0ba16fb3fb15fe50d92ef65bcc76f150180eba54 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 06:28:57 -0500 Subject: [PATCH 25/71] chore(deps): bump arrow-flight from 54.1.0 to 54.2.0 (#14786) Bumps [arrow-flight](https://github.com/apache/arrow-rs) from 54.1.0 to 54.2.0. - [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/main/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/54.1.0...54.2.0) --- updated-dependencies: - dependency-name: arrow-flight dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 88102bb3fdc8..354f458e5f4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -361,9 +361,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9b3aaba47ed4b6146563c8b79ad0f7aa283f794cde0c057c656291b81196746" +checksum = "cf7806ee3d229ee866013e83446e937ab3c8a9e6a664b259d41dd960b309c5d0" dependencies = [ "arrow-arith", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index cc12d6b2e429..ccf3f02a2fde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,7 @@ arrow = { version = "54.2.0", features = [ "chrono-tz", ] } arrow-buffer = { version = "54.1.0", default-features = false } -arrow-flight = { version = "54.1.0", features = [ +arrow-flight = { version = "54.2.0", features = [ "flight-sql-experimental", ] } arrow-ipc = { version = "54.2.0", default-features = false, features = [ From 8440c6ea503263fb759173ee45ae2105eec56b6d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 06:40:15 -0500 Subject: [PATCH 26/71] chore(deps): bump serde_json from 1.0.138 to 1.0.139 (#14784) Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.138 to 1.0.139. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/v1.0.138...v1.0.139) --- updated-dependencies: - dependency-name: serde_json dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 354f458e5f4f..b7794d731b75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5363,9 +5363,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6" dependencies = [ "itoa", "memchr", From 9ff7bb9913c11b306400ccbd1c20d838971f12fb Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 20 Feb 2025 12:58:33 +0100 Subject: [PATCH 27/71] dependabot: group arrow/parquet minor/patch bumps, remove limit (#14730) * dependabot: group arrow/parquet minor/patch bumps, remove limit * Update .github/dependabot.yml --------- Co-authored-by: Oleks V --- .github/dependabot.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 6e61c47d7b2c..7c2b7e3a5458 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,14 +21,24 @@ updates: directory: "/" schedule: interval: daily - open-pull-requests-limit: 10 target-branch: main labels: [auto-dependencies] ignore: - # arrow is bumped manually + # major version bumps of arrow* and parquet are handled manually - dependency-name: "arrow*" update-types: ["version-update:semver-major"] + - dependency-name: "parquet" + update-types: ["version-update:semver-major"] groups: + # minor and patch bumps of arrow* and parquet are grouped + arrow-parquet: + 
applies-to: version-updates + patterns: + - "arrow*" + - "parquet" + update-types: + - "minor" + - "patch" proto: applies-to: version-updates patterns: From 2b39b845c666305322dcceafc4fff1b0e2c483e8 Mon Sep 17 00:00:00 2001 From: niebayes Date: Thu, 20 Feb 2025 19:59:53 +0800 Subject: [PATCH 28/71] fix: Substrait serializer clippy error: not calling truncate (#14723) * specify truncate true * add error handling * Apply suggestions from code review Co-authored-by: Matthijs Brobbel * remove substrait from error messages * Apply suggestions from code review Co-authored-by: Matthijs Brobbel * simplify serialize * fix ut * Update datafusion/substrait/tests/cases/serialize.rs Co-authored-by: Matthijs Brobbel * apply part of follow-up suggestions --------- Co-authored-by: Matthijs Brobbel --- datafusion/substrait/Cargo.toml | 1 + datafusion/substrait/src/serializer.rs | 45 +++++++++++++------ datafusion/substrait/tests/cases/serialize.rs | 20 +++++++++ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index f13d2b77a787..3e3ea7843ac9 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -41,6 +41,7 @@ pbjson-types = { workspace = true } prost = { workspace = true } substrait = { version = "0.53", features = ["serde"] } url = { workspace = true } +tokio = { workspace = true, features = ["fs"] } [dev-dependencies] datafusion = { workspace = true, features = ["nested_expressions"] } diff --git a/datafusion/substrait/src/serializer.rs b/datafusion/substrait/src/serializer.rs index 4278671777fd..4a9e5d55ce05 100644 --- a/datafusion/substrait/src/serializer.rs +++ b/datafusion/substrait/src/serializer.rs @@ -22,42 +22,59 @@ use datafusion::error::Result; use datafusion::prelude::*; use prost::Message; +use std::path::Path; use substrait::proto::Plan; +use tokio::{ + fs::OpenOptions, + io::{AsyncReadExt, AsyncWriteExt}, +}; -use std::fs::OpenOptions; -use std::io::{Read, Write}; +/// Plans a sql and serializes the generated logical plan to bytes. +/// The bytes are then written into a file at `path`. +/// +/// Returns an error if the file already exists. +pub async fn serialize( + sql: &str, + ctx: &SessionContext, + path: impl AsRef, +) -> Result<()> { + let protobuf_out = serialize_bytes(sql, ctx).await?; -#[allow(clippy::suspicious_open_options)] -pub async fn serialize(sql: &str, ctx: &SessionContext, path: &str) -> Result<()> { - let protobuf_out = serialize_bytes(sql, ctx).await; - let mut file = OpenOptions::new().create(true).write(true).open(path)?; - file.write_all(&protobuf_out?)?; + let mut file = OpenOptions::new() + .write(true) + .create_new(true) + .open(path) + .await?; + file.write_all(&protobuf_out).await?; Ok(()) } +/// Plans a sql and serializes the generated logical plan to bytes. pub async fn serialize_bytes(sql: &str, ctx: &SessionContext) -> Result> { let df = ctx.sql(sql).await?; let plan = df.into_optimized_plan()?; let proto = producer::to_substrait_plan(&plan, &ctx.state())?; let mut protobuf_out = Vec::::new(); - proto.encode(&mut protobuf_out).map_err(|e| { - DataFusionError::Substrait(format!("Failed to encode substrait plan: {e}")) - })?; + proto + .encode(&mut protobuf_out) + .map_err(|e| DataFusionError::Substrait(format!("Failed to encode plan: {e}")))?; Ok(protobuf_out) } -pub async fn deserialize(path: &str) -> Result> { +/// Reads the file at `path` and deserializes a plan from the bytes. 
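As a rough usage illustration (not part of the committed patch), the reworked serializer API can be exercised end to end roughly as follows; the registered table, its columns, and the output path are assumptions for the sketch, and `serialize` now refuses to overwrite an existing file.

use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};
use datafusion_substrait::serializer;

async fn round_trip_plan() -> Result<()> {
    let ctx = SessionContext::new();
    ctx.register_csv("data", "data.csv", CsvReadOptions::new()).await?;
    // Fails with a "File exists" error if the target path is already present.
    serializer::serialize("SELECT a, b FROM data", &ctx, "plan.substrait").await?;
    let _plan = serializer::deserialize("plan.substrait").await?;
    Ok(())
}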
+pub async fn deserialize(path: impl AsRef) -> Result> { let mut protobuf_in = Vec::::new(); - let mut file = OpenOptions::new().read(true).open(path)?; + let mut file = OpenOptions::new().read(true).open(path).await?; + file.read_to_end(&mut protobuf_in).await?; - file.read_to_end(&mut protobuf_in)?; deserialize_bytes(protobuf_in).await } +/// Deserializes a plan from the bytes. pub async fn deserialize_bytes(proto_bytes: Vec) -> Result> { Ok(Box::new(Message::decode(&*proto_bytes).map_err(|e| { - DataFusionError::Substrait(format!("Failed to decode substrait plan: {e}")) + DataFusionError::Substrait(format!("Failed to decode plan: {e}")) })?)) } diff --git a/datafusion/substrait/tests/cases/serialize.rs b/datafusion/substrait/tests/cases/serialize.rs index e28c63312788..02089b9fa92d 100644 --- a/datafusion/substrait/tests/cases/serialize.rs +++ b/datafusion/substrait/tests/cases/serialize.rs @@ -17,6 +17,7 @@ #[cfg(test)] mod tests { + use datafusion::common::assert_contains; use datafusion::datasource::provider_as_source; use datafusion::logical_expr::LogicalPlanBuilder; use datafusion_substrait::logical_plan::consumer::from_substrait_plan; @@ -31,6 +32,25 @@ mod tests { use substrait::proto::rel_common::{Emit, EmitKind}; use substrait::proto::{rel, RelCommon}; + #[tokio::test] + async fn serialize_to_file() -> Result<()> { + let ctx = create_context().await?; + let path = "tests/serialize_to_file.bin"; + let sql = "SELECT a, b FROM data"; + + // Test case 1: serializing to a non-existing file should succeed. + serializer::serialize(sql, &ctx, path).await?; + serializer::deserialize(path).await?; + + // Test case 2: serializing to an existing file should fail. + let got = serializer::serialize(sql, &ctx, path).await.unwrap_err(); + assert_contains!(got.to_string(), "File exists"); + + fs::remove_file(path)?; + + Ok(()) + } + #[tokio::test] async fn serialize_simple_select() -> Result<()> { let ctx = create_context().await?; From 310eab006b26885ec558cf5c4572c73a7d824ee9 Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Thu, 20 Feb 2025 04:00:24 -0800 Subject: [PATCH 29/71] Map access supports constant-resolvable expressions (#14712) * Map access supports constant-resolvable expressions * adding tests fix clippy fix clippy fix clippy * fix clippy --- datafusion/functions-nested/src/planner.rs | 22 +++- datafusion/functions/src/core/getfield.rs | 111 +++++++++++++-------- datafusion/sqllogictest/test_files/map.slt | 62 ++++++++++++ 3 files changed, 148 insertions(+), 47 deletions(-) diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs index d55176a42c9a..369eaecb1905 100644 --- a/datafusion/functions-nested/src/planner.rs +++ b/datafusion/functions-nested/src/planner.rs @@ -17,17 +17,20 @@ //! 
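As a rough illustration (not part of the committed patch) of what this planner change enables, a map lookup key can now be any constant-resolvable expression rather than a bare literal, mirroring the new map.slt cases further below; the table and values here are assumptions for the sketch.

use datafusion::error::Result;
use datafusion::prelude::SessionContext;

async fn map_lookup_example() -> Result<()> {
    let ctx = SessionContext::new();
    // DDL statements are executed eagerly by `SessionContext::sql`.
    ctx.sql("CREATE TABLE tt AS VALUES (MAP {1: 'one', 2: 'two'})").await?;
    // The lookup key is an arithmetic expression, not a bare literal.
    ctx.sql("SELECT column1[1 + 1] FROM tt").await?.show().await?;
    Ok(())
}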
SQL planning extensions like [`NestedFunctionPlanner`] and [`FieldAccessPlanner`] -use std::sync::Arc; - +use arrow::datatypes::DataType; +use datafusion_common::ExprSchema; use datafusion_common::{plan_err, utils::list_ndims, DFSchema, Result}; -use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams, ScalarFunction}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams}; use datafusion_expr::AggregateUDF; use datafusion_expr::{ planner::{ExprPlanner, PlannerResult, RawBinaryExpr, RawFieldAccessExpr}, sqlparser, Expr, ExprSchemable, GetFieldAccess, }; +use datafusion_functions::core::get_field as get_field_inner; use datafusion_functions::expr_fn::get_field; use datafusion_functions_aggregate::nth_value::nth_value_udaf; +use std::sync::Arc; use crate::map::map_udf; use crate::{ @@ -140,7 +143,7 @@ impl ExprPlanner for FieldAccessPlanner { fn plan_field_access( &self, expr: RawFieldAccessExpr, - _schema: &DFSchema, + schema: &DFSchema, ) -> Result> { let RawFieldAccessExpr { expr, field_access } = expr; @@ -173,6 +176,17 @@ impl ExprPlanner for FieldAccessPlanner { null_treatment, )), )), + // special case for map access with + Expr::Column(ref c) + if matches!(schema.data_type(c)?, DataType::Map(_, _)) => + { + Ok(PlannerResult::Planned(Expr::ScalarFunction( + ScalarFunction::new_udf( + get_field_inner(), + vec![expr, *index], + ), + ))) + } _ => Ok(PlannerResult::Planned(array_element(expr, *index))), } } diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index d667d0d8c151..d900ee5825ae 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -16,9 +16,12 @@ // under the License. 
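As a rough illustration (not part of the committed patch), the rewrite below relies on arrow's `make_comparator` so that map keys of nested types (lists, structs) can be matched, instead of the string-only `eq` path used before; a minimal sketch of that primitive over two illustrative arrays:

use arrow::array::{make_comparator, Int32Array};
use arrow::compute::SortOptions;
use arrow::error::ArrowError;

fn keys_equal_at(i: usize, j: usize) -> Result<bool, ArrowError> {
    let left = Int32Array::from(vec![1, 2, 3]);
    let right = Int32Array::from(vec![3, 2, 1]);
    // The comparator works for any two arrays of the same type, including
    // nested types, which is what allows non-string map keys here.
    let cmp = make_comparator(&left, &right, SortOptions::default())?;
    // Compare row `i` of `left` with row `j` of `right` (both must be < 3).
    Ok(cmp(i, j).is_eq())
}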
use arrow::array::{ - make_array, Array, Capacities, MutableArrayData, Scalar, StringArray, + make_array, make_comparator, Array, BooleanArray, Capacities, MutableArrayData, + Scalar, }; +use arrow::compute::SortOptions; use arrow::datatypes::DataType; +use arrow_buffer::NullBuffer; use datafusion_common::cast::{as_map_array, as_struct_array}; use datafusion_common::{ exec_err, internal_err, plan_datafusion_err, utils::take_function_args, Result, @@ -106,11 +109,7 @@ impl ScalarUDFImpl for GetFieldFunc { let name = match field_name { Expr::Literal(name) => name, - _ => { - return exec_err!( - "get_field function requires the argument field_name to be a string" - ); - } + other => &ScalarValue::Utf8(Some(other.schema_name().to_string())), }; Ok(format!("{base}[{name}]")) @@ -118,14 +117,9 @@ impl ScalarUDFImpl for GetFieldFunc { fn schema_name(&self, args: &[Expr]) -> Result { let [base, field_name] = take_function_args(self.name(), args)?; - let name = match field_name { Expr::Literal(name) => name, - _ => { - return exec_err!( - "get_field function requires the argument field_name to be a string" - ); - } + other => &ScalarValue::Utf8(Some(other.schema_name().to_string())), }; Ok(format!("{}[{}]", base.schema_name(), name)) @@ -182,7 +176,6 @@ impl ScalarUDFImpl for GetFieldFunc { let arrays = ColumnarValue::values_to_arrays(&[base.clone(), field_name.clone()])?; let array = Arc::clone(&arrays[0]); - let name = match field_name { ColumnarValue::Scalar(name) => name, _ => { @@ -192,38 +185,70 @@ impl ScalarUDFImpl for GetFieldFunc { } }; + fn process_map_array( + array: Arc, + key_array: Arc, + ) -> Result { + let map_array = as_map_array(array.as_ref())?; + let keys = if key_array.data_type().is_nested() { + let comparator = make_comparator( + map_array.keys().as_ref(), + key_array.as_ref(), + SortOptions::default(), + )?; + let len = map_array.keys().len().min(key_array.len()); + let values = (0..len).map(|i| comparator(i, i).is_eq()).collect(); + let nulls = + NullBuffer::union(map_array.keys().nulls(), key_array.nulls()); + BooleanArray::new(values, nulls) + } else { + let be_compared = Scalar::new(key_array); + arrow::compute::kernels::cmp::eq(&be_compared, map_array.keys())? 
+ }; + + let original_data = map_array.entries().column(1).to_data(); + let capacity = Capacities::Array(original_data.len()); + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], true, capacity); + + for entry in 0..map_array.len() { + let start = map_array.value_offsets()[entry] as usize; + let end = map_array.value_offsets()[entry + 1] as usize; + + let maybe_matched = keys + .slice(start, end - start) + .iter() + .enumerate() + .find(|(_, t)| t.unwrap()); + + if maybe_matched.is_none() { + mutable.extend_nulls(1); + continue; + } + let (match_offset, _) = maybe_matched.unwrap(); + mutable.extend(0, start + match_offset, start + match_offset + 1); + } + + let data = mutable.freeze(); + let data = make_array(data); + Ok(ColumnarValue::Array(data)) + } + match (array.data_type(), name) { - (DataType::Map(_, _), ScalarValue::Utf8(Some(k))) => { - let map_array = as_map_array(array.as_ref())?; - let key_scalar: Scalar>> = Scalar::new(StringArray::from(vec![k.clone()])); - let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?; - - // note that this array has more entries than the expected output/input size - // because map_array is flattened - let original_data = map_array.entries().column(1).to_data(); - let capacity = Capacities::Array(original_data.len()); - let mut mutable = - MutableArrayData::with_capacities(vec![&original_data], true, - capacity); - - for entry in 0..map_array.len(){ - let start = map_array.value_offsets()[entry] as usize; - let end = map_array.value_offsets()[entry + 1] as usize; - - let maybe_matched = - keys.slice(start, end-start). - iter().enumerate(). - find(|(_, t)| t.unwrap()); - if maybe_matched.is_none() { - mutable.extend_nulls(1); - continue - } - let (match_offset,_) = maybe_matched.unwrap(); - mutable.extend(0, start + match_offset, start + match_offset + 1); + (DataType::Map(_, _), ScalarValue::List(arr)) => { + let key_array: Arc = Arc::new((**arr).clone()); + process_map_array(array, key_array) + } + (DataType::Map(_, _), ScalarValue::Struct(arr)) => { + process_map_array(array, Arc::clone(arr) as Arc) + } + (DataType::Map(_, _), other) => { + let data_type = other.data_type(); + if data_type.is_nested() { + exec_err!("unsupported type {:?} for map access", data_type) + } else { + process_map_array(array, other.to_array()?) } - let data = mutable.freeze(); - let data = make_array(data); - Ok(ColumnarValue::Array(data)) } (DataType::Struct(_), ScalarValue::Utf8(Some(k))) => { let as_struct_array = as_struct_array(&array)?; diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index 71296b6f6474..42a4ba621801 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -592,6 +592,43 @@ select map_extract(column1, 1), map_extract(column1, 5), map_extract(column1, 7) [NULL] [NULL] [[1, NULL, 3]] [NULL] [NULL] [NULL] +query ? +select column1[1] from map_array_table_1; +---- +[1, NULL, 3] +NULL +NULL +NULL + +query ? +select column1[-1000 + 1001] from map_array_table_1; +---- +[1, NULL, 3] +NULL +NULL +NULL + +# test for negative scenario +query ? +SELECT column1[-1] FROM map_array_table_1; +---- +NULL +NULL +NULL +NULL + +query ? +SELECT column1[1000] FROM map_array_table_1; +---- +NULL +NULL +NULL +NULL + + +query error DataFusion error: Arrow error: Invalid argument error +SELECT column1[NULL] FROM map_array_table_1; + query ??? 
select map_extract(column1, column2), map_extract(column1, column3), map_extract(column1, column4) from map_array_table_1; ---- @@ -722,3 +759,28 @@ drop table map_array_table_1; statement ok drop table map_array_table_2; + + +statement ok +create table tt as values(MAP{[1,2,3]:1}, MAP {{'a':1, 'b':2}:2}, MAP{true: 3}); + +# accessing using an array +query I +select column1[make_array(1, 2, 3)] from tt; +---- +1 + +# accessing using a struct +query I +select column2[{a:1, b: 2}] from tt; +---- +2 + +# accessing using Bool +query I +select column3[true] from tt; +---- +3 + +statement ok +drop table tt; From 28c97fcf3dfca3edbda67bb16544c2b0968cb7e4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 20 Feb 2025 07:37:51 -0500 Subject: [PATCH 30/71] Fix build after logical conflict (#14791) --- datafusion/functions/src/core/getfield.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index d900ee5825ae..3ac26b98359b 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -236,11 +236,11 @@ impl ScalarUDFImpl for GetFieldFunc { match (array.data_type(), name) { (DataType::Map(_, _), ScalarValue::List(arr)) => { - let key_array: Arc = Arc::new((**arr).clone()); + let key_array: Arc = arr; process_map_array(array, key_array) } (DataType::Map(_, _), ScalarValue::Struct(arr)) => { - process_map_array(array, Arc::clone(arr) as Arc) + process_map_array(array, arr as Arc) } (DataType::Map(_, _), other) => { let data_type = other.data_type(); From 82ed8e0d61ab1c46d51de30a61d5aff7f3d270e1 Mon Sep 17 00:00:00 2001 From: Owen Leung Date: Thu, 20 Feb 2025 22:58:08 +0800 Subject: [PATCH 31/71] Fix CI job test-datafusion-pyarrow (#14790) * Change to ubuntu-latest to observe the error msg * Use python 3.8.18 * Remove tag bullseye * Revert to tag bullseye & remove setup python step * Install pip --- .github/workflows/rust.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index a743d0e8fd07..99aaa7d6f290 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -389,21 +389,20 @@ jobs: test-datafusion-pyarrow: name: cargo test pyarrow (amd64) needs: linux-build-lib - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest container: - image: amd64/rust:bullseye # Workaround https://github.com/actions/setup-python/issues/721 + image: amd64/rust:bullseye # Use the bullseye tag image which comes with python3.9 steps: - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - - uses: actions/setup-python@v5 - with: - python-version: "3.8" - name: Install PyArrow run: | echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - python -m pip install pyarrow + apt-get update + apt-get install python3-pip -y + python3 -m pip install pyarrow - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 139b5b567232745cd92ef8747f4098f617938a98 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 20 Feb 2025 16:32:34 +0100 Subject: [PATCH 32/71] Use `doc_auto_cfg`, logo and favicon for docs.rs (#14746) --- datafusion-cli/Cargo.toml | 3 +++ datafusion-cli/src/lib.rs | 5 +++++ datafusion/catalog-listing/Cargo.toml | 3 +++ datafusion/catalog-listing/src/mod.rs | 6 ++++++ datafusion/catalog/Cargo.toml | 3 +++ datafusion/catalog/src/lib.rs | 6 ++++++ datafusion/common-runtime/Cargo.toml | 4 +++- datafusion/common-runtime/src/lib.rs | 5 +++++ 
datafusion/common/Cargo.toml | 4 +++- datafusion/common/src/lib.rs | 5 +++++ datafusion/core/Cargo.toml | 7 +++---- datafusion/core/src/lib.rs | 5 +++++ datafusion/datasource/Cargo.toml | 3 +++ datafusion/datasource/src/mod.rs | 6 ++++++ datafusion/doc/Cargo.toml | 4 +++- datafusion/doc/src/lib.rs | 6 ++++++ datafusion/execution/Cargo.toml | 4 +++- datafusion/execution/src/lib.rs | 5 +++++ datafusion/expr-common/Cargo.toml | 6 +++--- datafusion/expr-common/src/lib.rs | 7 ++++++- datafusion/expr/Cargo.toml | 4 +++- datafusion/expr/src/lib.rs | 5 +++++ datafusion/ffi/Cargo.toml | 4 +++- datafusion/ffi/src/lib.rs | 5 +++++ datafusion/functions-aggregate-common/Cargo.toml | 6 +++--- datafusion/functions-aggregate-common/src/lib.rs | 5 +++++ datafusion/functions-aggregate/Cargo.toml | 6 +++--- datafusion/functions-aggregate/src/lib.rs | 5 +++++ datafusion/functions-nested/Cargo.toml | 8 +++----- datafusion/functions-nested/src/lib.rs | 5 +++++ datafusion/functions-table/Cargo.toml | 6 +++--- datafusion/functions-table/src/lib.rs | 6 ++++++ datafusion/functions-window-common/Cargo.toml | 6 +++--- datafusion/functions-window-common/src/lib.rs | 6 ++++++ datafusion/functions-window/Cargo.toml | 6 +++--- datafusion/functions-window/src/lib.rs | 6 ++++++ datafusion/functions/Cargo.toml | 6 +++--- datafusion/functions/src/lib.rs | 5 +++++ datafusion/macros/Cargo.toml | 3 +++ datafusion/macros/src/user_doc.rs | 6 ++++++ datafusion/optimizer/Cargo.toml | 4 +++- datafusion/optimizer/src/lib.rs | 5 +++++ datafusion/physical-expr-common/Cargo.toml | 4 +++- datafusion/physical-expr-common/src/lib.rs | 5 +++++ datafusion/physical-expr/Cargo.toml | 4 +++- datafusion/physical-expr/src/lib.rs | 5 +++++ datafusion/physical-optimizer/Cargo.toml | 3 +++ datafusion/physical-optimizer/src/lib.rs | 5 +++++ datafusion/physical-plan/Cargo.toml | 4 +++- datafusion/physical-plan/src/lib.rs | 6 +++++- datafusion/proto-common/Cargo.toml | 4 +++- datafusion/proto-common/src/lib.rs | 5 +++++ datafusion/proto/Cargo.toml | 4 +++- datafusion/proto/src/lib.rs | 5 +++++ datafusion/sql/Cargo.toml | 4 +++- datafusion/sql/src/lib.rs | 5 +++++ datafusion/sqllogictest/Cargo.toml | 4 +++- datafusion/sqllogictest/src/lib.rs | 6 ++++++ datafusion/substrait/src/lib.rs | 6 ++++++ datafusion/wasmtest/Cargo.toml | 3 +++ datafusion/wasmtest/src/lib.rs | 6 ++++++ 61 files changed, 257 insertions(+), 46 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index d88f8fccb928..20cd1c4295e0 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -27,6 +27,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [dependencies] arrow = { workspace = true } async-trait = { workspace = true } diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index fbfc9242a61d..34fba6f79304 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] #![doc = include_str!("../README.md")] pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 68d0ca3a149f..c6f9ea8bd055 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -27,6 +27,9 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +[package.metadata.docs.rs] +all-features = true + [dependencies] arrow = { workspace = true } async-trait = { workspace = true } diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs index b98790e86455..cb0d86d8666e 100644 --- a/datafusion/catalog-listing/src/mod.rs +++ b/datafusion/catalog-listing/src/mod.rs @@ -15,4 +15,10 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + pub mod helpers; diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 73ac44a0316e..864749411198 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -27,6 +27,9 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +[package.metadata.docs.rs] +all-features = true + [dependencies] arrow = { workspace = true } async-trait = { workspace = true } diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs index 0b8d73fabd25..a339d4916b8d 100644 --- a/datafusion/catalog/src/lib.rs +++ b/datafusion/catalog/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! Interfaces and default implementations of catalogs and schemas. //! //! 
Implementations diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml index a21c72cd9f83..6fd9b7ac8fe8 100644 --- a/datafusion/common-runtime/Cargo.toml +++ b/datafusion/common-runtime/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_common_runtime" -path = "src/lib.rs" [dependencies] log = { workspace = true } diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs index 51cb988ea06a..7bd8dc4cfe36 100644 --- a/datafusion/common-runtime/src/lib.rs +++ b/datafusion/common-runtime/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 3be666ce7974..fc5c08ee2316 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_common" -path = "src/lib.rs" [features] avro = ["apache-avro"] diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index df1ae100f581..d5b7c22a546c 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 8a706ca19f4d..e968967a2e75 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -29,13 +29,12 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true -[lib] -name = "datafusion" -path = "src/lib.rs" - [features] nested_expressions = ["datafusion-functions-nested"] # This feature is deprecated. Use the `nested_expressions` feature instead. diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index f4aa366500ef..9a0d0157c1ae 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] #![warn(missing_docs, clippy::needless_borrow)] diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index caf1c60a785d..bf1f8789ac43 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -27,6 +27,9 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +[package.metadata.docs.rs] +all-features = true + [features] compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"] default = ["compression"] diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index c735c3108b3d..2fc2da64891d 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! A table that uses the `ObjectStore` listing capability //! to get the list of files to process. diff --git a/datafusion/doc/Cargo.toml b/datafusion/doc/Cargo.toml index c188bcb2a535..fa316348a6da 100644 --- a/datafusion/doc/Cargo.toml +++ b/datafusion/doc/Cargo.toml @@ -27,9 +27,11 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_doc" -path = "src/lib.rs" diff --git a/datafusion/doc/src/lib.rs b/datafusion/doc/src/lib.rs index 6940a8ef3ca2..68ed1e2352ca 100644 --- a/datafusion/doc/src/lib.rs +++ b/datafusion/doc/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + #[allow(rustdoc::broken_intra_doc_links)] /// Documentation for use by [`ScalarUDFImpl`](ScalarUDFImpl), /// [`AggregateUDFImpl`](AggregateUDFImpl) and [`WindowUDFImpl`](WindowUDFImpl) functions. 
diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml index bb86868a8214..8f642f3384d2 100644 --- a/datafusion/execution/Cargo.toml +++ b/datafusion/execution/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_execution" -path = "src/lib.rs" [dependencies] arrow = { workspace = true } diff --git a/datafusion/execution/src/lib.rs b/datafusion/execution/src/lib.rs index 317bd3203ab1..a9e3a27f8035 100644 --- a/datafusion/execution/src/lib.rs +++ b/datafusion/execution/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index abc78a9f084b..14717dd78135 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -27,14 +27,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_expr_common" -path = "src/lib.rs" - -[features] [dependencies] arrow = { workspace = true } diff --git a/datafusion/expr-common/src/lib.rs b/datafusion/expr-common/src/lib.rs index 179dd75ace85..fede0bb8e57e 100644 --- a/datafusion/expr-common/src/lib.rs +++ b/datafusion/expr-common/src/lib.rs @@ -19,10 +19,15 @@ //! //! This crate contains types and traits that are used by both Logical and Physical expressions. //! They are kept in their own crate to avoid physical expressions depending on logical expressions. -//! +//! //! //! [DataFusion]: +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index b4f3f7fb680f..37e1ed1936fb 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_expr" -path = "src/lib.rs" [features] recursive_protection = ["dep:recursive"] diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 2f04f234eb1d..d2ea6e809150 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 4c396144347c..97914666688f 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -27,12 +27,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_ffi" -path = "src/lib.rs" crate-type = ["cdylib", "rlib"] [dependencies] diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index bbcdd85ff80a..4eabf91d892a 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml index cf6eb99e60c6..72c8a58a7e45 100644 --- a/datafusion/functions-aggregate-common/Cargo.toml +++ b/datafusion/functions-aggregate-common/Cargo.toml @@ -27,14 +27,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_functions_aggregate_common" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] ahash = { workspace = true } diff --git a/datafusion/functions-aggregate-common/src/lib.rs b/datafusion/functions-aggregate-common/src/lib.rs index cc50ff70913b..6f9dfca30c19 100644 --- a/datafusion/functions-aggregate-common/src/lib.rs +++ b/datafusion/functions-aggregate-common/src/lib.rs @@ -22,6 +22,11 @@ //! //! 
[DataFusion]: +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 007e1e76a3be..38052835f197 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -28,14 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_functions_aggregate" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] ahash = { workspace = true } diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index 746873442d9a..f4bdb53efd55 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index a63175b36e21..b33b415a868d 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -28,16 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true -[features] - [lib] name = "datafusion_functions_nested" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] arrow = { workspace = true } diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs index c47e4a696a1d..41ebb4366cff 100644 --- a/datafusion/functions-nested/src/lib.rs +++ b/datafusion/functions-nested/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/functions-table/Cargo.toml b/datafusion/functions-table/Cargo.toml index f722d698f3d3..78d59257dd48 100644 --- a/datafusion/functions-table/Cargo.toml +++ b/datafusion/functions-table/Cargo.toml @@ -28,14 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_functions_table" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] arrow = { workspace = true } diff --git a/datafusion/functions-table/src/lib.rs b/datafusion/functions-table/src/lib.rs index f5436f7bf8a6..4a31760e7c4d 100644 --- a/datafusion/functions-table/src/lib.rs +++ b/datafusion/functions-table/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + pub mod generate_series; use datafusion_catalog::TableFunction; diff --git a/datafusion/functions-window-common/Cargo.toml b/datafusion/functions-window-common/Cargo.toml index b5df212b7d2a..466e7bc68b48 100644 --- a/datafusion/functions-window-common/Cargo.toml +++ b/datafusion/functions-window-common/Cargo.toml @@ -28,14 +28,14 @@ repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_functions_window_common" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] datafusion-common = { workspace = true } diff --git a/datafusion/functions-window-common/src/lib.rs b/datafusion/functions-window-common/src/lib.rs index da8d096da562..6f2a1ac0f33f 100644 --- a/datafusion/functions-window-common/src/lib.rs +++ b/datafusion/functions-window-common/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! Common user-defined window functionality for [DataFusion] //! //! 
[DataFusion]: diff --git a/datafusion/functions-window/Cargo.toml b/datafusion/functions-window/Cargo.toml index fc1bc51bcc66..e0c17c579b19 100644 --- a/datafusion/functions-window/Cargo.toml +++ b/datafusion/functions-window/Cargo.toml @@ -28,14 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_functions_window" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] datafusion-common = { workspace = true } diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs index 9f8e54a0423b..0d932bf84725 100644 --- a/datafusion/functions-window/src/lib.rs +++ b/datafusion/functions-window/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! Window Function packages for [DataFusion]. //! //! This crate contains a collection of various window function packages for DataFusion, diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index c00997853bb3..788bc67d970c 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -28,6 +28,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true @@ -60,9 +63,6 @@ unicode_expressions = ["unicode-segmentation"] [lib] name = "datafusion_functions" -path = "src/lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] arrow = { workspace = true } diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index ffb0ab5f51ec..de2571779d42 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index 8946640f20bb..737d2ed72874 100644 --- a/datafusion/macros/Cargo.toml +++ b/datafusion/macros/Cargo.toml @@ -27,6 +27,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true diff --git a/datafusion/macros/src/user_doc.rs b/datafusion/macros/src/user_doc.rs index 6ca90ed376c3..c6510c156423 100644 --- a/datafusion/macros/src/user_doc.rs +++ b/datafusion/macros/src/user_doc.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + extern crate proc_macro; use datafusion_expr::scalar_doc_sections::doc_sections_const; use proc_macro::TokenStream; diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 3f5ec9b0da03..3413b365f67d 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_optimizer" -path = "src/lib.rs" [features] recursive_protection = ["dep:recursive"] diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index 614284e1b477..61ca9b31cd29 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index 14d6ca64d15e..a5a12b5527b7 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_physical_expr_common" -path = "src/lib.rs" [dependencies] ahash = { workspace = true } diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index a05f1c96306f..440f044d88eb 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 38e8b44791ab..a3321f493388 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_physical_expr" -path = "src/lib.rs" [dependencies] ahash = { workspace = true } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 11d6f54a7cc3..b68d10905cab 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index c9c86e9c8d5c..e8473e6556d1 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -28,6 +28,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index c2beab032049..2613b95bbdc0 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index f0afdaa2de3d..676663114702 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -28,6 +28,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true @@ -36,7 +39,6 @@ force_hash_collisions = [] [lib] name = "datafusion_physical_plan" -path = "src/lib.rs" [dependencies] ahash = { workspace = true } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 9210e3b0273c..06fe23d2ff90 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -15,8 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 - #![deny(clippy::clone_on_ref_ptr)] //! Traits for physical query plan, supporting parallel execution for partitioned relations. 
diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml index 3454b26ebcf6..957cbc253616 100644 --- a/datafusion/proto-common/Cargo.toml +++ b/datafusion/proto-common/Cargo.toml @@ -31,9 +31,11 @@ rust-version = { workspace = true } # Exclude proto files so crates.io consumers don't need protoc exclude = ["*.proto"] +[package.metadata.docs.rs] +all-features = true + [lib] name = "datafusion_proto_common" -path = "src/lib.rs" [features] default = [] diff --git a/datafusion/proto-common/src/lib.rs b/datafusion/proto-common/src/lib.rs index 9272a4e87960..56cd42ee5067 100644 --- a/datafusion/proto-common/src/lib.rs +++ b/datafusion/proto-common/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index fb5d414dcec4..00d4969182cf 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -31,9 +31,11 @@ rust-version = { workspace = true } # Exclude proto files so crates.io consumers don't need protoc exclude = ["*.proto"] +[package.metadata.docs.rs] +all-features = true + [lib] name = "datafusion_proto" -path = "src/lib.rs" [features] default = ["parquet"] diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 9f2973e3c958..5d84be1cff55 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index c4a404975d29..4435ee0f56cb 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -28,12 +28,14 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_sql" -path = "src/lib.rs" [features] default = ["unicode_expressions", "unparser"] diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index 16a3d6d007cf..d552efa8254c 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index f1d37c7202d6..88e34dd3b9c4 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -27,12 +27,14 @@ repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true [lib] name = "datafusion_sqllogictest" -path = "src/lib.rs" [dependencies] arrow = { workspace = true } diff --git a/datafusion/sqllogictest/src/lib.rs b/datafusion/sqllogictest/src/lib.rs index 0ea55782d34e..ee20e70d14f4 100644 --- a/datafusion/sqllogictest/src/lib.rs +++ b/datafusion/sqllogictest/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! DataFusion sqllogictest driver mod engines; diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index f33e86a2d20c..a7493a48e4c5 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + //! Serialize / Deserialize DataFusion Plans to [Substrait.io] //! //! This crate provides support for serializing and deserializing both DataFusion diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 7db051ad191f..6d64a15f4b99 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -27,6 +27,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[package.metadata.docs.rs] +all-features = true + [lints] workspace = true diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs index c7c620d1be3a..e2ba50beb657 100644 --- a/datafusion/wasmtest/src/lib.rs +++ b/datafusion/wasmtest/src/lib.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. 
+#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" +)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + extern crate wasm_bindgen; use datafusion_common::{DFSchema, ScalarValue}; From 568a2b1a120c17343f092a46bb7b9b620df434a3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 00:21:36 +0800 Subject: [PATCH 33/71] chore(deps): bump sqllogictest from 0.27.1 to 0.27.2 (#14785) Bumps [sqllogictest](https://github.com/risinglightdb/sqllogictest-rs) from 0.27.1 to 0.27.2. - [Release notes](https://github.com/risinglightdb/sqllogictest-rs/releases) - [Changelog](https://github.com/risinglightdb/sqllogictest-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/risinglightdb/sqllogictest-rs/compare/v0.27.1...v0.27.2) --- updated-dependencies: - dependency-name: sqllogictest dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- datafusion/sqllogictest/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b7794d731b75..0eae57c106e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5573,9 +5573,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqllogictest" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07a06aea5e52b0a63b9d8328b46ea2740cdab4cac13def8ef4f2e5288610f9ed" +checksum = "6f1c93848602f92e5925690d4805ccbc1ccdb61bee7d4ae79ad6862b542a539c" dependencies = [ "async-trait", "educe", diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 88e34dd3b9c4..bc1b283eda65 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -53,7 +53,7 @@ object_store = { workspace = true } postgres-protocol = { version = "0.6.7", optional = true } postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true } rust_decimal = { version = "1.36.0", features = ["tokio-pg"] } -sqllogictest = "0.27.1" +sqllogictest = "0.27.2" sqlparser = { workspace = true } tempfile = { workspace = true } testcontainers = { version = "0.23", features = ["default"], optional = true } From 84232d806070f4b177e35148255c501a5f6e740b Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 21 Feb 2025 00:22:22 +0800 Subject: [PATCH 34/71] Add additional info about memory reservation to the doc of MemoryPool (#14789) --- datafusion/execution/src/memory_pool/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs index 5e49debac7e7..71d40aeab53c 100644 --- a/datafusion/execution/src/memory_pool/mod.rs +++ b/datafusion/execution/src/memory_pool/mod.rs @@ -55,7 +55,10 @@ pub use pool::*; /// "large" amounts of memory (proportional to number of input rows), such as /// `GroupByHashExec`. It does NOT track and limit memory used internally by /// other operators such as `DataSourceExec` or the `RecordBatch`es that flow -/// between operators. 
+/// between operators. Furthermore, operators should not reserve memory for the +/// batches they produce. Instead, if a parent operator needs to hold batches +/// from its children in memory for an extended period, it is the parent +/// operator's responsibility to reserve the necessary memory for those batches. /// /// In order to avoid allocating memory until the OS or the container system /// kills the process, DataFusion `ExecutionPlan`s (operators) that consume From 83487e3c6bf60312e94c7cd68b0c100cdd82f21e Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Fri, 21 Feb 2025 11:57:39 +0800 Subject: [PATCH 35/71] =?UTF-8?q?feat:=20Improve=20datafusion-cli=20memory?= =?UTF-8?q?=20usage=20and=20considering=20reserve=20mem=E2=80=A6=20(#14766?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Improve datafusion-cli memory usage and considering reserve memory for the result batches * Address new comments * Address new comments * fix test * fix test * Address comments * Fix doc * Fix row count showing * Fix fmt * fix corner case * remove unused code --- datafusion-cli/src/command.rs | 4 ++- datafusion-cli/src/exec.rs | 43 +++++++++++++++++++++++------ datafusion-cli/src/print_options.rs | 2 +- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/datafusion-cli/src/command.rs b/datafusion-cli/src/command.rs index f0eb58a23391..fc7d1a2617cf 100644 --- a/datafusion-cli/src/command.rs +++ b/datafusion-cli/src/command.rs @@ -62,7 +62,9 @@ impl Command { Self::Help => { let now = Instant::now(); let command_batch = all_commands_info(); - print_options.print_batches(command_batch.schema(), &[command_batch], now) + let schema = command_batch.schema(); + let num_rows = command_batch.num_rows(); + print_options.print_batches(schema, &[command_batch], now, num_rows) } Self::ListTables => { exec_and_print(ctx, print_options, "SHOW TABLES".into()).await diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index a4f154b2de92..84664794b7d9 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -17,11 +17,6 @@ //! 
Execution functions -use std::collections::HashMap; -use std::fs::File; -use std::io::prelude::*; -use std::io::BufReader; - use crate::cli_context::CliSessionContext; use crate::helper::split_from_semicolon; use crate::print_format::PrintFormat; @@ -31,6 +26,11 @@ use crate::{ object_storage::get_object_store, print_options::{MaxRows, PrintOptions}, }; +use futures::StreamExt; +use std::collections::HashMap; +use std::fs::File; +use std::io::prelude::*; +use std::io::BufReader; use datafusion::common::instant::Instant; use datafusion::common::{plan_datafusion_err, plan_err}; @@ -39,10 +39,12 @@ use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{DdlStatement, LogicalPlan}; use datafusion::physical_plan::execution_plan::EmissionType; -use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; +use datafusion::physical_plan::{execute_stream, ExecutionPlanProperties}; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; +use datafusion::execution::memory_pool::MemoryConsumer; +use datafusion::physical_plan::spill::get_record_batch_memory_size; use datafusion::sql::sqlparser; use rustyline::error::ReadlineError; use rustyline::Editor; @@ -235,6 +237,10 @@ pub(super) async fn exec_and_print( let df = ctx.execute_logical_plan(plan).await?; let physical_plan = df.create_physical_plan().await?; + // Track memory usage for the query result if it's bounded + let mut reservation = + MemoryConsumer::new("DataFusion-Cli").register(task_ctx.memory_pool()); + if physical_plan.boundedness().is_unbounded() { if physical_plan.pipeline_behavior() == EmissionType::Final { return plan_err!( @@ -247,10 +253,29 @@ pub(super) async fn exec_and_print( let stream = execute_stream(physical_plan, task_ctx.clone())?; print_options.print_stream(stream, now).await?; } else { - // Bounded stream; collected results are printed after all input consumed. 
+ // Bounded stream; collected results size is limited by the maxrows option let schema = physical_plan.schema(); - let results = collect(physical_plan, task_ctx.clone()).await?; - adjusted.into_inner().print_batches(schema, &results, now)?; + let mut stream = execute_stream(physical_plan, task_ctx.clone())?; + let mut results = vec![]; + let mut row_count = 0_usize; + while let Some(batch) = stream.next().await { + let batch = batch?; + let curr_num_rows = batch.num_rows(); + if let MaxRows::Limited(max_rows) = print_options.maxrows { + // Stop collecting results if the number of rows exceeds the limit + // results batch should include the last batch that exceeds the limit + if row_count < max_rows + curr_num_rows { + // Try to grow the reservation to accommodate the batch in memory + reservation.try_grow(get_record_batch_memory_size(&batch))?; + results.push(batch); + } + } + row_count += curr_num_rows; + } + adjusted + .into_inner() + .print_batches(schema, &results, now, row_count)?; + reservation.free(); } } diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs index e80cc55663ae..9557e783e8a7 100644 --- a/datafusion-cli/src/print_options.rs +++ b/datafusion-cli/src/print_options.rs @@ -102,6 +102,7 @@ impl PrintOptions { schema: SchemaRef, batches: &[RecordBatch], query_start_time: Instant, + row_count: usize, ) -> Result<()> { let stdout = std::io::stdout(); let mut writer = stdout.lock(); @@ -109,7 +110,6 @@ impl PrintOptions { self.format .print_batches(&mut writer, schema, batches, self.maxrows, true)?; - let row_count: usize = batches.iter().map(|b| b.num_rows()).sum(); let formatted_exec_details = get_execution_details_formatted( row_count, if self.format == PrintFormat::Table { From c92df4febe7662b0da866741b173e2e6bfdff619 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Fri, 21 Feb 2025 12:42:51 +0800 Subject: [PATCH 36/71] Fix CI fail for extended test (by freeing up more disk space in CI runner) (#14745) * Fix extended test * feedback * Update datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs Co-authored-by: Bruce Ritchie * fix action version hash --------- Co-authored-by: Bruce Ritchie --- .github/workflows/extended.yml | 71 +++++++++++-------- .../sort_mem_validation.rs | 31 +++++++- 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 19910957a85b..0f52d14cdae2 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -39,43 +39,54 @@ jobs: linux-build-lib: name: linux build test runs-on: ubuntu-latest - container: - image: amd64/rust steps: - uses: actions/checkout@v4 - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder with: - rust-version: stable + submodules: true + fetch-depth: 1 + - name: Install Rust + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source $HOME/.cargo/env + rustup default stable + - name: Install Protobuf Compiler + run: sudo apt-get install -y protobuf-compiler - name: Prepare cargo build run: | cargo check --profile ci --all-targets cargo clean -# # Run extended tests (with feature 'extended_tests') -# # Disabling as it is running out of disk space -# # see https://github.com/apache/datafusion/issues/14576 -# linux-test-extended: -# name: cargo test 'extended_tests' (amd64) -# needs: linux-build-lib -# runs-on: ubuntu-latest -# container: -# image: amd64/rust -# steps: -# - uses: actions/checkout@v4 
-# with: -# submodules: true -# fetch-depth: 1 -# - name: Setup Rust toolchain -# uses: ./.github/actions/setup-builder -# with: -# rust-version: stable -# - name: Run tests (excluding doctests) -# run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests -# - name: Verify Working Directory Clean -# run: git diff --exit-code -# - name: Cleanup -# run: cargo clean + # Run extended tests (with feature 'extended_tests') + linux-test-extended: + name: cargo test 'extended_tests' (amd64) + needs: linux-build-lib + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be + - name: Install Rust + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source $HOME/.cargo/env + rustup default stable + - name: Install Protobuf Compiler + run: sudo apt-get install -y protobuf-compiler + # For debugging, test binaries can be large. + - name: Show available disk space + run: | + df -h + - name: Run tests (excluding doctests) + env: + RUST_BACKTRACE: 1 + run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests + - name: Verify Working Directory Clean + run: git diff --exit-code + - name: Cleanup + run: cargo clean # Check answers are correct when hash values collide hash-collisions: @@ -95,7 +106,7 @@ jobs: - name: Run tests run: | cd datafusion - cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro,extended_tests + cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro cargo clean sqllogictest-sqlite: diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs index 1789f37535a9..64ab1378340a 100644 --- a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs +++ b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs @@ -21,12 +21,14 @@ //! This file is organized as: //! - Test runners that spawn individual test processes //! - Test cases that contain the actual validation logic -use std::{process::Command, str}; - use log::info; +use std::sync::Once; +use std::{process::Command, str}; use crate::memory_limit::memory_limit_validation::utils; +static INIT: Once = Once::new(); + // =========================================================================== // Test runners: // Runners are splitted into multiple tests to run in parallel @@ -67,10 +69,35 @@ fn sort_with_mem_limit_2_cols_2_runner() { spawn_test_process("sort_with_mem_limit_2_cols_2"); } +/// `spawn_test_process` might trigger multiple recompilations and the test binary +/// size might grow indefinitely. This initializer ensures recompilation is only done +/// once and the target size is bounded. 
+/// +/// TODO: This is a hack, can be cleaned up if we have a better way to let multiple +/// test cases run in different processes (instead of different threads by default) +fn init_once() { + INIT.call_once(|| { + let _ = Command::new("cargo") + .arg("test") + .arg("--no-run") + .arg("--package") + .arg("datafusion") + .arg("--test") + .arg("core_integration") + .arg("--features") + .arg("extended_tests") + .env("DATAFUSION_TEST_MEM_LIMIT_VALIDATION", "1") + .output() + .expect("Failed to execute test command"); + }); +} + /// Helper function that executes a test in a separate process with the required environment /// variable set. Memory limit validation tasks need to measure memory resident set /// size (RSS), so they must run in a separate process. fn spawn_test_process(test: &str) { + init_once(); + let test_path = format!( "memory_limit::memory_limit_validation::sort_mem_validation::{}", test From 2325b6cbf8fd93cccdd39beaaaf7da0c588d6135 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 21 Feb 2025 00:52:08 -0800 Subject: [PATCH 37/71] chore: Benchmark deps cleanup (#14793) * Synchronize criterion version across workspace Use same version everywhere. This means update for some crates. * Synchronize and update rand version across workspace Use same version everywhere and update. * Avoid linear search for character in bench helper Get the char in O(1) instead of O(n). --- Cargo.toml | 3 ++- datafusion/core/Cargo.toml | 2 +- datafusion/functions-aggregate-common/Cargo.toml | 2 +- datafusion/functions-aggregate/Cargo.toml | 2 +- datafusion/functions-nested/Cargo.toml | 4 ++-- datafusion/functions/Cargo.toml | 2 +- datafusion/functions/benches/helper.rs | 5 ++--- datafusion/physical-expr/Cargo.toml | 2 +- datafusion/physical-plan/Cargo.toml | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ccf3f02a2fde..b6098a636954 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ async-trait = "0.1.73" bigdecimal = "0.4.7" bytes = "1.10" chrono = { version = "0.4.38", default-features = false } +criterion = "0.5.1" ctor = "0.2.9" dashmap = "6.0.1" datafusion = { path = "datafusion/core", version = "45.0.0", default-features = false } @@ -144,7 +145,7 @@ pbjson-types = "0.7" # Should match arrow-flight's version of prost. 
prost = "0.13.1" prost-derive = "0.13.1" -rand = "0.8" +rand = "0.8.5" recursive = "0.1.1" regex = "1.8" rstest = "0.24.0" diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index e968967a2e75..87a37248a31d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -131,7 +131,7 @@ zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] async-trait = { workspace = true } -criterion = { version = "0.5", features = ["async_tokio"] } +criterion = { workspace = true, features = ["async_tokio"] } ctor = { workspace = true } dashmap = "6.1.0" datafusion-doc = { workspace = true } diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml index 72c8a58a7e45..cf065ca1cb17 100644 --- a/datafusion/functions-aggregate-common/Cargo.toml +++ b/datafusion/functions-aggregate-common/Cargo.toml @@ -44,7 +44,7 @@ datafusion-expr-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } [dev-dependencies] -criterion = "0.5" +criterion = { workspace = true } rand = { workspace = true } [[bench]] diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 38052835f197..ec6e6b633bb8 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -54,7 +54,7 @@ paste = "1.0.14" [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } -criterion = "0.5" +criterion = { workspace = true } rand = { workspace = true } [[bench]] diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index b33b415a868d..9a7b1f460ef5 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -53,8 +53,8 @@ log = { workspace = true } paste = "1.0.14" [dev-dependencies] -criterion = { version = "0.5", features = ["async_tokio"] } -rand = "0.8.5" +criterion = { workspace = true, features = ["async_tokio"] } +rand = { workspace = true } [[bench]] harness = false diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 788bc67d970c..b44127d6a1b7 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -89,7 +89,7 @@ uuid = { version = "1.13", features = ["v4"], optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } -criterion = "0.5" +criterion = { workspace = true } rand = { workspace = true } tokio = { workspace = true, features = ["macros", "rt", "sync"] } diff --git a/datafusion/functions/benches/helper.rs b/datafusion/functions/benches/helper.rs index c7c405bc4696..0dbb4b0027d4 100644 --- a/datafusion/functions/benches/helper.rs +++ b/datafusion/functions/benches/helper.rs @@ -35,7 +35,7 @@ pub fn gen_string_array( let rng_ref = &mut rng; let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes - let corpus_char_count = corpus.chars().count(); + let corpus = corpus.chars().collect::>(); let mut output_string_vec: Vec> = Vec::with_capacity(n_rows); for _ in 0..n_rows { @@ -46,8 +46,7 @@ pub fn gen_string_array( // Generate random UTF8 string let mut generated_string = String::with_capacity(str_len_chars); for _ in 0..str_len_chars { - let idx = rng_ref.gen_range(0..corpus_char_count); - let char = corpus.chars().nth(idx).unwrap(); + let char = corpus[rng_ref.gen_range(0..corpus.len())]; generated_string.push(char); } output_string_vec.push(Some(generated_string)); diff --git 
a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index a3321f493388..72baa0db00a2 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -55,7 +55,7 @@ petgraph = "0.7.1" [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } -criterion = "0.5" +criterion = { workspace = true } datafusion-functions = { workspace = true } rand = { workspace = true } rstest = { workspace = true } diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 676663114702..ae0ea60e1a3e 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -65,7 +65,7 @@ pin-project-lite = "^0.2.7" tokio = { workspace = true } [dev-dependencies] -criterion = { version = "0.5", features = ["async_futures"] } +criterion = { workspace = true, features = ["async_futures"] } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } rand = { workspace = true } From 4d6e3343d2ee6de0b7d86a9062478949457d6060 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 21 Feb 2025 00:53:02 -0800 Subject: [PATCH 38/71] Fix test not to litter in repository (#14795) Before the change, executing the test would create (and leave behind) the `datafusion/execution/DOESNT_EXIST` folder inside source repository. --- datafusion/execution/src/disk_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/execution/src/disk_manager.rs b/datafusion/execution/src/disk_manager.rs index 756da7ed5b46..caa62eefe14c 100644 --- a/datafusion/execution/src/disk_manager.rs +++ b/datafusion/execution/src/disk_manager.rs @@ -250,7 +250,8 @@ mod tests { #[test] fn test_disk_manager_create_spill_folder() { - let config = DiskManagerConfig::new_specified(vec!["DOESNT_EXIST".into()]); + let dir = TempDir::new().unwrap(); + let config = DiskManagerConfig::new_specified(vec![dir.path().to_owned()]); DiskManager::try_new(config) .unwrap() From fde2239e831a00a5d2334b4f5dbd71ec34a07991 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:53:33 +0800 Subject: [PATCH 39/71] chore(deps): bump testcontainers from 0.23.2 to 0.23.3 (#14787) Bumps [testcontainers](https://github.com/testcontainers/testcontainers-rs) from 0.23.2 to 0.23.3. - [Release notes](https://github.com/testcontainers/testcontainers-rs/releases) - [Changelog](https://github.com/testcontainers/testcontainers-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/testcontainers/testcontainers-rs/compare/0.23.2...0.23.3) --- updated-dependencies: - dependency-name: testcontainers dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0eae57c106e9..629b2cfb03c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5891,9 +5891,9 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "042009c52a4204476bff461ca8ef17bab6f1a91628504a8a36c6fd2c1cde2d5e" +checksum = "59a4f01f39bb10fc2a5ab23eb0d888b1e2bb168c157f61a1b98e6c501c639c74" dependencies = [ "async-trait", "bollard", From 188e0d31d192fa6168e9529459136677968829f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:27:02 +0800 Subject: [PATCH 40/71] chore(deps): bump serde from 1.0.217 to 1.0.218 (#14788) Bumps [serde](https://github.com/serde-rs/serde) from 1.0.217 to 1.0.218. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.217...v1.0.218) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: jonahgao --- Cargo.lock | 8 ++++---- benchmarks/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 629b2cfb03c1..357f44c17fa0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5323,9 +5323,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] @@ -5341,9 +5341,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index ad8debaf2fa3..860089063c4d 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -43,7 +43,7 @@ futures = { workspace = true } log = { workspace = true } mimalloc = { version = "0.1", optional = true, default-features = false } parquet = { workspace = true, default-features = true } -serde = { version = "1.0.136", features = ["derive"] } +serde = { version = "1.0.218", features = ["derive"] } serde_json = { workspace = true } snmalloc-rs = { version = "0.3", optional = true } structopt = { version = "0.3", default-features = false } From b66beb8d509cc488a43080efcca77dfd2e6ee603 Mon Sep 17 00:00:00 2001 From: logan-keede <68557630+logan-keede@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:19:35 +0530 Subject: [PATCH 41/71] refactor: move `DataSource` to `datafusion-datasource` (#14671) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * build moves only tests+benches pending * unstable * some tests fixed * Mock MemorySourceConfig and DataSource * some cleanup * test 
pass but Mock is not efficient * temporary stable * one struct, test pass, cleaning pending * cleaning * more cleaning * clippy * 🧹🧹*cleaning*🧹🧹 * adding re-export * fix:cargo fmt * fix: doctest * fix: leftout doctest * fix: circular dependency * clean, rename, document, improve --- Cargo.lock | 110 +-- datafusion-cli/src/functions.rs | 2 +- .../examples/parquet_exec_visitor.rs | 2 +- .../examples/remote_catalog.rs | 2 +- datafusion-examples/examples/simple_udtf.rs | 2 +- datafusion/core/Cargo.toml | 4 + datafusion/core/benches/physical_plan.rs | 2 +- datafusion/core/benches/sort.rs | 3 +- .../{physical-plan => core}/benches/spm.rs | 2 +- datafusion/core/src/datasource/empty.rs | 3 +- .../core/src/datasource/listing/table.rs | 3 +- datafusion/core/src/datasource/memory.rs | 4 +- datafusion/core/src/datasource/mod.rs | 2 + .../datasource/physical_plan/arrow_file.rs | 2 +- .../core/src/datasource/physical_plan/avro.rs | 2 +- .../core/src/datasource/physical_plan/csv.rs | 4 +- .../physical_plan/file_scan_config.rs | 2 +- .../core/src/datasource/physical_plan/json.rs | 2 +- .../datasource/physical_plan/parquet/mod.rs | 4 +- .../physical_plan/parquet/source.rs | 6 +- datafusion/core/src/physical_planner.rs | 6 +- datafusion/core/src/test/mod.rs | 2 +- datafusion/core/src/test_util/parquet.rs | 2 +- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 4 +- datafusion/core/tests/fuzz_cases/join_fuzz.rs | 4 +- .../core/tests/fuzz_cases/merge_fuzz.rs | 2 +- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 2 +- .../sort_preserving_repartition_fuzz.rs | 4 +- .../core/tests/fuzz_cases/window_fuzz.rs | 4 +- datafusion/core/tests/memory_limit/mod.rs | 4 +- .../core/tests/parquet/file_statistics.rs | 2 +- datafusion/core/tests/parquet/page_pruning.rs | 2 +- datafusion/core/tests/parquet/utils.rs | 2 +- .../aggregate_statistics.rs | 4 +- .../enforce_distribution.rs | 2 +- .../physical_optimizer/projection_pushdown.rs | 4 +- .../replace_with_order_preserving_variants.rs | 4 +- .../tests/physical_optimizer/test_utils.rs | 4 +- datafusion/core/tests/sql/path_partition.rs | 2 +- .../user_defined_table_functions.rs | 2 +- datafusion/datasource/Cargo.toml | 2 + datafusion/datasource/src/memory.rs | 926 ++++++++++++++++++ datafusion/datasource/src/mod.rs | 2 + .../src/source.rs | 10 +- datafusion/physical-optimizer/Cargo.toml | 1 + datafusion/physical-plan/Cargo.toml | 4 - .../physical-plan/src/aggregates/mod.rs | 19 +- datafusion/physical-plan/src/empty.rs | 4 +- datafusion/physical-plan/src/filter.rs | 1 - .../physical-plan/src/joins/hash_join.rs | 30 +- .../src/joins/nested_loop_join.rs | 7 +- .../src/joins/sort_merge_join.rs | 14 +- .../physical-plan/src/joins/test_utils.rs | 11 +- datafusion/physical-plan/src/lib.rs | 2 - datafusion/physical-plan/src/memory.rs | 874 +---------------- .../physical-plan/src/placeholder_row.rs | 9 +- .../physical-plan/src/recursive_query.rs | 12 +- .../physical-plan/src/repartition/mod.rs | 26 +- .../physical-plan/src/sorts/partial_sort.rs | 8 +- datafusion/physical-plan/src/sorts/sort.rs | 8 +- .../src/sorts/sort_preserving_merge.rs | 26 +- datafusion/physical-plan/src/test.rs | 340 ++++++- datafusion/physical-plan/src/union.rs | 11 +- datafusion/physical-plan/src/values.rs | 7 +- .../src/windows/bounded_window_agg_exec.rs | 4 +- datafusion/physical-plan/src/work_table.rs | 6 +- datafusion/proto/src/physical_plan/mod.rs | 2 +- .../substrait/src/physical_plan/producer.rs | 2 +- 68 files changed, 1479 insertions(+), 1114 deletions(-) rename datafusion/{physical-plan => 
core}/benches/spm.rs (98%) create mode 100644 datafusion/datasource/src/memory.rs rename datafusion/{physical-plan => datasource}/src/source.rs (95%) diff --git a/Cargo.lock b/Cargo.lock index 357f44c17fa0..716e0cf10386 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -674,9 +674,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.58.0" +version = "1.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ff718c9ee45cc1ebd4774a0e086bb80a6ab752b4902edf1c9f56b86ee1f770" +checksum = "00a35fc7e74f5be45839eb753568535c074a592185dd0a2d406685018d581c43" dependencies = [ "aws-credential-types", "aws-runtime", @@ -696,9 +696,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.59.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5183e088715cc135d8d396fdd3bc02f018f0da4c511f53cb8d795b6a31c55809" +checksum = "f8fa655b4f313124ce272cbc38c5fef13793c832279cec750103e5e6b71a54b8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -718,9 +718,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.59.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9f944ef032717596639cea4a2118a3a457268ef51bbb5fde9637e54c465da00" +checksum = "dc1cfe5e16b90421ea031f4c6348b534ef442e76f6bf4a1b2b592c12cc2c6af9" dependencies = [ "aws-credential-types", "aws-runtime", @@ -741,9 +741,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -1078,7 +1078,7 @@ dependencies = [ "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -1249,9 +1249,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.13" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -1643,9 +1643,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -1918,6 +1918,8 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", "flate2", "futures", @@ -2230,6 +2232,7 @@ version = "45.0.0" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2563,9 +2566,9 @@ dependencies = [ [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] 
name = "errno" @@ -2924,9 +2927,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", @@ -3134,7 +3137,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "httparse", @@ -3187,7 +3190,7 @@ dependencies = [ "http 1.2.0", "hyper 1.6.0", "hyper-util", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", @@ -3796,9 +3799,9 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] @@ -4494,9 +4497,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -4611,7 +4614,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.22", + "rustls 0.23.23", "socket2", "thiserror 2.0.11", "tokio", @@ -4629,7 +4632,7 @@ dependencies = [ "rand 0.8.5", "ring", "rustc-hash", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-pki-types", "slab", "thiserror 2.0.11", @@ -4640,9 +4643,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -4695,8 +4698,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha 0.9.0", - "rand_core 0.9.0", - "zerocopy 0.8.17", + "rand_core 0.9.1", + "zerocopy 0.8.18", ] [[package]] @@ -4716,7 +4719,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.0", + "rand_core 0.9.1", ] [[package]] @@ -4730,12 +4733,12 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff" +checksum = "a88e0da7a2c97baa202165137c158d0a2e824ac465d13d81046727b34cb247d3" dependencies = [ "getrandom 0.3.1", - "zerocopy 0.8.17", + "zerocopy 0.8.18", ] [[package]] @@ -4896,7 +4899,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -4911,7 +4914,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -4934,15 +4937,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" 
+version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -5088,9 +5090,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "once_cell", "ring", @@ -5506,9 +5508,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "snafu" @@ -5565,12 +5567,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sqllogictest" version = "0.27.2" @@ -6133,7 +6129,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.22", + "rustls 0.23.23", "tokio", ] @@ -6204,7 +6200,7 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -6360,9 +6356,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typify" @@ -7094,11 +7090,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.17" +version = "0.8.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa91407dacce3a68c56de03abe2760159582b846c6a4acd2f456618087f12713" +checksum = "79386d31a42a4996e3336b0919ddb90f81112af416270cff95b5f5af22b839c2" dependencies = [ - "zerocopy-derive 0.8.17", + "zerocopy-derive 0.8.18", ] [[package]] @@ -7114,9 +7110,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.17" +version = "0.8.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06718a168365cad3d5ff0bb133aad346959a2074bd4a85c121255a11304a8626" +checksum = "76331675d372f91bf8d17e13afbd5fe639200b73d01f0fc748bb059f9cca2db7" dependencies = [ "proc-macro2", "quote", diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index b5bcb8243ea9..13d2d5fd3547 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -28,10 +28,10 @@ use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion::catalog::{Session, TableFunctionImpl}; use datafusion::common::{plan_err, Column}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::logical_expr::Expr; -use 
datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::ExecutionPlan; use datafusion::scalar::ScalarValue; diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/parquet_exec_visitor.rs index 20809a1121c1..6c9f1a354430 100644 --- a/datafusion-examples/examples/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/parquet_exec_visitor.rs @@ -20,10 +20,10 @@ use std::sync::Arc; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, PartitionedFile}; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::error::DataFusionError; use datafusion::execution::context::SessionContext; use datafusion::physical_plan::metrics::MetricValue; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::{ execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, }; diff --git a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/remote_catalog.rs index f84c6a0302ce..70c0963545e0 100644 --- a/datafusion-examples/examples/remote_catalog.rs +++ b/datafusion-examples/examples/remote_catalog.rs @@ -36,9 +36,9 @@ use datafusion::catalog::TableProvider; use datafusion::catalog::{AsyncSchemaProvider, Session}; use datafusion::common::Result; use datafusion::common::{assert_batches_eq, internal_datafusion_err, plan_err}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::SendableRecordBatchStream; use datafusion::logical_expr::{Expr, TableType}; -use datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{DataFrame, SessionContext}; diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/simple_udtf.rs index afba4c390f71..d2b2d1bf9655 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/simple_udtf.rs @@ -23,13 +23,13 @@ use datafusion::arrow::record_batch::RecordBatch; use datafusion::catalog::Session; use datafusion::catalog::TableFunctionImpl; use datafusion::common::{plan_err, ScalarValue}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{Expr, TableType}; use datafusion::optimizer::simplify_expressions::ExprSimplifier; -use datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; use std::fs::File; diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 87a37248a31d..1e0f27ccdfc8 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -221,3 +221,7 @@ required-features = ["nested_expressions"] [[bench]] harness = false name = "dataframe" + +[[bench]] +harness = false +name = "spm" diff --git a/datafusion/core/benches/physical_plan.rs b/datafusion/core/benches/physical_plan.rs index 53c245ecc2b5..aae1457ab9e6 100644 --- a/datafusion/core/benches/physical_plan.rs +++ b/datafusion/core/benches/physical_plan.rs @@ -33,9 +33,9 @@ use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMerge use datafusion::physical_plan::{ collect, 
expressions::{col, PhysicalSortExpr}, - memory::MemorySourceConfig, }; use datafusion::prelude::SessionContext; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_physical_expr_common::sort_expr::LexOrdering; // Initialize the operator using the provided record batches and the sort key diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs index 4d71d4c56a6d..8f0b3753f67c 100644 --- a/datafusion/core/benches/sort.rs +++ b/datafusion/core/benches/sort.rs @@ -79,12 +79,13 @@ use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::{ execution::context::TaskContext, physical_plan::{ - coalesce_partitions::CoalescePartitionsExec, memory::MemorySourceConfig, + coalesce_partitions::CoalescePartitionsExec, sorts::sort_preserving_merge::SortPreservingMergeExec, ExecutionPlan, ExecutionPlanProperties, }, prelude::SessionContext, }; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_physical_expr::{expressions::col, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; diff --git a/datafusion/physical-plan/benches/spm.rs b/datafusion/core/benches/spm.rs similarity index 98% rename from datafusion/physical-plan/benches/spm.rs rename to datafusion/core/benches/spm.rs index 3a2ecb57394b..63b06f20cd86 100644 --- a/datafusion/physical-plan/benches/spm.rs +++ b/datafusion/core/benches/spm.rs @@ -27,7 +27,7 @@ use datafusion_physical_plan::{collect, ExecutionPlan}; use criterion::async_executor::FuturesExecutor; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_physical_plan::memory::MemorySourceConfig; +use datafusion_datasource::memory::MemorySourceConfig; fn generate_spm_for_round_robin_tie_breaker( has_same_value: bool, diff --git a/datafusion/core/src/datasource/empty.rs b/datafusion/core/src/datasource/empty.rs index abda7fa9ec4b..77686c5eb7c2 100644 --- a/datafusion/core/src/datasource/empty.rs +++ b/datafusion/core/src/datasource/empty.rs @@ -28,7 +28,8 @@ use datafusion_common::project_schema; use crate::datasource::{TableProvider, TableType}; use crate::error::Result; use crate::logical_expr::Expr; -use crate::physical_plan::{empty::EmptyExec, ExecutionPlan}; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::ExecutionPlan; /// An empty plan that is useful for testing and generating plans /// without mapping them to actual data. 
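The import rewrites in this patch all follow one pattern: code that previously pulled MemorySourceConfig or DataSourceExec out of datafusion_physical_plan now imports them from the datasource crate or its re-export under datafusion::datasource. A minimal sketch of what downstream code looks like after the move; the new import paths are taken from the hunks in this patch, while the MemorySourceConfig::try_new_exec constructor is an assumption about the crate's API and may differ between releases:

use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
// New locations after this refactor; the old datafusion_physical_plan paths are deprecated.
use datafusion::datasource::memory::MemorySourceConfig;
use datafusion::datasource::source::DataSourceExec;
use datafusion::error::Result;

fn in_memory_scan() -> Result<Arc<DataSourceExec>> {
    // Single-column schema with one partition containing one batch.
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    // No projection; `try_new_exec` is assumed here (see note above). The config wraps
    // the batches and DataSourceExec executes the scan.
    MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)
}
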
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3be8af59ea2a..819da155a1a2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -37,7 +37,8 @@ use datafusion_common::{config_err, DataFusionError, Result}; use datafusion_expr::dml::InsertOp; use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; -use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::{ExecutionPlan, Statistics}; use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; use datafusion_common::{ diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 94c6e45804e8..b8bec410070c 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -38,11 +38,11 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_catalog::Session; use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; +pub use datafusion_datasource::memory::MemorySourceConfig; +pub use datafusion_datasource::source::DataSourceExec; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; use datafusion_expr::SortExpr; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use async_trait::async_trait; use futures::StreamExt; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 12dd9d7cab38..96687913fb42 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -36,6 +36,8 @@ mod statistics; pub mod stream; pub mod view; +pub use datafusion_datasource::source; + // backwards compatibility pub use self::default_table_source::{ provider_as_source, source_as_provider, DefaultTableSource, diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 4a7cdc192cd3..c6e05893a979 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -32,12 +32,12 @@ use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index b0a1d8c8c9e2..1a88dc31a64d 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -29,12 +29,12 @@ use crate::error::Result; use arrow::datatypes::SchemaRef; use datafusion_common::{Constraints, Statistics}; +use 
datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index c0952229b5e0..412c90726af0 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -37,13 +37,13 @@ use arrow::csv; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use datafusion_physical_plan::projection::ProjectionExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, PlanProperties}; use futures::{StreamExt, TryStreamExt}; @@ -409,7 +409,7 @@ impl ExecutionPlan for CsvExec { /// # }; /// # use datafusion::datasource::physical_plan::CsvSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # let object_store_url = ObjectStoreUrl::local_filesystem(); /// # let file_schema = Arc::new(Schema::empty()); diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 123ecc2f9582..4996b6d97b58 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -36,13 +36,13 @@ use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, Partitioning} use crate::datasource::data_source::FileSource; pub use datafusion_datasource::file_scan_config::*; +use datafusion_datasource::source::{DataSource, DataSourceExec}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_plan::display::{display_orderings, ProjectSchemaDisplay}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::{ all_alias_free_columns, new_projections_for_columns, ProjectionExec, }; -use datafusion_physical_plan::source::{DataSource, DataSourceExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; /// Convert type to a type suitable for use as a [`ListingTable`] diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 590b1cb88dcd..249f50efa544 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -35,12 +35,12 @@ use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use 
arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, PlanProperties}; use futures::{StreamExt, TryStreamExt}; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 4bd43cd1aaca..2a2d6d7fefdf 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -50,10 +50,10 @@ pub use access_plan::{ParquetAccessPlan, RowGroupAccess}; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::Constraints; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; use datafusion_physical_optimizer::pruning::PruningPredicate; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; -use datafusion_physical_plan::source::DataSourceExec; pub use metrics::ParquetFileMetrics; pub use page_filter::PagePruningAccessPlanFilter; pub use reader::{DefaultParquetFileReaderFactory, ParquetFileReaderFactory}; @@ -579,10 +579,10 @@ mod tests { use arrow::record_batch::RecordBatch; use bytes::{BufMut, BytesMut}; use datafusion_common::{assert_contains, ScalarValue}; + use datafusion_datasource::source::DataSourceExec; use datafusion_expr::{col, lit, when, Expr}; use datafusion_physical_expr::planner::logical2physical; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; - use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use crate::datasource::physical_plan::parquet::source::ParquetSource; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/source.rs b/datafusion/core/src/datasource/physical_plan/parquet/source.rs index 21881112075d..810a16de41af 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/source.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/source.rs @@ -81,7 +81,7 @@ use object_store::ObjectStore; /// # use datafusion::datasource::listing::PartitionedFile; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_physical_expr::expressions::lit; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// # use datafusion_common::config::TableParquetOptions; /// /// # let file_schema = Arc::new(Schema::empty()); @@ -160,7 +160,7 @@ use object_store::ObjectStore; /// # use arrow::datatypes::Schema; /// # use datafusion::datasource::physical_plan::FileScanConfig; /// # use datafusion::datasource::listing::PartitionedFile; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # fn parquet_exec() -> DataSourceExec { unimplemented!() } /// // Split a single DataSourceExec into multiple DataSourceExecs, one for each file @@ -202,7 +202,7 
@@ use object_store::ObjectStore; /// # use datafusion::datasource::physical_plan::FileScanConfig; /// # use datafusion::datasource::physical_plan::parquet::source::ParquetSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # fn schema() -> SchemaRef { /// # Arc::new(Schema::empty()) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index d73b7d81536a..a74cdcc5920b 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -38,7 +38,6 @@ use crate::logical_expr::{ use crate::physical_expr::{create_physical_expr, create_physical_exprs}; use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use crate::physical_plan::analyze::AnalyzeExec; -use crate::physical_plan::empty::EmptyExec; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::filter::FilterExec; @@ -48,7 +47,6 @@ use crate::physical_plan::joins::{ }; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; -use crate::physical_plan::recursive_query::RecursiveQueryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::union::UnionExec; @@ -58,6 +56,8 @@ use crate::physical_plan::{ displayable, windows, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, Partitioning, PhysicalExpr, WindowExpr, }; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use arrow::array::{builder::StringBuilder, RecordBatch}; use arrow::compute::SortOptions; @@ -68,6 +68,7 @@ use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, }; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_expr::dml::{CopyTo, InsertOp}; use datafusion_expr::expr::{ physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, @@ -84,7 +85,6 @@ use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::LexOrdering; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::execution_plan::InvariantLevel; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::unnest::ListUnnest; diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index ba85f9afb6da..c569113a27bd 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -42,7 +42,7 @@ use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::DataFusionError; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion_datasource::source::DataSourceExec; #[cfg(feature = "compression")] use bzip2::write::BzEncoder; diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 0e0090ef028e..fc98b43051f8 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,7 +37,7 @@ use crate::physical_plan::metrics::MetricsSet; use 
crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 5e1f263b4c76..1025a49ea1e3 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -30,6 +30,8 @@ use arrow::datatypes::{ }; use arrow::util::pretty::pretty_format_batches; use datafusion::common::Result; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::MemTable; use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan::aggregates::{ @@ -43,8 +45,6 @@ use datafusion_functions_aggregate::sum::sum_udaf; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::InputOrderMode; use test_utils::{add_empty_batches, StringBatchGenerator}; diff --git a/datafusion/core/tests/fuzz_cases/join_fuzz.rs b/datafusion/core/tests/fuzz_cases/join_fuzz.rs index 5dd29f90ef83..da93dd5edf29 100644 --- a/datafusion/core/tests/fuzz_cases/join_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/join_fuzz.rs @@ -26,6 +26,8 @@ use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion::common::JoinSide; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::logical_expr::{JoinType, Operator}; use datafusion::physical_expr::expressions::BinaryExpr; use datafusion::physical_plan::collect; @@ -38,8 +40,6 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::ScalarValue; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::PhysicalExprRef; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use itertools::Itertools; use rand::Rng; diff --git a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs index 35fca789ddcb..92f375525066 100644 --- a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs @@ -24,10 +24,10 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::physical_plan::{ collect, expressions::{col, PhysicalSortExpr}, - memory::MemorySourceConfig, sorts::sort_preserving_merge::SortPreservingMergeExec, }; use datafusion::prelude::{SessionConfig, SessionContext}; diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 51a5bc87efd9..0b0f0aa2f105 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -24,6 +24,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use 
datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::sorts::sort::SortExec; @@ -33,7 +34,6 @@ use datafusion_common::cast::as_int32_array; use datafusion_execution::memory_pool::GreedyMemoryPool; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; use rand::Rng; use test_utils::{batches_to_vec, partitions_to_sorted_vec}; diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index d23408743f9f..06b93d41af36 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -44,9 +44,9 @@ mod sp_repartition_fuzz_tests { }; use test_utils::add_empty_batches; + use datafusion::datasource::memory::MemorySourceConfig; + use datafusion::datasource::source::DataSourceExec; use datafusion_physical_expr_common::sort_expr::LexOrdering; - use datafusion_physical_plan::memory::MemorySourceConfig; - use datafusion_physical_plan::source::DataSourceExec; use itertools::izip; use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 4a484221a88a..a7f9e38c9ae3 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -22,6 +22,8 @@ use arrow::compute::{concat_batches, SortOptions}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::functions_window::row_number::row_number_udwf; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::windows::{ @@ -48,8 +50,6 @@ use datafusion_functions_window::rank::{dense_rank_udwf, rank_udwf}; use datafusion_physical_expr::expressions::{cast, col, lit}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use rand::distributions::Alphanumeric; use rand::rngs::StdRng; diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 669294d38af1..a1985a1aa447 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -27,6 +27,8 @@ use arrow::array::{ArrayRef, DictionaryArray, RecordBatch}; use arrow::compute::SortOptions; use arrow::datatypes::{Int32Type, SchemaRef}; use datafusion::assert_batches_eq; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::{MemTable, TableProvider}; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; @@ -46,8 +48,6 @@ use datafusion_expr::{Expr, TableType}; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_optimizer::join_selection::JoinSelection; use datafusion_physical_optimizer::PhysicalOptimizerRule; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use 
datafusion_physical_plan::spill::get_record_batch_memory_size; use test_utils::AccessLogGenerator; diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 82024a731ed3..ad75cf2607c4 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -22,6 +22,7 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; @@ -34,7 +35,6 @@ use datafusion_execution::cache::cache_unit::{ use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::{col, lit, Expr}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion::datasource::physical_plan::FileScanConfig; use tempfile::tempdir; diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index ea86bf3685bb..fe96a2eb5e71 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -25,6 +25,7 @@ use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::execution::context::SessionState; use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; @@ -33,7 +34,6 @@ use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{col, lit, Expr}; use datafusion_physical_expr::create_physical_expr; -use datafusion_physical_plan::source::DataSourceExec; use futures::StreamExt; use object_store::path::Path; diff --git a/datafusion/core/tests/parquet/utils.rs b/datafusion/core/tests/parquet/utils.rs index dd5541461ff6..8cb50b22cf63 100644 --- a/datafusion/core/tests/parquet/utils.rs +++ b/datafusion/core/tests/parquet/utils.rs @@ -18,8 +18,8 @@ //! 
Utilities for parquet tests use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_physical_plan::metrics::MetricsSet; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{accept, ExecutionPlan, ExecutionPlanVisitor}; /// Find the metrics from the first DataSourceExec encountered in the plan diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs index 1757c7150bfe..a79d743cb253 100644 --- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs @@ -22,6 +22,8 @@ use crate::physical_optimizer::test_utils::TestAggregate; use arrow::array::Int32Array; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::cast::as_int64_array; use datafusion_common::config::ConfigOptions; use datafusion_common::Result; @@ -36,9 +38,7 @@ use datafusion_physical_plan::aggregates::PhysicalGroupBy; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::common; use datafusion_physical_plan::filter::FilterExec; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::projection::ProjectionExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::ExecutionPlan; /// Mock data using a MemorySourceConfig which has an exact count statistic diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 50c67f09c704..66d1380e09c3 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -31,6 +31,7 @@ use datafusion::datasource::file_format::file_compression_type::FileCompressionT use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; @@ -57,7 +58,6 @@ use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::projection::ProjectionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_physical_plan::PlanProperties; diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 89bd97881e3a..dfba57a584ea 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -21,7 +21,9 @@ use std::sync::Arc; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::datasource::listing::PartitionedFile; +use 
datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::Result; use datafusion_common::{JoinSide, JoinType, ScalarValue}; @@ -46,12 +48,10 @@ use datafusion_physical_plan::joins::{ HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::projection::{update_expr, ProjectionExec}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::streaming::PartitionStream; use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index d9b569dfa611..58eb866c590c 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -32,13 +32,13 @@ use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::collect; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; -use datafusion_physical_plan::memory::MemorySourceConfig; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::{ displayable, get_plan_string, ExecutionPlan, Partitioning, }; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::tree_node::{TransformedResult, TreeNode}; use datafusion_common::Result; use datafusion_expr::{JoinType, Operator}; diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 162f93facc90..e4d72c112c38 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -26,7 +26,9 @@ use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; @@ -52,11 +54,9 @@ use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::utils::{JoinFilter, JoinOn}; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec}; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::repartition::RepartitionExec; use 
datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::tree_node::PlanContext; use datafusion_physical_plan::union::UnionExec; diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs index 6345f5e4352f..1a19bfe9e86f 100644 --- a/datafusion/core/tests/sql/path_partition.rs +++ b/datafusion/core/tests/sql/path_partition.rs @@ -26,6 +26,7 @@ use std::sync::Arc; use arrow::datatypes::DataType; use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::{ assert_batches_sorted_eq, datasource::{ @@ -43,7 +44,6 @@ use datafusion_common::ScalarValue; use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit, Expr, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; -use datafusion_physical_plan::source::DataSourceExec; use async_trait::async_trait; use bytes::Bytes; diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index 0ec9a5fd7620..618f0590ab3d 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -26,6 +26,7 @@ use arrow::csv::ReaderBuilder; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::TaskContext; @@ -35,7 +36,6 @@ use datafusion_catalog::Session; use datafusion_catalog::TableFunctionImpl; use datafusion_common::{assert_batches_eq, DFSchema, ScalarValue}; use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, Projection, TableType}; -use datafusion_physical_plan::memory::MemorySourceConfig; use async_trait::async_trait; diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index bf1f8789ac43..a1d7af106f69 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -52,6 +52,8 @@ datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-physical-expr = { workspace = true } +datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } flate2 = { version = "1.0.24", optional = true } futures = { workspace = true } diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs new file mode 100644 index 000000000000..efb178ad078e --- /dev/null +++ b/datafusion/datasource/src/memory.rs @@ -0,0 +1,926 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Execution plan for reading in-memory batches of data + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use crate::source::{DataSource, DataSourceExec}; +use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion_physical_plan::memory::MemoryStream; +use datafusion_physical_plan::projection::{ + all_alias_free_columns, new_projections_for_columns, ProjectionExec, +}; +use datafusion_physical_plan::{ + common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, + PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics, +}; + +use arrow::array::{RecordBatch, RecordBatchOptions}; +use arrow::datatypes::{Schema, SchemaRef}; +use datafusion_common::{ + internal_err, plan_err, project_schema, Constraints, Result, ScalarValue, +}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::equivalence::ProjectionMapping; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::utils::collect_columns; +use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; + +/// Execution plan for reading in-memory batches of data +#[derive(Clone)] +#[deprecated( + since = "46.0.0", + note = "use MemorySourceConfig and DataSourceExec instead" +)] +pub struct MemoryExec { + inner: DataSourceExec, + /// The partitions to query + partitions: Vec>, + /// Optional projection + projection: Option>, + // Sort information: one or more equivalent orderings + sort_information: Vec, + /// if partition sizes should be displayed + show_sizes: bool, +} + +#[allow(unused, deprecated)] +impl fmt::Debug for MemoryExec { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt_as(DisplayFormatType::Default, f) + } +} + +#[allow(unused, deprecated)] +impl DisplayAs for MemoryExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt_as(t, f) + } +} + +#[allow(unused, deprecated)] +impl ExecutionPlan for MemoryExec { + fn name(&self) -> &'static str { + "MemoryExec" + } + + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn properties(&self) -> &PlanProperties { + self.inner.properties() + } + + fn children(&self) -> Vec<&Arc> { + // This is a leaf node and has no children + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + // MemoryExec has no children + if children.is_empty() { + Ok(self) + } else { + internal_err!("Children cannot be replaced in {self:?}") + } + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + self.inner.execute(partition, context) + } + + /// We recompute the statistics dynamically from the arrow metadata as it is pretty cheap to do so + fn statistics(&self) -> Result { + self.inner.statistics() + } + + fn try_swapping_with_projection( + &self, + projection: &ProjectionExec, + ) -> Result>> { + self.inner.try_swapping_with_projection(projection) + } +} + +#[allow(unused, deprecated)] +impl MemoryExec { + /// Create a new execution plan for reading in-memory record batches 
+ /// The provided `schema` should not have the projection applied. + pub fn try_new( + partitions: &[Vec], + schema: SchemaRef, + projection: Option>, + ) -> Result { + let source = MemorySourceConfig::try_new(partitions, schema, projection.clone())?; + let data_source = DataSourceExec::new(Arc::new(source)); + Ok(Self { + inner: data_source, + partitions: partitions.to_vec(), + projection, + sort_information: vec![], + show_sizes: true, + }) + } + + /// Create a new execution plan from a list of constant values (`ValuesExec`) + pub fn try_new_as_values( + schema: SchemaRef, + data: Vec>>, + ) -> Result { + if data.is_empty() { + return plan_err!("Values list cannot be empty"); + } + + let n_row = data.len(); + let n_col = schema.fields().len(); + + // We have this single row batch as a placeholder to satisfy evaluation argument + // and generate a single output row + let placeholder_schema = Arc::new(Schema::empty()); + let placeholder_batch = RecordBatch::try_new_with_options( + Arc::clone(&placeholder_schema), + vec![], + &RecordBatchOptions::new().with_row_count(Some(1)), + )?; + + // Evaluate each column + let arrays = (0..n_col) + .map(|j| { + (0..n_row) + .map(|i| { + let expr = &data[i][j]; + let result = expr.evaluate(&placeholder_batch)?; + + match result { + ColumnarValue::Scalar(scalar) => Ok(scalar), + ColumnarValue::Array(array) if array.len() == 1 => { + ScalarValue::try_from_array(&array, 0) + } + ColumnarValue::Array(_) => { + plan_err!("Cannot have array values in a values list") + } + } + }) + .collect::>>() + .and_then(ScalarValue::iter_to_array) + }) + .collect::>>()?; + + let batch = RecordBatch::try_new_with_options( + Arc::clone(&schema), + arrays, + &RecordBatchOptions::new().with_row_count(Some(n_row)), + )?; + + let partitions = vec![batch]; + Self::try_new_from_batches(Arc::clone(&schema), partitions) + } + + /// Create a new plan using the provided schema and batches. + /// + /// Errors if any of the batches don't match the provided schema, or if no + /// batches are provided. + pub fn try_new_from_batches( + schema: SchemaRef, + batches: Vec, + ) -> Result { + if batches.is_empty() { + return plan_err!("Values list cannot be empty"); + } + + for batch in &batches { + let batch_schema = batch.schema(); + if batch_schema != schema { + return plan_err!( + "Batch has invalid schema. 
Expected: {}, got: {}", + schema, + batch_schema + ); + } + } + + let partitions = vec![batches]; + let source = MemorySourceConfig { + partitions: partitions.clone(), + schema: Arc::clone(&schema), + projected_schema: Arc::clone(&schema), + projection: None, + sort_information: vec![], + show_sizes: true, + fetch: None, + }; + let data_source = DataSourceExec::new(Arc::new(source)); + Ok(Self { + inner: data_source, + partitions, + projection: None, + sort_information: vec![], + show_sizes: true, + }) + } + + fn memory_source_config(&self) -> MemorySourceConfig { + self.inner + .source() + .as_any() + .downcast_ref::() + .unwrap() + .clone() + } + + pub fn with_constraints(mut self, constraints: Constraints) -> Self { + self.inner = self.inner.with_constraints(constraints); + self + } + + /// Set `show_sizes` to determine whether to display partition sizes + pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { + let mut memory_source = self.memory_source_config(); + memory_source.show_sizes = show_sizes; + self.show_sizes = show_sizes; + self.inner = DataSourceExec::new(Arc::new(memory_source)); + self + } + + /// Ref to constraints + pub fn constraints(&self) -> &Constraints { + self.properties().equivalence_properties().constraints() + } + + /// Ref to partitions + pub fn partitions(&self) -> &[Vec] { + &self.partitions + } + + /// Ref to projection + pub fn projection(&self) -> &Option> { + &self.projection + } + + /// Show sizes + pub fn show_sizes(&self) -> bool { + self.show_sizes + } + + /// Ref to sort information + pub fn sort_information(&self) -> &[LexOrdering] { + &self.sort_information + } + + /// A memory table can be ordered by multiple expressions simultaneously. + /// [`EquivalenceProperties`] keeps track of expressions that describe the + /// global ordering of the schema. These columns are not necessarily same; e.g. + /// ```text + /// ┌-------┐ + /// | a | b | + /// |---|---| + /// | 1 | 9 | + /// | 2 | 8 | + /// | 3 | 7 | + /// | 5 | 5 | + /// └---┴---┘ + /// ``` + /// where both `a ASC` and `b DESC` can describe the table ordering. With + /// [`EquivalenceProperties`], we can keep track of these equivalences + /// and treat `a ASC` and `b DESC` as the same ordering requirement. + /// + /// Note that if there is an internal projection, that projection will be + /// also applied to the given `sort_information`. + pub fn try_with_sort_information( + mut self, + sort_information: Vec, + ) -> Result { + self.sort_information = sort_information.clone(); + let mut memory_source = self.memory_source_config(); + memory_source = memory_source.try_with_sort_information(sort_information)?; + self.inner = DataSourceExec::new(Arc::new(memory_source)); + Ok(self) + } + + /// Arc clone of ref to original schema + pub fn original_schema(&self) -> SchemaRef { + Arc::clone(&self.inner.schema()) + } + + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
+ fn compute_properties( + schema: SchemaRef, + orderings: &[LexOrdering], + constraints: Constraints, + partitions: &[Vec], + ) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new_with_orderings(schema, orderings) + .with_constraints(constraints), + Partitioning::UnknownPartitioning(partitions.len()), + EmissionType::Incremental, + Boundedness::Bounded, + ) + } +} + +/// Data source configuration for reading in-memory batches of data +#[derive(Clone)] +pub struct MemorySourceConfig { + /// The partitions to query + partitions: Vec>, + /// Schema representing the data before projection + schema: SchemaRef, + /// Schema representing the data after the optional projection is applied + projected_schema: SchemaRef, + /// Optional projection + projection: Option>, + /// Sort information: one or more equivalent orderings + sort_information: Vec, + /// if partition sizes should be displayed + show_sizes: bool, + /// The maximum number of records to read from this plan. If `None`, + /// all records after filtering are returned. + fetch: Option, +} + +impl DataSource for MemorySourceConfig { + fn open( + &self, + partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin( + MemoryStream::try_new( + self.partitions[partition].clone(), + Arc::clone(&self.projected_schema), + self.projection.clone(), + )? + .with_fetch(self.fetch), + )) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let partition_sizes: Vec<_> = + self.partitions.iter().map(|b| b.len()).collect(); + + let output_ordering = self + .sort_information + .first() + .map(|output_ordering| { + format!(", output_ordering={}", output_ordering) + }) + .unwrap_or_default(); + + let eq_properties = self.eq_properties(); + let constraints = eq_properties.constraints(); + let constraints = if constraints.is_empty() { + String::new() + } else { + format!(", {}", constraints) + }; + + let limit = self + .fetch + .map_or(String::new(), |limit| format!(", fetch={}", limit)); + if self.show_sizes { + write!( + f, + "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}", + partition_sizes.len(), + ) + } else { + write!( + f, + "partitions={}{limit}{output_ordering}{constraints}", + partition_sizes.len(), + ) + } + } + } + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.partitions.len()) + } + + fn eq_properties(&self) -> EquivalenceProperties { + EquivalenceProperties::new_with_orderings( + Arc::clone(&self.projected_schema), + self.sort_information.as_slice(), + ) + } + + fn statistics(&self) -> Result { + Ok(common::compute_record_batch_statistics( + &self.partitions, + &self.schema, + self.projection.clone(), + )) + } + + fn with_fetch(&self, limit: Option) -> Option> { + let source = self.clone(); + Some(Arc::new(source.with_limit(limit))) + } + + fn fetch(&self) -> Option { + self.fetch + } + + fn try_swapping_with_projection( + &self, + projection: &ProjectionExec, + ) -> Result>> { + // If there is any non-column or alias-carrier expression, Projection should not be removed. + // This process can be moved into MemoryExec, but it would be an overlap of their responsibility. 
+ all_alias_free_columns(projection.expr()) + .then(|| { + let all_projections = (0..self.schema.fields().len()).collect(); + let new_projections = new_projections_for_columns( + projection, + self.projection().as_ref().unwrap_or(&all_projections), + ); + + MemorySourceConfig::try_new_exec( + self.partitions(), + self.original_schema(), + Some(new_projections), + ) + .map(|e| e as _) + }) + .transpose() + } +} + +impl MemorySourceConfig { + /// Create a new `MemorySourceConfig` for reading in-memory record batches + /// The provided `schema` should not have the projection applied. + pub fn try_new( + partitions: &[Vec], + schema: SchemaRef, + projection: Option>, + ) -> Result { + let projected_schema = project_schema(&schema, projection.as_ref())?; + Ok(Self { + partitions: partitions.to_vec(), + schema, + projected_schema, + projection, + sort_information: vec![], + show_sizes: true, + fetch: None, + }) + } + + /// Create a new `DataSourceExec` plan for reading in-memory record batches + /// The provided `schema` should not have the projection applied. + pub fn try_new_exec( + partitions: &[Vec], + schema: SchemaRef, + projection: Option>, + ) -> Result> { + let source = Self::try_new(partitions, schema, projection)?; + Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) + } + + /// Create a new execution plan from a list of constant values (`ValuesExec`) + pub fn try_new_as_values( + schema: SchemaRef, + data: Vec>>, + ) -> Result> { + if data.is_empty() { + return plan_err!("Values list cannot be empty"); + } + + let n_row = data.len(); + let n_col = schema.fields().len(); + + // We have this single row batch as a placeholder to satisfy evaluation argument + // and generate a single output row + let placeholder_schema = Arc::new(Schema::empty()); + let placeholder_batch = RecordBatch::try_new_with_options( + Arc::clone(&placeholder_schema), + vec![], + &RecordBatchOptions::new().with_row_count(Some(1)), + )?; + + // Evaluate each column + let arrays = (0..n_col) + .map(|j| { + (0..n_row) + .map(|i| { + let expr = &data[i][j]; + let result = expr.evaluate(&placeholder_batch)?; + + match result { + ColumnarValue::Scalar(scalar) => Ok(scalar), + ColumnarValue::Array(array) if array.len() == 1 => { + ScalarValue::try_from_array(&array, 0) + } + ColumnarValue::Array(_) => { + plan_err!("Cannot have array values in a values list") + } + } + }) + .collect::>>() + .and_then(ScalarValue::iter_to_array) + }) + .collect::>>()?; + + let batch = RecordBatch::try_new_with_options( + Arc::clone(&schema), + arrays, + &RecordBatchOptions::new().with_row_count(Some(n_row)), + )?; + + let partitions = vec![batch]; + Self::try_new_from_batches(Arc::clone(&schema), partitions) + } + + /// Create a new plan using the provided schema and batches. + /// + /// Errors if any of the batches don't match the provided schema, or if no + /// batches are provided. + pub fn try_new_from_batches( + schema: SchemaRef, + batches: Vec, + ) -> Result> { + if batches.is_empty() { + return plan_err!("Values list cannot be empty"); + } + + for batch in &batches { + let batch_schema = batch.schema(); + if batch_schema != schema { + return plan_err!( + "Batch has invalid schema. 
Expected: {}, got: {}", + schema, + batch_schema + ); + } + } + + let partitions = vec![batches]; + let source = Self { + partitions, + schema: Arc::clone(&schema), + projected_schema: Arc::clone(&schema), + projection: None, + sort_information: vec![], + show_sizes: true, + fetch: None, + }; + Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) + } + + /// Set the limit of the files + pub fn with_limit(mut self, limit: Option) -> Self { + self.fetch = limit; + self + } + + /// Set `show_sizes` to determine whether to display partition sizes + pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { + self.show_sizes = show_sizes; + self + } + + /// Ref to partitions + pub fn partitions(&self) -> &[Vec] { + &self.partitions + } + + /// Ref to projection + pub fn projection(&self) -> &Option> { + &self.projection + } + + /// Show sizes + pub fn show_sizes(&self) -> bool { + self.show_sizes + } + + /// Ref to sort information + pub fn sort_information(&self) -> &[LexOrdering] { + &self.sort_information + } + + /// A memory table can be ordered by multiple expressions simultaneously. + /// [`EquivalenceProperties`] keeps track of expressions that describe the + /// global ordering of the schema. These columns are not necessarily same; e.g. + /// ```text + /// ┌-------┐ + /// | a | b | + /// |---|---| + /// | 1 | 9 | + /// | 2 | 8 | + /// | 3 | 7 | + /// | 5 | 5 | + /// └---┴---┘ + /// ``` + /// where both `a ASC` and `b DESC` can describe the table ordering. With + /// [`EquivalenceProperties`], we can keep track of these equivalences + /// and treat `a ASC` and `b DESC` as the same ordering requirement. + /// + /// Note that if there is an internal projection, that projection will be + /// also applied to the given `sort_information`. + pub fn try_with_sort_information( + mut self, + mut sort_information: Vec, + ) -> Result { + // All sort expressions must refer to the original schema + let fields = self.schema.fields(); + let ambiguous_column = sort_information + .iter() + .flat_map(|ordering| ordering.clone()) + .flat_map(|expr| collect_columns(&expr.expr)) + .find(|col| { + fields + .get(col.index()) + .map(|field| field.name() != col.name()) + .unwrap_or(true) + }); + if let Some(col) = ambiguous_column { + return internal_err!( + "Column {:?} is not found in the original schema of the MemorySourceConfig", + col + ); + } + + // If there is a projection on the source, we also need to project orderings + if let Some(projection) = &self.projection { + let base_eqp = EquivalenceProperties::new_with_orderings( + self.original_schema(), + &sort_information, + ); + let proj_exprs = projection + .iter() + .map(|idx| { + let base_schema = self.original_schema(); + let name = base_schema.field(*idx).name(); + (Arc::new(Column::new(name, *idx)) as _, name.to_string()) + }) + .collect::>(); + let projection_mapping = + ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?; + sort_information = base_eqp + .project(&projection_mapping, Arc::clone(&self.projected_schema)) + .into_oeq_class() + .into_inner(); + } + + self.sort_information = sort_information; + Ok(self) + } + + /// Arc clone of ref to original schema + pub fn original_schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +#[cfg(test)] +mod memory_source_tests { + use std::sync::Arc; + + use crate::memory::MemorySourceConfig; + use crate::source::DataSourceExec; + use datafusion_physical_plan::ExecutionPlan; + + use arrow::compute::SortOptions; + use arrow::datatypes::{DataType, Field, Schema}; + use 
datafusion_physical_expr::expressions::col; + use datafusion_physical_expr::PhysicalSortExpr; + use datafusion_physical_expr_common::sort_expr::LexOrdering; + + #[test] + fn test_memory_order_eq() -> datafusion_common::Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + Field::new("c", DataType::Int64, false), + ])); + let sort1 = LexOrdering::new(vec![ + PhysicalSortExpr { + expr: col("a", &schema)?, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: col("b", &schema)?, + options: SortOptions::default(), + }, + ]); + let sort2 = LexOrdering::new(vec![PhysicalSortExpr { + expr: col("c", &schema)?, + options: SortOptions::default(), + }]); + let mut expected_output_order = LexOrdering::default(); + expected_output_order.extend(sort1.clone()); + expected_output_order.extend(sort2.clone()); + + let sort_information = vec![sort1.clone(), sort2.clone()]; + let mem_exec = Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![]], schema, None)? + .try_with_sort_information(sort_information)?, + ))); + + assert_eq!( + mem_exec.properties().output_ordering().unwrap(), + &expected_output_order + ); + let eq_properties = mem_exec.properties().equivalence_properties(); + assert!(eq_properties.oeq_class().contains(&sort1)); + assert!(eq_properties.oeq_class().contains(&sort2)); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ArrayRef, Int32Array}; + use datafusion_physical_plan::expressions::lit; + use std::collections::HashMap; + + use arrow::datatypes::{DataType, Field}; + use datafusion_common::assert_batches_eq; + use datafusion_common::stats::{ColumnStatistics, Precision}; + use futures::StreamExt; + + // Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i" + pub fn make_partition(sz: i32) -> RecordBatch { + let seq_start = 0; + let seq_end = sz; + let values = (seq_start..seq_end).collect::>(); + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let arr = Arc::new(Int32Array::from(values)); + let arr = arr as ArrayRef; + + RecordBatch::try_new(schema, vec![arr]).unwrap() + } + + #[tokio::test] + async fn exec_with_limit() -> Result<()> { + let task_ctx = Arc::new(TaskContext::default()); + let batch = make_partition(7); + let schema = batch.schema(); + let batches = vec![batch.clone(), batch]; + + let exec = MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); + assert_eq!(exec.fetch(), None); + + let exec = exec.with_fetch(Some(4)).unwrap(); + assert_eq!(exec.fetch(), Some(4)); + + let mut it = exec.execute(0, task_ctx)?; + let mut results = vec![]; + while let Some(batch) = it.next().await { + results.push(batch?); + } + + let expected = [ + "+---+", "| i |", "+---+", "| 0 |", "| 1 |", "| 2 |", "| 3 |", "+---+", + ]; + assert_batches_eq!(expected, &results); + Ok(()) + } + + /// Get the schema for the aggregate_test_* csv files + pub fn aggr_test_schema() -> SchemaRef { + let mut f1 = Field::new("c1", DataType::Utf8, false); + f1.set_metadata(HashMap::from_iter(vec![("testing".into(), "test".into())])); + let schema = Schema::new(vec![ + f1, + Field::new("c2", DataType::UInt32, false), + Field::new("c3", DataType::Int8, false), + Field::new("c4", DataType::Int16, false), + Field::new("c5", DataType::Int32, false), + Field::new("c6", DataType::Int64, false), + Field::new("c7", DataType::UInt8, false), + Field::new("c8", DataType::UInt16, false), + 
Field::new("c9", DataType::UInt32, false), + Field::new("c10", DataType::UInt64, false), + Field::new("c11", DataType::Float32, false), + Field::new("c12", DataType::Float64, false), + Field::new("c13", DataType::Utf8, false), + ]); + + Arc::new(schema) + } + + #[tokio::test] + async fn values_empty_case() -> Result<()> { + let schema = aggr_test_schema(); + let empty = MemorySourceConfig::try_new_as_values(schema, vec![]); + assert!(empty.is_err()); + Ok(()) + } + + #[test] + fn new_exec_with_batches() { + let batch = make_partition(7); + let schema = batch.schema(); + let batches = vec![batch.clone(), batch]; + let _exec = MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); + } + + #[test] + fn new_exec_with_batches_empty() { + let batch = make_partition(7); + let schema = batch.schema(); + let _ = MemorySourceConfig::try_new_from_batches(schema, Vec::new()).unwrap_err(); + } + + #[test] + fn new_exec_with_batches_invalid_schema() { + let batch = make_partition(7); + let batches = vec![batch.clone(), batch]; + + let invalid_schema = Arc::new(Schema::new(vec![ + Field::new("col0", DataType::UInt32, false), + Field::new("col1", DataType::Utf8, false), + ])); + let _ = MemorySourceConfig::try_new_from_batches(invalid_schema, batches) + .unwrap_err(); + } + + // Test issue: https://github.com/apache/datafusion/issues/8763 + #[test] + fn new_exec_with_non_nullable_schema() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::UInt32, + false, + )])); + let _ = MemorySourceConfig::try_new_as_values( + Arc::clone(&schema), + vec![vec![lit(1u32)]], + ) + .unwrap(); + // Test that a null value is rejected + let _ = MemorySourceConfig::try_new_as_values( + schema, + vec![vec![lit(ScalarValue::UInt32(None))]], + ) + .unwrap_err(); + } + + #[test] + fn values_stats_with_nulls_only() -> Result<()> { + let data = vec![ + vec![lit(ScalarValue::Null)], + vec![lit(ScalarValue::Null)], + vec![lit(ScalarValue::Null)], + ]; + let rows = data.len(); + let values = MemorySourceConfig::try_new_as_values( + Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])), + data, + )?; + + assert_eq!( + values.statistics()?, + Statistics { + num_rows: Precision::Exact(rows), + total_byte_size: Precision::Exact(8), // not important + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(rows), // there are only nulls + distinct_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + },], + } + ); + + Ok(()) + } +} diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index 2fc2da64891d..8183d7b53244 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -30,6 +30,8 @@ pub mod file_meta; pub mod file_scan_config; pub mod file_sink_config; pub mod file_stream; +pub mod memory; +pub mod source; pub mod url; pub mod write; use chrono::TimeZone; diff --git a/datafusion/physical-plan/src/source.rs b/datafusion/datasource/src/source.rs similarity index 95% rename from datafusion/physical-plan/src/source.rs rename to datafusion/datasource/src/source.rs index 0c1dfddd2678..1b0d76902972 100644 --- a/datafusion/physical-plan/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -20,10 +20,12 @@ use std::fmt; use std::fmt::{Debug, Formatter}; use std::sync::Arc; -use crate::execution_plan::{Boundedness, EmissionType}; -use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use crate::projection::ProjectionExec; -use 
crate::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use datafusion_physical_plan::projection::ProjectionExec; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, +}; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index e8473e6556d1..9b7e6a0f5db0 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -40,6 +40,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } datafusion-common = { workspace = true, default-features = true } +datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index ae0ea60e1a3e..4a10398e5a9e 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -77,10 +77,6 @@ tokio = { workspace = true, features = [ "parking_lot", ] } -[[bench]] -harness = false -name = "spm" - [[bench]] harness = false name = "partial_ordering" diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 85b41da85742..0947a2ff5539 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1347,10 +1347,10 @@ mod tests { use crate::common::collect; use crate::execution_plan::Boundedness; use crate::expressions::col; - use crate::memory::MemorySourceConfig; use crate::metrics::MetricValue; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use crate::RecordBatchStream; use arrow::array::{ @@ -2207,7 +2207,7 @@ mod tests { vec![test_last_value_agg_expr(&schema, sort_options)?] 
}; - let memory_exec = MemorySourceConfig::try_new_exec( + let memory_exec = TestMemoryExec::try_new_exec( &[ vec![partition1], vec![partition2], @@ -2442,11 +2442,8 @@ mod tests { }) .collect(); - let input = MemorySourceConfig::try_new_exec( - &[input_batches], - Arc::clone(&schema), - None, - )?; + let input = + TestMemoryExec::try_new_exec(&[input_batches], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Single, @@ -2557,7 +2554,7 @@ mod tests { .build() .map(Arc::new)?]; - let input = MemorySourceConfig::try_new_exec( + let input = TestMemoryExec::try_new_exec( &[vec![batch.clone()]], Arc::::clone(&batch.schema()), None, @@ -2627,7 +2624,7 @@ mod tests { ]; let input = - MemorySourceConfig::try_new_exec(&[input_data], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[input_data], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Partial, group_by, @@ -2714,7 +2711,7 @@ mod tests { ]; let input = - MemorySourceConfig::try_new_exec(&[input_data], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[input_data], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Partial, group_by, @@ -2829,7 +2826,7 @@ mod tests { create_record_batch(&schema, (vec![2, 3, 4, 4], vec![1.0, 2.0, 3.0, 4.0]))?, ]; let plan: Arc = - MemorySourceConfig::try_new_exec(&[batches], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; let grouping_set = PhysicalGroupBy::new( vec![(col("a", &schema)?, "a".to_string())], diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 5168c3cc101f..c4e738cb3ad1 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -20,10 +20,10 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::{ execution_plan::{Boundedness, EmissionType}, - memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning, }; diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 5866f0938e41..a66873bc6576 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -624,7 +624,6 @@ mod tests { use crate::expressions::*; use crate::test; use crate::test::exec::StatisticsExec; - use arrow::datatypes::{Field, Schema, UnionFields, UnionMode}; use datafusion_common::ScalarValue; diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 2983478ada74..23ffd2a28d3c 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -1638,7 +1638,7 @@ impl EmbeddedProjection for HashJoinExec { #[cfg(test)] mod tests { use super::*; - use crate::memory::MemorySourceConfig; + use crate::test::TestMemoryExec; use crate::{ common, expressions::Column, repartition::RepartitionExec, test::build_table_i32, test::exec::MockExec, @@ -1680,7 +1680,7 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn join( @@ -2083,7 +2083,7 @@ mod tests { 
build_table_i32(("a1", &vec![2]), ("b2", &vec![2]), ("c1", &vec![9])); let schema = batch1.schema(); let left = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let right = build_table( @@ -2155,7 +2155,7 @@ mod tests { let schema = batch1.schema(); let left = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let right = build_table( ("a2", &vec![20, 30, 10]), @@ -2209,7 +2209,7 @@ mod tests { build_table_i32(("a2", &vec![30]), ("b1", &vec![5]), ("c2", &vec![90])); let schema = batch1.schema(); let right = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let on = vec![( @@ -2288,8 +2288,7 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch.clone(), batch]], schema, None) - .unwrap() + TestMemoryExec::try_new_exec(&[vec![batch.clone(), batch]], schema, None).unwrap() } #[apply(batch_sizes)] @@ -2394,8 +2393,7 @@ mod tests { Arc::new(Column::new_with_schema("b1", &right.schema()).unwrap()) as _, )]; let schema = right.schema(); - let right = - MemorySourceConfig::try_new_exec(&[vec![right]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap(); let join = join(left, right, on, &JoinType::Left, false).unwrap(); let columns = columns(&join.schema()); @@ -2432,8 +2430,7 @@ mod tests { Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _, )]; let schema = right.schema(); - let right = - MemorySourceConfig::try_new_exec(&[vec![right]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap(); let join = join(left, right, on, &JoinType::Full, false).unwrap(); let columns = columns(&join.schema()); @@ -3738,13 +3735,12 @@ mod tests { let n: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); let batch = RecordBatch::try_new(Arc::clone(&schema), vec![dates, n])?; let left = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) .unwrap(); let dates: ArrayRef = Arc::new(Date32Array::from(vec![19108, 19108, 19109])); let n: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); let batch = RecordBatch::try_new(Arc::clone(&schema), vec![dates, n])?; - let right = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let on = vec![( Arc::new(Column::new_with_schema("date", &left.schema()).unwrap()) as _, Arc::new(Column::new_with_schema("date", &right.schema()).unwrap()) as _, @@ -4034,7 +4030,7 @@ mod tests { ("b1", &vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]), ("c1", &vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]), ); - let left = MemorySourceConfig::try_new_exec( + let left = TestMemoryExec::try_new_exec( &[vec![left_batch.clone()], vec![left_batch.clone()]], left_batch.schema(), None, @@ -4045,7 +4041,7 @@ mod tests { ("b2", &vec![12, 13]), ("c2", &vec![14, 15]), ); - let right = MemorySourceConfig::try_new_exec( + let right = TestMemoryExec::try_new_exec( &[vec![right_batch.clone()], vec![right_batch.clone()]], right_batch.schema(), None, @@ -4130,7 +4126,7 @@ mod 
tests { ) .unwrap(); let schema_ref = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema_ref, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema_ref, None).unwrap() } #[tokio::test] diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 6de6b3b4dff4..64dfc8219b64 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -1030,8 +1030,7 @@ impl EmbeddedProjection for NestedLoopJoinExec { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; + use crate::test::TestMemoryExec; use crate::{ common, expressions::Column, repartition::RepartitionExec, test::build_table_i32, }; @@ -1072,7 +1071,7 @@ pub(crate) mod tests { }; let mut source = - MemorySourceConfig::try_new(&[batches], Arc::clone(&schema), None).unwrap(); + TestMemoryExec::try_new(&[batches], Arc::clone(&schema), None).unwrap(); if !sorted_column_names.is_empty() { let mut sort_info = LexOrdering::default(); for name in sorted_column_names { @@ -1089,7 +1088,7 @@ pub(crate) mod tests { source = source.try_with_sort_information(vec![sort_info]).unwrap(); } - Arc::new(DataSourceExec::new(Arc::new(source))) + Arc::new(TestMemoryExec::update_cache(Arc::new(source))) } fn build_left_table() -> Arc { diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index a3e835c64131..6c933ca21807 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -2547,7 +2547,7 @@ mod tests { use crate::joins::sort_merge_join::{get_corrected_filter_mask, JoinedRecordBatches}; use crate::joins::utils::{ColumnIndex, JoinFilter, JoinOn}; use crate::joins::SortMergeJoinExec; - use crate::memory::MemorySourceConfig; + use crate::test::TestMemoryExec; use crate::test::{build_table_i32, build_table_i32_two_cols}; use crate::{common, ExecutionPlan}; @@ -2558,12 +2558,12 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn build_table_from_batches(batches: Vec) -> Arc { let schema = batches.first().unwrap().schema(); - MemorySourceConfig::try_new_exec(&[batches], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap() } fn build_date_table( @@ -2588,7 +2588,7 @@ mod tests { .unwrap(); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn build_date64_table( @@ -2613,7 +2613,7 @@ mod tests { .unwrap(); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } /// returns a table with 3 columns of i32 in memory @@ -2636,7 +2636,7 @@ mod tests { ], ) .unwrap(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } pub fn build_table_two_cols( @@ -2645,7 +2645,7 @@ mod tests { ) -> Arc { let batch = build_table_i32_two_cols(a, b); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, 
None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn join( diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs index 9932c647be0a..e70007aa651f 100644 --- a/datafusion/physical-plan/src/joins/test_utils.rs +++ b/datafusion/physical-plan/src/joins/test_utils.rs @@ -23,9 +23,8 @@ use crate::joins::utils::{JoinFilter, JoinOn}; use crate::joins::{ HashJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use crate::memory::MemorySourceConfig; use crate::repartition::RepartitionExec; -use crate::source::DataSourceExec; +use crate::test::TestMemoryExec; use crate::{common, ExecutionPlan, ExecutionPlanProperties, Partitioning}; use arrow::array::{ @@ -530,14 +529,14 @@ pub fn create_memory_table( right_sorted: Vec, ) -> Result<(Arc, Arc)> { let left_schema = left_partition[0].schema(); - let left = MemorySourceConfig::try_new(&[left_partition], left_schema, None)? + let left = TestMemoryExec::try_new(&[left_partition], left_schema, None)? .try_with_sort_information(left_sorted)?; let right_schema = right_partition[0].schema(); - let right = MemorySourceConfig::try_new(&[right_partition], right_schema, None)? + let right = TestMemoryExec::try_new(&[right_partition], right_schema, None)? .try_with_sort_information(right_sorted)?; Ok(( - Arc::new(DataSourceExec::new(Arc::new(left))), - Arc::new(DataSourceExec::new(Arc::new(right))), + Arc::new(TestMemoryExec::update_cache(Arc::new(left))), + Arc::new(TestMemoryExec::update_cache(Arc::new(right))), )) } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 06fe23d2ff90..6ddaef1a2d28 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -83,13 +83,11 @@ pub mod unnest; pub mod values; pub mod windows; pub mod work_table; - pub mod udaf { pub use datafusion_expr::StatisticsArgs; pub use datafusion_physical_expr::aggregate::AggregateFunctionExpr; } pub mod coalesce; -pub mod source; #[cfg(test)] pub mod test; diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 0077804bdfc9..fd338cc91353 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -22,696 +22,22 @@ use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; -use super::{ - common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, - Statistics, -}; use crate::execution_plan::{Boundedness, EmissionType}; -use crate::projection::{ - all_alias_free_columns, new_projections_for_columns, ProjectionExec, +use crate::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use crate::source::{DataSource, DataSourceExec}; -use arrow::array::{RecordBatch, RecordBatchOptions}; -use arrow::datatypes::{Schema, SchemaRef}; -use datafusion_common::{ - internal_err, plan_err, project_schema, Constraints, Result, ScalarValue, -}; +use arrow::array::RecordBatch; +use arrow::datatypes::SchemaRef; +use datafusion_common::{internal_err, Result}; use datafusion_execution::memory_pool::MemoryReservation; use datafusion_execution::TaskContext; -use datafusion_physical_expr::equivalence::ProjectionMapping; -use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::utils::collect_columns; -use 
datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; +use datafusion_physical_expr::EquivalenceProperties; use futures::Stream; use parking_lot::RwLock; -/// Execution plan for reading in-memory batches of data -#[derive(Clone)] -#[deprecated( - since = "46.0.0", - note = "use MemorySourceConfig and DataSourceExec instead" -)] -pub struct MemoryExec { - inner: DataSourceExec, - /// The partitions to query - partitions: Vec>, - /// Optional projection - projection: Option>, - // Sort information: one or more equivalent orderings - sort_information: Vec, - /// if partition sizes should be displayed - show_sizes: bool, -} - -#[allow(unused, deprecated)] -impl fmt::Debug for MemoryExec { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt_as(DisplayFormatType::Default, f) - } -} - -#[allow(unused, deprecated)] -impl DisplayAs for MemoryExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt_as(t, f) - } -} - -#[allow(unused, deprecated)] -impl ExecutionPlan for MemoryExec { - fn name(&self) -> &'static str { - "MemoryExec" - } - - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &PlanProperties { - self.inner.properties() - } - - fn children(&self) -> Vec<&Arc> { - // This is a leaf node and has no children - vec![] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - // MemoryExec has no children - if children.is_empty() { - Ok(self) - } else { - internal_err!("Children cannot be replaced in {self:?}") - } - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> Result { - self.inner.execute(partition, context) - } - - /// We recompute the statistics dynamically from the arrow metadata as it is pretty cheap to do so - fn statistics(&self) -> Result { - self.inner.statistics() - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - self.inner.try_swapping_with_projection(projection) - } -} - -#[allow(unused, deprecated)] -impl MemoryExec { - /// Create a new execution plan for reading in-memory record batches - /// The provided `schema` should not have the projection applied. 
- pub fn try_new( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result { - let source = MemorySourceConfig::try_new(partitions, schema, projection.clone())?; - let data_source = DataSourceExec::new(Arc::new(source)); - Ok(Self { - inner: data_source, - partitions: partitions.to_vec(), - projection, - sort_information: vec![], - show_sizes: true, - }) - } - - /// Create a new execution plan from a list of constant values (`ValuesExec`) - pub fn try_new_as_values( - schema: SchemaRef, - data: Vec>>, - ) -> Result { - if data.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - let n_row = data.len(); - let n_col = schema.fields().len(); - - // We have this single row batch as a placeholder to satisfy evaluation argument - // and generate a single output row - let placeholder_schema = Arc::new(Schema::empty()); - let placeholder_batch = RecordBatch::try_new_with_options( - Arc::clone(&placeholder_schema), - vec![], - &RecordBatchOptions::new().with_row_count(Some(1)), - )?; - - // Evaluate each column - let arrays = (0..n_col) - .map(|j| { - (0..n_row) - .map(|i| { - let expr = &data[i][j]; - let result = expr.evaluate(&placeholder_batch)?; - - match result { - ColumnarValue::Scalar(scalar) => Ok(scalar), - ColumnarValue::Array(array) if array.len() == 1 => { - ScalarValue::try_from_array(&array, 0) - } - ColumnarValue::Array(_) => { - plan_err!("Cannot have array values in a values list") - } - } - }) - .collect::>>() - .and_then(ScalarValue::iter_to_array) - }) - .collect::>>()?; - - let batch = RecordBatch::try_new_with_options( - Arc::clone(&schema), - arrays, - &RecordBatchOptions::new().with_row_count(Some(n_row)), - )?; - - let partitions = vec![batch]; - Self::try_new_from_batches(Arc::clone(&schema), partitions) - } - - /// Create a new plan using the provided schema and batches. - /// - /// Errors if any of the batches don't match the provided schema, or if no - /// batches are provided. - pub fn try_new_from_batches( - schema: SchemaRef, - batches: Vec, - ) -> Result { - if batches.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - for batch in &batches { - let batch_schema = batch.schema(); - if batch_schema != schema { - return plan_err!( - "Batch has invalid schema. 
Expected: {}, got: {}", - schema, - batch_schema - ); - } - } - - let partitions = vec![batches]; - let source = MemorySourceConfig { - partitions: partitions.clone(), - schema: Arc::clone(&schema), - projected_schema: Arc::clone(&schema), - projection: None, - sort_information: vec![], - show_sizes: true, - fetch: None, - }; - let data_source = DataSourceExec::new(Arc::new(source)); - Ok(Self { - inner: data_source, - partitions, - projection: None, - sort_information: vec![], - show_sizes: true, - }) - } - - fn memory_source_config(&self) -> MemorySourceConfig { - self.inner - .source() - .as_any() - .downcast_ref::() - .unwrap() - .clone() - } - - pub fn with_constraints(mut self, constraints: Constraints) -> Self { - self.inner = self.inner.with_constraints(constraints); - self - } - - /// Set `show_sizes` to determine whether to display partition sizes - pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { - let mut memory_source = self.memory_source_config(); - memory_source.show_sizes = show_sizes; - self.show_sizes = show_sizes; - self.inner = DataSourceExec::new(Arc::new(memory_source)); - self - } - - /// Ref to constraints - pub fn constraints(&self) -> &Constraints { - self.properties().equivalence_properties().constraints() - } - - /// Ref to partitions - pub fn partitions(&self) -> &[Vec] { - &self.partitions - } - - /// Ref to projection - pub fn projection(&self) -> &Option> { - &self.projection - } - - /// Show sizes - pub fn show_sizes(&self) -> bool { - self.show_sizes - } - - /// Ref to sort information - pub fn sort_information(&self) -> &[LexOrdering] { - &self.sort_information - } - - /// A memory table can be ordered by multiple expressions simultaneously. - /// [`EquivalenceProperties`] keeps track of expressions that describe the - /// global ordering of the schema. These columns are not necessarily same; e.g. - /// ```text - /// ┌-------┐ - /// | a | b | - /// |---|---| - /// | 1 | 9 | - /// | 2 | 8 | - /// | 3 | 7 | - /// | 5 | 5 | - /// └---┴---┘ - /// ``` - /// where both `a ASC` and `b DESC` can describe the table ordering. With - /// [`EquivalenceProperties`], we can keep track of these equivalences - /// and treat `a ASC` and `b DESC` as the same ordering requirement. - /// - /// Note that if there is an internal projection, that projection will be - /// also applied to the given `sort_information`. - pub fn try_with_sort_information( - mut self, - sort_information: Vec, - ) -> Result { - self.sort_information = sort_information.clone(); - let mut memory_source = self.memory_source_config(); - memory_source = memory_source.try_with_sort_information(sort_information)?; - self.inner = DataSourceExec::new(Arc::new(memory_source)); - Ok(self) - } - - /// Arc clone of ref to original schema - pub fn original_schema(&self) -> SchemaRef { - Arc::clone(&self.inner.schema()) - } - - /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn compute_properties( - schema: SchemaRef, - orderings: &[LexOrdering], - constraints: Constraints, - partitions: &[Vec], - ) -> PlanProperties { - PlanProperties::new( - EquivalenceProperties::new_with_orderings(schema, orderings) - .with_constraints(constraints), - Partitioning::UnknownPartitioning(partitions.len()), - EmissionType::Incremental, - Boundedness::Bounded, - ) - } -} - -/// Data source configuration for reading in-memory batches of data -#[derive(Clone)] -pub struct MemorySourceConfig { - /// The partitions to query - partitions: Vec>, - /// Schema representing the data before projection - schema: SchemaRef, - /// Schema representing the data after the optional projection is applied - projected_schema: SchemaRef, - /// Optional projection - projection: Option>, - /// Sort information: one or more equivalent orderings - sort_information: Vec, - /// if partition sizes should be displayed - show_sizes: bool, - /// The maximum number of records to read from this plan. If `None`, - /// all records after filtering are returned. - fetch: Option, -} - -impl DataSource for MemorySourceConfig { - fn open( - &self, - partition: usize, - _context: Arc, - ) -> Result { - Ok(Box::pin( - MemoryStream::try_new( - self.partitions[partition].clone(), - Arc::clone(&self.projected_schema), - self.projection.clone(), - )? - .with_fetch(self.fetch), - )) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { - let partition_sizes: Vec<_> = - self.partitions.iter().map(|b| b.len()).collect(); - - let output_ordering = self - .sort_information - .first() - .map(|output_ordering| { - format!(", output_ordering={}", output_ordering) - }) - .unwrap_or_default(); - - let eq_properties = self.eq_properties(); - let constraints = eq_properties.constraints(); - let constraints = if constraints.is_empty() { - String::new() - } else { - format!(", {}", constraints) - }; - - let limit = self - .fetch - .map_or(String::new(), |limit| format!(", fetch={}", limit)); - if self.show_sizes { - write!( - f, - "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}", - partition_sizes.len(), - ) - } else { - write!( - f, - "partitions={}{limit}{output_ordering}{constraints}", - partition_sizes.len(), - ) - } - } - } - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.partitions.len()) - } - - fn eq_properties(&self) -> EquivalenceProperties { - EquivalenceProperties::new_with_orderings( - Arc::clone(&self.projected_schema), - self.sort_information.as_slice(), - ) - } - - fn statistics(&self) -> Result { - Ok(common::compute_record_batch_statistics( - &self.partitions, - &self.schema, - self.projection.clone(), - )) - } - - fn with_fetch(&self, limit: Option) -> Option> { - let source = self.clone(); - Some(Arc::new(source.with_limit(limit))) - } - - fn fetch(&self) -> Option { - self.fetch - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - // If there is any non-column or alias-carrier expression, Projection should not be removed. - // This process can be moved into MemoryExec, but it would be an overlap of their responsibility. 
- all_alias_free_columns(projection.expr()) - .then(|| { - let all_projections = (0..self.schema.fields().len()).collect(); - let new_projections = new_projections_for_columns( - projection, - self.projection().as_ref().unwrap_or(&all_projections), - ); - - MemorySourceConfig::try_new_exec( - self.partitions(), - self.original_schema(), - Some(new_projections), - ) - .map(|e| e as _) - }) - .transpose() - } -} - -impl MemorySourceConfig { - /// Create a new `MemorySourceConfig` for reading in-memory record batches - /// The provided `schema` should not have the projection applied. - pub fn try_new( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result { - let projected_schema = project_schema(&schema, projection.as_ref())?; - Ok(Self { - partitions: partitions.to_vec(), - schema, - projected_schema, - projection, - sort_information: vec![], - show_sizes: true, - fetch: None, - }) - } - - /// Create a new `DataSourceExec` plan for reading in-memory record batches - /// The provided `schema` should not have the projection applied. - pub fn try_new_exec( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result> { - let source = Self::try_new(partitions, schema, projection)?; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) - } - - /// Create a new execution plan from a list of constant values (`ValuesExec`) - pub fn try_new_as_values( - schema: SchemaRef, - data: Vec>>, - ) -> Result> { - if data.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - let n_row = data.len(); - let n_col = schema.fields().len(); - - // We have this single row batch as a placeholder to satisfy evaluation argument - // and generate a single output row - let placeholder_schema = Arc::new(Schema::empty()); - let placeholder_batch = RecordBatch::try_new_with_options( - Arc::clone(&placeholder_schema), - vec![], - &RecordBatchOptions::new().with_row_count(Some(1)), - )?; - - // Evaluate each column - let arrays = (0..n_col) - .map(|j| { - (0..n_row) - .map(|i| { - let expr = &data[i][j]; - let result = expr.evaluate(&placeholder_batch)?; - - match result { - ColumnarValue::Scalar(scalar) => Ok(scalar), - ColumnarValue::Array(array) if array.len() == 1 => { - ScalarValue::try_from_array(&array, 0) - } - ColumnarValue::Array(_) => { - plan_err!("Cannot have array values in a values list") - } - } - }) - .collect::>>() - .and_then(ScalarValue::iter_to_array) - }) - .collect::>>()?; - - let batch = RecordBatch::try_new_with_options( - Arc::clone(&schema), - arrays, - &RecordBatchOptions::new().with_row_count(Some(n_row)), - )?; - - let partitions = vec![batch]; - Self::try_new_from_batches(Arc::clone(&schema), partitions) - } - - /// Create a new plan using the provided schema and batches. - /// - /// Errors if any of the batches don't match the provided schema, or if no - /// batches are provided. - pub fn try_new_from_batches( - schema: SchemaRef, - batches: Vec, - ) -> Result> { - if batches.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - for batch in &batches { - let batch_schema = batch.schema(); - if batch_schema != schema { - return plan_err!( - "Batch has invalid schema. 
Expected: {}, got: {}", - schema, - batch_schema - ); - } - } - - let partitions = vec![batches]; - let source = Self { - partitions, - schema: Arc::clone(&schema), - projected_schema: Arc::clone(&schema), - projection: None, - sort_information: vec![], - show_sizes: true, - fetch: None, - }; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) - } - - /// Set the limit of the files - pub fn with_limit(mut self, limit: Option) -> Self { - self.fetch = limit; - self - } - - /// Set `show_sizes` to determine whether to display partition sizes - pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { - self.show_sizes = show_sizes; - self - } - - /// Ref to partitions - pub fn partitions(&self) -> &[Vec] { - &self.partitions - } - - /// Ref to projection - pub fn projection(&self) -> &Option> { - &self.projection - } - - /// Show sizes - pub fn show_sizes(&self) -> bool { - self.show_sizes - } - - /// Ref to sort information - pub fn sort_information(&self) -> &[LexOrdering] { - &self.sort_information - } - - /// A memory table can be ordered by multiple expressions simultaneously. - /// [`EquivalenceProperties`] keeps track of expressions that describe the - /// global ordering of the schema. These columns are not necessarily same; e.g. - /// ```text - /// ┌-------┐ - /// | a | b | - /// |---|---| - /// | 1 | 9 | - /// | 2 | 8 | - /// | 3 | 7 | - /// | 5 | 5 | - /// └---┴---┘ - /// ``` - /// where both `a ASC` and `b DESC` can describe the table ordering. With - /// [`EquivalenceProperties`], we can keep track of these equivalences - /// and treat `a ASC` and `b DESC` as the same ordering requirement. - /// - /// Note that if there is an internal projection, that projection will be - /// also applied to the given `sort_information`. - pub fn try_with_sort_information( - mut self, - mut sort_information: Vec, - ) -> Result { - // All sort expressions must refer to the original schema - let fields = self.schema.fields(); - let ambiguous_column = sort_information - .iter() - .flat_map(|ordering| ordering.clone()) - .flat_map(|expr| collect_columns(&expr.expr)) - .find(|col| { - fields - .get(col.index()) - .map(|field| field.name() != col.name()) - .unwrap_or(true) - }); - if let Some(col) = ambiguous_column { - return internal_err!( - "Column {:?} is not found in the original schema of the MemorySourceConfig", - col - ); - } - - // If there is a projection on the source, we also need to project orderings - if let Some(projection) = &self.projection { - let base_eqp = EquivalenceProperties::new_with_orderings( - self.original_schema(), - &sort_information, - ); - let proj_exprs = projection - .iter() - .map(|idx| { - let base_schema = self.original_schema(); - let name = base_schema.field(*idx).name(); - (Arc::new(Column::new(name, *idx)) as _, name.to_string()) - }) - .collect::>(); - let projection_mapping = - ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?; - sort_information = base_eqp - .project(&projection_mapping, Arc::clone(&self.projected_schema)) - .into_oeq_class() - .into_inner(); - } - - self.sort_information = sort_information; - Ok(self) - } - - /// Arc clone of ref to original schema - pub fn original_schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } -} - /// Iterator over batches pub struct MemoryStream { /// Vector of record batches @@ -746,13 +72,13 @@ impl MemoryStream { } /// Set the memory reservation for the data - pub(super) fn with_reservation(mut self, reservation: MemoryReservation) -> Self { + pub fn with_reservation(mut self, 
reservation: MemoryReservation) -> Self { self.reservation = Some(reservation); self } /// Set the number of rows to produce - pub(super) fn with_fetch(mut self, fetch: Option) -> Self { + pub fn with_fetch(mut self, fetch: Option) -> Self { self.fetch = fetch; self } @@ -962,62 +288,6 @@ impl RecordBatchStream for LazyMemoryStream { } } -#[cfg(test)] -mod memory_exec_tests { - use std::sync::Arc; - - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; - use crate::ExecutionPlan; - - use arrow::compute::SortOptions; - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_physical_expr::expressions::col; - use datafusion_physical_expr::PhysicalSortExpr; - use datafusion_physical_expr_common::sort_expr::LexOrdering; - - #[test] - fn test_memory_order_eq() -> datafusion_common::Result<()> { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, false), - Field::new("b", DataType::Int64, false), - Field::new("c", DataType::Int64, false), - ])); - let sort1 = LexOrdering::new(vec![ - PhysicalSortExpr { - expr: col("a", &schema)?, - options: SortOptions::default(), - }, - PhysicalSortExpr { - expr: col("b", &schema)?, - options: SortOptions::default(), - }, - ]); - let sort2 = LexOrdering::new(vec![PhysicalSortExpr { - expr: col("c", &schema)?, - options: SortOptions::default(), - }]); - let mut expected_output_order = LexOrdering::default(); - expected_output_order.extend(sort1.clone()); - expected_output_order.extend(sort2.clone()); - - let sort_information = vec![sort1.clone(), sort2.clone()]; - let mem_exec = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[vec![]], schema, None)? - .try_with_sort_information(sort_information)?, - ))); - - assert_eq!( - mem_exec.properties().output_ordering().unwrap(), - &expected_output_order - ); - let eq_properties = mem_exec.properties().equivalence_properties(); - assert!(eq_properties.oeq_class().contains(&sort1)); - assert!(eq_properties.oeq_class().contains(&sort2)); - Ok(()) - } -} - #[cfg(test)] mod lazy_memory_tests { use super::*; @@ -1137,129 +407,3 @@ mod lazy_memory_tests { Ok(()) } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::expressions::lit; - use crate::test::{self, make_partition}; - - use arrow::datatypes::{DataType, Field}; - use datafusion_common::assert_batches_eq; - use datafusion_common::stats::{ColumnStatistics, Precision}; - use futures::StreamExt; - - #[tokio::test] - async fn exec_with_limit() -> Result<()> { - let task_ctx = Arc::new(TaskContext::default()); - let batch = make_partition(7); - let schema = batch.schema(); - let batches = vec![batch.clone(), batch]; - - let exec = MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); - assert_eq!(exec.fetch(), None); - - let exec = exec.with_fetch(Some(4)).unwrap(); - assert_eq!(exec.fetch(), Some(4)); - - let mut it = exec.execute(0, task_ctx)?; - let mut results = vec![]; - while let Some(batch) = it.next().await { - results.push(batch?); - } - - let expected = [ - "+---+", "| i |", "+---+", "| 0 |", "| 1 |", "| 2 |", "| 3 |", "+---+", - ]; - assert_batches_eq!(expected, &results); - Ok(()) - } - - #[tokio::test] - async fn values_empty_case() -> Result<()> { - let schema = test::aggr_test_schema(); - let empty = MemorySourceConfig::try_new_as_values(schema, vec![]); - assert!(empty.is_err()); - Ok(()) - } - - #[test] - fn new_exec_with_batches() { - let batch = make_partition(7); - let schema = batch.schema(); - let batches = vec![batch.clone(), batch]; - let _exec = 
MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); - } - - #[test] - fn new_exec_with_batches_empty() { - let batch = make_partition(7); - let schema = batch.schema(); - let _ = MemorySourceConfig::try_new_from_batches(schema, Vec::new()).unwrap_err(); - } - - #[test] - fn new_exec_with_batches_invalid_schema() { - let batch = make_partition(7); - let batches = vec![batch.clone(), batch]; - - let invalid_schema = Arc::new(Schema::new(vec![ - Field::new("col0", DataType::UInt32, false), - Field::new("col1", DataType::Utf8, false), - ])); - let _ = MemorySourceConfig::try_new_from_batches(invalid_schema, batches) - .unwrap_err(); - } - - // Test issue: https://github.com/apache/datafusion/issues/8763 - #[test] - fn new_exec_with_non_nullable_schema() { - let schema = Arc::new(Schema::new(vec![Field::new( - "col0", - DataType::UInt32, - false, - )])); - let _ = MemorySourceConfig::try_new_as_values( - Arc::clone(&schema), - vec![vec![lit(1u32)]], - ) - .unwrap(); - // Test that a null value is rejected - let _ = MemorySourceConfig::try_new_as_values( - schema, - vec![vec![lit(ScalarValue::UInt32(None))]], - ) - .unwrap_err(); - } - - #[test] - fn values_stats_with_nulls_only() -> Result<()> { - let data = vec![ - vec![lit(ScalarValue::Null)], - vec![lit(ScalarValue::Null)], - vec![lit(ScalarValue::Null)], - ]; - let rows = data.len(); - let values = MemorySourceConfig::try_new_as_values( - Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])), - data, - )?; - - assert_eq!( - values.statistics()?, - Statistics { - num_rows: Precision::Exact(rows), - total_byte_size: Precision::Exact(8), // not important - column_statistics: vec![ColumnStatistics { - null_count: Precision::Exact(rows), // there are only nulls - distinct_count: Precision::Absent, - max_value: Precision::Absent, - min_value: Precision::Absent, - sum_value: Precision::Absent, - },], - } - ); - - Ok(()) - } -} diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 6a8f247ec0e6..6e31f601e152 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -20,10 +20,10 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::execution_plan::{Boundedness, EmissionType}; -use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning}; - +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; +use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::array::{ArrayRef, NullArray}; use arrow::array::{RecordBatch, RecordBatchOptions}; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; @@ -178,7 +178,8 @@ impl ExecutionPlan for PlaceholderRowExec { #[cfg(test)] mod tests { use super::*; - use crate::{test, with_new_children_if_necessary}; + use crate::test; + use crate::with_new_children_if_necessary; #[test] fn with_new_children() -> Result<()> { diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index bf7d2c7f275c..05b78e4e1da4 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -21,12 +21,12 @@ use std::any::Any; use std::sync::Arc; use std::task::{Context, Poll}; -use super::{ +use super::work_table::{ReservedBatches, WorkTable, WorkTableExec}; +use 
crate::execution_plan::{Boundedness, EmissionType}; +use crate::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, - work_table::{ReservedBatches, WorkTable, WorkTableExec}, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use crate::execution_plan::{Boundedness, EmissionType}; use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; use arrow::datatypes::SchemaRef; @@ -156,10 +156,10 @@ impl ExecutionPlan for RecursiveQueryExec { vec![false, false] } - fn required_input_distribution(&self) -> Vec { + fn required_input_distribution(&self) -> Vec { vec![ - datafusion_physical_expr::Distribution::SinglePartition, - datafusion_physical_expr::Distribution::SinglePartition, + crate::Distribution::SinglePartition, + crate::Distribution::SinglePartition, ] } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 25668fa67d5b..40e68cfcae83 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1051,6 +1051,7 @@ mod tests { use std::collections::HashSet; use super::*; + use crate::test::TestMemoryExec; use crate::{ test::{ assert_is_pending, @@ -1059,7 +1060,7 @@ mod tests { ErrorExec, MockExec, }, }, - {collect, expressions::col, memory::MemorySourceConfig}, + {collect, expressions::col}, }; use arrow::array::{ArrayRef, StringArray, UInt32Array}; @@ -1164,11 +1165,8 @@ mod tests { ) -> Result>> { let task_ctx = Arc::new(TaskContext::default()); // create physical plan - let exec = MemorySourceConfig::try_new_exec( - &input_partitions, - Arc::clone(schema), - None, - )?; + let exec = + TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(schema), None)?; let exec = RepartitionExec::try_new(exec, partitioning)?; // execute and collect results @@ -1559,11 +1557,8 @@ mod tests { let task_ctx = Arc::new(task_ctx); // create physical plan - let exec = MemorySourceConfig::try_new_exec( - &input_partitions, - Arc::clone(&schema), - None, - )?; + let exec = + TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?; let exec = RepartitionExec::try_new(exec, partitioning)?; // pull partitions @@ -1604,8 +1599,7 @@ mod test { use arrow::datatypes::{DataType, Field, Schema}; use super::*; - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; + use crate::test::TestMemoryExec; use crate::union::UnionExec; use datafusion_physical_expr::expressions::col; @@ -1711,15 +1705,15 @@ mod test { } fn memory_exec(schema: &SchemaRef) -> Arc { - MemorySourceConfig::try_new_exec(&[vec![]], Arc::clone(schema), None).unwrap() + TestMemoryExec::try_new_exec(&[vec![]], Arc::clone(schema), None).unwrap() } fn sorted_memory_exec( schema: &SchemaRef, sort_exprs: LexOrdering, ) -> Arc { - Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[vec![]], Arc::clone(schema), None) + Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[vec![]], Arc::clone(schema), None) .unwrap() .try_with_sort_information(vec![sort_exprs]) .unwrap(), diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index eeef73c45fc4..dc03c012d9be 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -466,11 +466,11 @@ mod tests { use crate::collect; use crate::expressions::col; use crate::expressions::PhysicalSortExpr; - use crate::memory::MemorySourceConfig; use 
crate::sorts::sort::SortExec; use crate::test; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use super::*; @@ -696,7 +696,7 @@ mod tests { ); let schema = batch1.schema(); - MemorySourceConfig::try_new_exec( + TestMemoryExec::try_new_exec( &[vec![batch1, batch2, batch3, batch4]], Arc::clone(&schema), None, @@ -881,7 +881,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data])?; let input = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?; let partial_sort_exec = Arc::new(PartialSortExec::new( LexOrdering::new(vec![PhysicalSortExpr { @@ -987,7 +987,7 @@ mod tests { options: option_desc, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?, 2, )); diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 649468260e56..30b5abcf8897 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1142,10 +1142,10 @@ mod tests { use crate::collect; use crate::execution_plan::Boundedness; use crate::expressions::col; - use crate::memory::MemorySourceConfig; use crate::test; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use arrow::array::*; use arrow::compute::SortOptions; @@ -1531,7 +1531,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data]).unwrap(); let input = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) .unwrap(); let sort_exec = Arc::new(SortExec::new( @@ -1602,7 +1602,7 @@ mod tests { }, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?, )); assert_eq!(DataType::Int32, *sort_exec.schema().field(0).data_type()); @@ -1688,7 +1688,7 @@ mod tests { }, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?, )); assert_eq!(DataType::Float32, *sort_exec.schema().field(0).data_type()); diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 2cc55d60292a..454a06855175 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -384,12 +384,12 @@ mod tests { use crate::coalesce_partitions::CoalescePartitionsExec; use crate::execution_plan::{Boundedness, EmissionType}; use crate::expressions::col; - use crate::memory::MemorySourceConfig; use crate::metrics::{MetricValue, Timestamp}; use crate::repartition::RepartitionExec; use crate::sorts::sort::SortExec; use crate::stream::RecordBatchReceiverStream; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use crate::test::{self, assert_is_pending, make_partition}; use crate::{collect, common}; @@ -451,7 +451,7 @@ mod tests { ]); let repartition_exec = RepartitionExec::try_new( - MemorySourceConfig::try_new_exec(&[rbs], schema, None).unwrap(), + TestMemoryExec::try_new_exec(&[rbs], schema, 
None).unwrap(), Partitioning::RoundRobinBatch(2), )?; let coalesce_batches_exec = @@ -543,7 +543,7 @@ mod tests { let schema = batch.schema(); let sort = LexOrdering::default(); // no sort expressions - let exec = MemorySourceConfig::try_new_exec( + let exec = TestMemoryExec::try_new_exec( &[vec![batch.clone()], vec![batch]], schema, None, @@ -736,7 +736,7 @@ mod tests { options: Default::default(), }, ]); - let exec = MemorySourceConfig::try_new_exec(partitions, schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(partitions, schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, context).await.unwrap(); @@ -844,7 +844,7 @@ mod tests { let sorted = basic_sort(csv, sort, context).await; let split: Vec<_> = sizes.iter().map(|x| split_batch(&sorted, *x)).collect(); - Ok(MemorySourceConfig::try_new_exec(&split, sorted.schema(), None).unwrap()) + Ok(TestMemoryExec::try_new_exec(&split, sorted.schema(), None).unwrap()) } #[tokio::test] @@ -972,8 +972,8 @@ mod tests { }, }, ]); - let exec = MemorySourceConfig::try_new_exec(&[vec![b1], vec![b2]], schema, None) - .unwrap(); + let exec = + TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); @@ -1015,8 +1015,7 @@ mod tests { nulls_first: true, }, }]); - let exec = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec).with_fetch(Some(2))); @@ -1051,8 +1050,7 @@ mod tests { nulls_first: true, }, }]); - let exec = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); @@ -1161,8 +1159,8 @@ mod tests { expr: col("b", &schema).unwrap(), options: Default::default(), }]); - let exec = MemorySourceConfig::try_new_exec(&[vec![b1], vec![b2]], schema, None) - .unwrap(); + let exec = + TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(Arc::clone(&merge) as Arc, task_ctx) @@ -1273,7 +1271,7 @@ mod tests { }, }]); - let exec = MemorySourceConfig::try_new_exec(&partitions, schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&partitions, schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs index ad0e43503b2b..7d0e3778452f 100644 --- a/datafusion/physical-plan/src/test.rs +++ b/datafusion/physical-plan/src/test.rs @@ -17,27 +17,337 @@ //! 
Utilities for testing datafusion-physical-plan +use std::any::Any; use std::collections::HashMap; +use std::fmt; +use std::fmt::{Debug, Formatter}; use std::pin::Pin; use std::sync::Arc; +use std::task::Context; + +use crate::common; +use crate::execution_plan::{Boundedness, EmissionType}; +use crate::memory::MemoryStream; +use crate::metrics::MetricsSet; +use crate::stream::RecordBatchStreamAdapter; +use crate::streaming::PartitionStream; +use crate::ExecutionPlan; +use crate::{DisplayAs, DisplayFormatType, PlanProperties}; use arrow::array::{Array, ArrayRef, Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion_common::{ + config::ConfigOptions, internal_err, project_schema, Result, Statistics, +}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use futures::{Future, FutureExt}; +use datafusion_physical_expr::{ + equivalence::ProjectionMapping, expressions::Column, utils::collect_columns, + EquivalenceProperties, LexOrdering, Partitioning, +}; -use crate::memory::MemorySourceConfig; -use crate::source::DataSourceExec; -use crate::stream::RecordBatchStreamAdapter; -use crate::streaming::PartitionStream; -use crate::ExecutionPlan; +use futures::{Future, FutureExt}; pub mod exec; +/// `TestMemoryExec` is a mock equivalent to [`MemorySourceConfig`] with [`ExecutionPlan`] implemented for testing. +/// i.e. It has some but not all the functionality of [`MemorySourceConfig`]. +/// This implements an in-memory DataSource rather than explicitly implementing a trait. +/// It is implemented in this manner to keep relevant unit tests in place +/// while avoiding circular dependencies between `datafusion-physical-plan` and `datafusion-datasource`. +/// +/// [`MemorySourceConfig`]: https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs +#[derive(Clone, Debug)] +pub struct TestMemoryExec { + /// The partitions to query + partitions: Vec>, + /// Schema representing the data before projection + schema: SchemaRef, + /// Schema representing the data after the optional projection is applied + projected_schema: SchemaRef, + /// Optional projection + projection: Option>, + /// Sort information: one or more equivalent orderings + sort_information: Vec, + /// if partition sizes should be displayed + show_sizes: bool, + /// The maximum number of records to read from this plan. If `None`, + /// all records after filtering are returned. 
+ fetch: Option, + cache: PlanProperties, +} + +impl DisplayAs for TestMemoryExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result { + write!(f, "DataSourceExec: ")?; + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let partition_sizes: Vec<_> = + self.partitions.iter().map(|b| b.len()).collect(); + + let output_ordering = self + .sort_information + .first() + .map(|output_ordering| { + format!(", output_ordering={}", output_ordering) + }) + .unwrap_or_default(); + + let eq_properties = self.eq_properties(); + let constraints = eq_properties.constraints(); + let constraints = if constraints.is_empty() { + String::new() + } else { + format!(", {}", constraints) + }; + + let limit = self + .fetch + .map_or(String::new(), |limit| format!(", fetch={}", limit)); + if self.show_sizes { + write!( + f, + "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}", + partition_sizes.len(), + ) + } else { + write!( + f, + "partitions={}{limit}{output_ordering}{constraints}", + partition_sizes.len(), + ) + } + } + } + } +} + +impl ExecutionPlan for TestMemoryExec { + fn name(&self) -> &'static str { + "DataSourceExec" + } + + fn as_any(&self) -> &dyn Any { + unimplemented!() + } + + fn properties(&self) -> &PlanProperties { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + Vec::new() + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + unimplemented!() + } + + fn repartitioned( + &self, + _target_partitions: usize, + _config: &ConfigOptions, + ) -> Result>> { + unimplemented!() + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + self.open(partition, context) + } + + fn metrics(&self) -> Option { + unimplemented!() + } + + fn statistics(&self) -> Result { + self.statistics() + } + + fn fetch(&self) -> Option { + self.fetch + } +} + +impl TestMemoryExec { + fn open( + &self, + partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin( + MemoryStream::try_new( + self.partitions[partition].clone(), + Arc::clone(&self.projected_schema), + self.projection.clone(), + )? 
+ .with_fetch(self.fetch), + )) + } + + fn compute_properties(&self) -> PlanProperties { + PlanProperties::new( + self.eq_properties(), + self.output_partitioning(), + EmissionType::Incremental, + Boundedness::Bounded, + ) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.partitions.len()) + } + + fn eq_properties(&self) -> EquivalenceProperties { + EquivalenceProperties::new_with_orderings( + Arc::clone(&self.projected_schema), + self.sort_information.as_slice(), + ) + } + + fn statistics(&self) -> Result { + Ok(common::compute_record_batch_statistics( + &self.partitions, + &self.schema, + self.projection.clone(), + )) + } + + pub fn try_new( + partitions: &[Vec], + schema: SchemaRef, + projection: Option>, + ) -> Result { + let projected_schema = project_schema(&schema, projection.as_ref())?; + Ok(Self { + partitions: partitions.to_vec(), + schema, + cache: PlanProperties::new( + EquivalenceProperties::new_with_orderings( + Arc::clone(&projected_schema), + vec![].as_slice(), + ), + Partitioning::UnknownPartitioning(partitions.len()), + EmissionType::Incremental, + Boundedness::Bounded, + ), + projected_schema, + projection, + sort_information: vec![], + show_sizes: true, + fetch: None, + }) + } + + /// Create a new `DataSourceExec` Equivalent plan for reading in-memory record batches + /// The provided `schema` should not have the projection applied. + pub fn try_new_exec( + partitions: &[Vec], + schema: SchemaRef, + projection: Option>, + ) -> Result> { + let mut source = Self::try_new(partitions, schema, projection)?; + let cache = source.compute_properties(); + source.cache = cache; + Ok(Arc::new(source)) + } + + // Equivalent of `DataSourceExec::new` + pub fn update_cache(source: Arc) -> TestMemoryExec { + let cache = source.compute_properties(); + let source = &*source; + let mut source = source.clone(); + source.cache = cache; + source + } + + /// Set the limit of the files + pub fn with_limit(mut self, limit: Option) -> Self { + self.fetch = limit; + self + } + + /// Ref to partitions + pub fn partitions(&self) -> &[Vec] { + &self.partitions + } + + /// Ref to projection + pub fn projection(&self) -> &Option> { + &self.projection + } + + /// Ref to sort information + pub fn sort_information(&self) -> &[LexOrdering] { + &self.sort_information + } + + /// refer to `try_with_sort_information` at MemorySourceConfig for more information. 
+ /// https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs + pub fn try_with_sort_information( + mut self, + mut sort_information: Vec, + ) -> Result { + // All sort expressions must refer to the original schema + let fields = self.schema.fields(); + let ambiguous_column = sort_information + .iter() + .flat_map(|ordering| ordering.clone()) + .flat_map(|expr| collect_columns(&expr.expr)) + .find(|col| { + fields + .get(col.index()) + .map(|field| field.name() != col.name()) + .unwrap_or(true) + }); + if let Some(col) = ambiguous_column { + return internal_err!( + "Column {:?} is not found in the original schema of the TestMemoryExec", + col + ); + } + + // If there is a projection on the source, we also need to project orderings + if let Some(projection) = &self.projection { + let base_eqp = EquivalenceProperties::new_with_orderings( + self.original_schema(), + &sort_information, + ); + let proj_exprs = projection + .iter() + .map(|idx| { + let base_schema = self.original_schema(); + let name = base_schema.field(*idx).name(); + (Arc::new(Column::new(name, *idx)) as _, name.to_string()) + }) + .collect::>(); + let projection_mapping = + ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?; + sort_information = base_eqp + .project(&projection_mapping, Arc::clone(&self.projected_schema)) + .into_oeq_class() + .into_inner(); + } + + self.sort_information = sort_information; + Ok(self) + } + + /// Arc clone of ref to original schema + pub fn original_schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + /// Asserts that given future is pending. pub fn assert_is_pending<'a, T>(fut: &mut Pin + Send + 'a>>) { let waker = futures::task::noop_waker(); - let mut cx = futures::task::Context::from_waker(&waker); + let mut cx = Context::from_waker(&waker); let poll = fut.poll_unpin(&mut cx); assert!(poll.is_pending()); @@ -117,7 +427,7 @@ pub fn build_table_scan_i32( ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } /// Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i" @@ -157,26 +467,24 @@ pub fn scan_partitioned_utf8(partitions: usize) -> Arc { } /// Returns a `DataSourceExec` that scans `partitions` of 100 batches each -pub fn mem_exec(partitions: usize) -> DataSourceExec { +pub fn mem_exec(partitions: usize) -> TestMemoryExec { let data: Vec> = (0..partitions).map(|_| vec![make_partition(100)]).collect(); let schema = data[0][0].schema(); let projection = None; - DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&data, schema, projection).unwrap(), - )) + + TestMemoryExec::try_new(&data, schema, projection).unwrap() } -pub fn mem_exec_utf8(partitions: usize) -> DataSourceExec { +pub fn mem_exec_utf8(partitions: usize) -> TestMemoryExec { let data: Vec> = (0..partitions) .map(|_| vec![make_partition_utf8(100)]) .collect(); let schema = data[0][0].schema(); let projection = None; - DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&data, schema, projection).unwrap(), - )) + + TestMemoryExec::try_new(&data, schema, projection).unwrap() } // Construct a stream partition for test purposes diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index e1972d267b97..68d1803b7133 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -640,10 +640,9 @@ 
fn stats_union(mut left: Statistics, right: Statistics) -> Statistics { mod tests { use super::*; use crate::collect; - use crate::memory::MemorySourceConfig; use crate::test; + use crate::test::TestMemoryExec; - use crate::source::DataSourceExec; use arrow::compute::SortOptions; use arrow::datatypes::DataType; use datafusion_common::ScalarValue; @@ -865,12 +864,12 @@ mod tests { .iter() .map(|ordering| convert_to_sort_exprs(ordering)) .collect::>(); - let child1 = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[], Arc::clone(&schema), None)? + let child1 = Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[], Arc::clone(&schema), None)? .try_with_sort_information(first_orderings)?, ))); - let child2 = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[], Arc::clone(&schema), None)? + let child2 = Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[], Arc::clone(&schema), None)? .try_with_sort_information(second_orderings)?, ))); diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index 6ab5cc84a21f..b90c50510cb0 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -20,13 +20,12 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::execution_plan::{Boundedness, EmissionType}; +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::{ - memory::MemoryStream, ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, - PhysicalExpr, + ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, }; - use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 1e21d0757c41..c78c870ff383 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1190,9 +1190,9 @@ mod tests { use crate::common::collect; use crate::expressions::PhysicalSortExpr; - use crate::memory::MemorySourceConfig; use crate::projection::ProjectionExec; use crate::streaming::{PartitionStream, StreamingTableExec}; + use crate::test::TestMemoryExec; use crate::windows::{ create_udwf_window_expr, create_window_expr, BoundedWindowAggExec, InputOrderMode, }; @@ -1551,7 +1551,7 @@ mod tests { vec![Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3]))], )?; - let memory_exec = MemorySourceConfig::try_new_exec( + let memory_exec = TestMemoryExec::try_new_exec( &[vec![batch.clone(), batch.clone(), batch.clone()]], Arc::clone(&schema), None, diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index a6ced527cbb2..d3d29bfad7ce 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -20,12 +20,12 @@ use std::any::Any; use std::sync::{Arc, Mutex}; -use super::{ +use crate::execution_plan::{Boundedness, EmissionType}; +use crate::memory::MemoryStream; +use crate::{ metrics::{ExecutionPlanMetricsSet, MetricsSet}, SendableRecordBatchStream, Statistics, }; -use crate::execution_plan::{Boundedness, EmissionType}; -use 
crate::memory::MemoryStream; use crate::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; use arrow::datatypes::SchemaRef; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index a575a42d0b6c..2c596255587b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -32,6 +32,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::physical_plan::{AvroSource, CsvSource, FileScanConfig}; +use datafusion::datasource::source::DataSourceExec; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; use datafusion::physical_expr::aggregate::AggregateFunctionExpr; @@ -57,7 +58,6 @@ use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::union::{InterleaveExec, UnionExec}; use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; use datafusion::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 3fc94a33442b..e8c15731228c 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -23,8 +23,8 @@ use crate::variation_const::{ }; use datafusion::arrow::datatypes::DataType; +use datafusion::datasource::source::DataSourceExec; use datafusion::error::{DataFusionError, Result}; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; From faace2cddcfa47c99455b0f3c60403a8899b3d11 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 21 Feb 2025 12:51:16 +0100 Subject: [PATCH 42/71] Fix Clippy 1.85 warnings (#14800) --- datafusion/core/tests/execution/logical_plan.rs | 6 +++--- datafusion/execution/src/runtime_env.rs | 6 ++---- datafusion/physical-expr-common/src/binary_map.rs | 2 +- datafusion/sql/src/expr/value.rs | 2 +- test-utils/src/array_gen/string.rs | 1 - 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs index a17bb5eec8a3..b30636ddf6a8 100644 --- a/datafusion/core/tests/execution/logical_plan.rs +++ b/datafusion/core/tests/execution/logical_plan.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +//! Logical plans need to provide stable semantics, as downstream projects +//! create them and depend on them. Test executable semantics of logical plans. + use arrow::array::Int64Array; use arrow::datatypes::{DataType, Field}; use datafusion::execution::session_state::SessionStateBuilder; @@ -30,9 +33,6 @@ use std::fmt::Debug; use std::ops::Deref; use std::sync::Arc; -///! Logical plans need to provide stable semantics, as downstream projects -///! create them and depend on them. Test executable semantics of logical plans. 
- #[tokio::test] async fn count_only_nulls() -> Result<()> { // Input: VALUES (NULL), (NULL), (NULL) AS _(col) diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index 2b08b7ff9e88..95f14f485792 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -27,7 +27,7 @@ use crate::{ }; use crate::cache::cache_manager::{CacheManager, CacheManagerConfig}; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::Result; use object_store::ObjectStore; use std::path::PathBuf; use std::sync::Arc; @@ -150,9 +150,7 @@ impl RuntimeEnv { /// registry. See [`ObjectStoreRegistry::get_store`] for more /// details. pub fn object_store(&self, url: impl AsRef) -> Result> { - self.object_store_registry - .get_store(url.as_ref()) - .map_err(DataFusionError::from) + self.object_store_registry.get_store(url.as_ref()) } } diff --git a/datafusion/physical-expr-common/src/binary_map.rs b/datafusion/physical-expr-common/src/binary_map.rs index 809c619e9845..b37d9a7773ee 100644 --- a/datafusion/physical-expr-common/src/binary_map.rs +++ b/datafusion/physical-expr-common/src/binary_map.rs @@ -384,7 +384,7 @@ where // value is "small" let payload = if value.len() <= SHORT_VALUE_LEN { - let inline = value.iter().fold(0usize, |acc, &x| acc << 8 | x as usize); + let inline = value.iter().fold(0usize, |acc, &x| (acc << 8) | x as usize); // is value is already present in the set? let entry = self.map.find_mut(hash, |header| { diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index e81bfa0dc55f..168348aee222 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -304,7 +304,7 @@ fn try_decode_hex_literal(s: &str) -> Option> { for i in (start_idx..hex_bytes.len()).step_by(2) { let high = try_decode_hex_char(hex_bytes[i])?; let low = try_decode_hex_char(hex_bytes[i + 1])?; - decoded_bytes.push(high << 4 | low); + decoded_bytes.push((high << 4) | low); } Some(decoded_bytes) diff --git a/test-utils/src/array_gen/string.rs b/test-utils/src/array_gen/string.rs index a405cb76b1bd..e2a983612b8b 100644 --- a/test-utils/src/array_gen/string.rs +++ b/test-utils/src/array_gen/string.rs @@ -97,7 +97,6 @@ fn random_string(rng: &mut StdRng, max_len: usize) -> String { let len = rng.gen_range(1..=max_len); rng.sample_iter::(rand::distributions::Standard) .take(len) - .map(char::from) .collect::() } } From 9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 21 Feb 2025 12:23:44 +0000 Subject: [PATCH 43/71] Allow `FileSource`-specific repartitioning (#14754) * FileSource specific repartitioning * fix doc typo * remove * Avro doesn't support repartitioning --- datafusion/core/src/datasource/data_source.rs | 34 ++++++++++++++++--- .../datasource/physical_plan/arrow_file.rs | 4 --- .../core/src/datasource/physical_plan/avro.rs | 13 ++++--- .../core/src/datasource/physical_plan/csv.rs | 3 -- .../physical_plan/file_scan_config.rs | 31 ++++++----------- .../core/src/datasource/physical_plan/json.rs | 4 --- .../physical_plan/parquet/source.rs | 3 -- 7 files changed, 50 insertions(+), 42 deletions(-) diff --git a/datafusion/core/src/datasource/data_source.rs b/datafusion/core/src/datasource/data_source.rs index d31b68019e30..fcb31194eab1 100644 --- a/datafusion/core/src/datasource/data_source.rs +++ b/datafusion/core/src/datasource/data_source.rs @@ -26,6 +26,8 @@ use crate::datasource::physical_plan::{FileOpener, FileScanConfig}; use 
arrow::datatypes::SchemaRef; use datafusion_common::Statistics; +use datafusion_datasource::file_groups::FileGroupPartitioner; +use datafusion_physical_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; @@ -62,9 +64,33 @@ pub trait FileSource: Send + Sync { fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result { Ok(()) } - /// Return true if the file format supports repartition + + /// If supported by the [`FileSource`], redistribute files across partitions according to their size. + /// Allows custom file formats to implement their own repartitioning logic. /// - /// If this returns true, the DataSourceExec may repartition the data - /// by breaking up the input files into multiple smaller groups. - fn supports_repartition(&self, config: &FileScanConfig) -> bool; + /// Provides a default repartitioning behavior, see comments on [`FileGroupPartitioner`] for more detail. + fn repartitioned( + &self, + target_partitions: usize, + repartition_file_min_size: usize, + output_ordering: Option, + config: &FileScanConfig, + ) -> datafusion_common::Result> { + if config.file_compression_type.is_compressed() || config.new_lines_in_values { + return Ok(None); + } + + let repartitioned_file_groups_option = FileGroupPartitioner::new() + .with_target_partitions(target_partitions) + .with_repartition_file_min_size(repartition_file_min_size) + .with_preserve_order_within_groups(output_ordering.is_some()) + .repartition_file_groups(&config.file_groups); + + if let Some(repartitioned_file_groups) = repartitioned_file_groups_option { + let mut source = config.clone(); + source.file_groups = repartitioned_file_groups; + return Ok(Some(source)); + } + Ok(None) + } } diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index c6e05893a979..d0d037924862 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -256,10 +256,6 @@ impl FileSource for ArrowSource { fn file_type(&self) -> &str { "arrow" } - - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - !(config.file_compression_type.is_compressed() || config.new_lines_in_values) - } } /// The struct arrow that implements `[FileOpener]` trait diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 1a88dc31a64d..ae98c19a1615 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -255,10 +255,15 @@ impl FileSource for AvroSource { fn file_type(&self) -> &str { "avro" } - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - !(config.file_compression_type.is_compressed() - || config.new_lines_in_values - || self.as_any().downcast_ref::().is_some()) + + fn repartitioned( + &self, + _target_partitions: usize, + _repartition_file_min_size: usize, + _output_ordering: Option, + _config: &FileScanConfig, + ) -> Result> { + Ok(None) } } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 412c90726af0..8fcfd6b41e85 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -618,9 +618,6 @@ impl FileSource for CsvSource { fn fmt_extra(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result 
{ write!(f, ", has_header={}", self.has_header) } - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - !(config.file_compression_type.is_compressed() || config.new_lines_in_values) - } } impl FileOpener for CsvOpener { diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 4996b6d97b58..5c882ed75109 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -19,8 +19,8 @@ //! file sources. use super::{ - get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupPartitioner, - FileGroupsDisplay, FileStream, + get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupsDisplay, + FileStream, }; use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; @@ -203,30 +203,21 @@ impl DataSource for FileScanConfig { self.fmt_file_source(t, f) } - /// Redistribute files across partitions according to their size - /// See comments on [`FileGroupPartitioner`] for more detail. + /// If supported by the underlying [`FileSource`], redistribute files across partitions according to their size. fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option, ) -> Result>> { - if !self.source.supports_repartition(self) { - return Ok(None); - } - - let repartitioned_file_groups_option = FileGroupPartitioner::new() - .with_target_partitions(target_partitions) - .with_repartition_file_min_size(repartition_file_min_size) - .with_preserve_order_within_groups(output_ordering.is_some()) - .repartition_file_groups(&self.file_groups); - - if let Some(repartitioned_file_groups) = repartitioned_file_groups_option { - let mut source = self.clone(); - source.file_groups = repartitioned_file_groups; - return Ok(Some(Arc::new(source))); - } - Ok(None) + let source = self.source.repartitioned( + target_partitions, + repartition_file_min_size, + output_ordering, + self, + )?; + + Ok(source.map(|s| Arc::new(s) as _)) } fn output_partitioning(&self) -> Partitioning { diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 249f50efa544..f2304ed8a342 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -313,10 +313,6 @@ impl FileSource for JsonSource { fn file_type(&self) -> &str { "json" } - - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - !(config.file_compression_type.is_compressed() || config.new_lines_in_values) - } } impl FileOpener for JsonOpener { diff --git a/datafusion/core/src/datasource/physical_plan/parquet/source.rs b/datafusion/core/src/datasource/physical_plan/parquet/source.rs index 810a16de41af..26a5877e2d38 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/source.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/source.rs @@ -586,7 +586,4 @@ impl FileSource for ParquetSource { } } } - fn supports_repartition(&self, _config: &FileScanConfig) -> bool { - true - } } From 3750dc9eb9586e8e108a347e097f2da4d99fce0d Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 21 Feb 2025 16:32:08 +0100 Subject: [PATCH 44/71] Bump MSRV to 1.82, toolchain to 1.85 (#14811) * Bump MSRV to 1.82, toolchain to 1.85 * Fix some clippy warnings * Fix more clippy 
warnings --- Cargo.toml | 2 +- datafusion/common/src/table_reference.rs | 7 +++---- datafusion/expr/src/utils.rs | 2 +- datafusion/physical-expr-common/src/sort_expr.rs | 8 +++----- datafusion/physical-expr/src/equivalence/properties.rs | 2 +- rust-toolchain.toml | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b6098a636954..adb3ee23d947 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ homepage = "https://datafusion.apache.org" license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/datafusion" -rust-version = "1.81.0" +rust-version = "1.82.0" version = "45.0.0" [workspace.dependencies] diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index bb53a30dcb23..9b6f9696c00b 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -193,8 +193,7 @@ impl TableReference { match self { TableReference::Bare { table } => **table == *other.table(), TableReference::Partial { schema, table } => { - **table == *other.table() - && other.schema().map_or(true, |s| *s == **schema) + **table == *other.table() && other.schema().is_none_or(|s| *s == **schema) } TableReference::Full { catalog, @@ -202,8 +201,8 @@ impl TableReference { table, } => { **table == *other.table() - && other.schema().map_or(true, |s| *s == **schema) - && other.catalog().map_or(true, |c| *c == **catalog) + && other.schema().is_none_or(|s| *s == **schema) + && other.catalog().is_none_or(|c| *c == **catalog) } } } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 86c0f9ad637c..56c1e64554a9 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -832,7 +832,7 @@ pub fn exprlist_len( .enumerate() .filter_map(|(idx, field)| { let (maybe_table_ref, _) = schema.qualified_field(idx); - if maybe_table_ref.map_or(true, |q| q == qualifier) { + if maybe_table_ref.is_none_or(|q| q == qualifier) { Some((maybe_table_ref.cloned(), Arc::clone(field))) } else { None diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs index b150d3dc9bd3..601b2a23d09d 100644 --- a/datafusion/physical-expr-common/src/sort_expr.rs +++ b/datafusion/physical-expr-common/src/sort_expr.rs @@ -172,13 +172,11 @@ impl PhysicalSortExpr { let nullable = self.expr.nullable(schema).unwrap_or(true); self.expr.eq(&requirement.expr) && if nullable { - requirement - .options - .map_or(true, |opts| self.options == opts) + requirement.options.is_none_or(|opts| self.options == opts) } else { requirement .options - .map_or(true, |opts| self.options.descending == opts.descending) + .is_none_or(|opts| self.options.descending == opts.descending) } } } @@ -293,7 +291,7 @@ impl PhysicalSortRequirement { self.expr.eq(&other.expr) && other .options - .map_or(true, |other_opts| self.options == Some(other_opts)) + .is_none_or(|other_opts| self.options == Some(other_opts)) } #[deprecated(since = "43.0.0", note = "use LexRequirement::from_lex_ordering")] diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 96208cc5e32c..838cb26807a9 100755 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -640,7 +640,7 @@ impl EquivalenceProperties { req.expr.eq(&existing.expr) && req .options - .map_or(true, |req_opts| req_opts == existing.options) + .is_none_or(|req_opts| 
req_opts == existing.options) }, ) }) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index bd764d201018..11f4fb798c37 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -19,5 +19,5 @@ # to compile this workspace and run CI jobs. [toolchain] -channel = "1.84.1" +channel = "1.85.0" components = ["rustfmt", "clippy"] From a8e1f2fa1859d38c64f3811550854c2ad1e53957 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 10:32:42 -0500 Subject: [PATCH 45/71] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb --- datafusion/ffi/src/execution_plan.rs | 62 ++++++++++++++++++++++++---- datafusion/ffi/src/insert_op.rs | 23 +++++++++++ datafusion/ffi/src/table_source.rs | 40 ++++++++++++++++++ datafusion/ffi/src/volatility.rs | 21 ++++++++++ 4 files changed, 139 insertions(+), 7 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 8087acfa33c8..00602474d621 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -219,17 +219,17 @@ impl TryFrom<&FFI_ExecutionPlan> for ForeignExecutionPlan { let properties: PlanProperties = (plan.properties)(plan).try_into()?; let children_rvec = (plan.children)(plan); - let children: Result> = children_rvec + let children = children_rvec .iter() .map(ForeignExecutionPlan::try_from) .map(|child| child.map(|c| Arc::new(c) as Arc)) - .collect(); + .collect::>>()?; Ok(Self { name, plan: plan.clone(), properties, - children: children?, + children, }) } } @@ -281,6 +281,7 @@ impl ExecutionPlan for ForeignExecutionPlan { #[cfg(test)] mod tests { + use arrow::datatypes::{DataType, Field, Schema}; use datafusion::{ physical_plan::{ execution_plan::{Boundedness, EmissionType}, @@ -294,6 +295,7 @@ mod tests { #[derive(Debug)] pub struct EmptyExec { props: PlanProperties, + children: Vec>, } impl EmptyExec { @@ -305,6 +307,7 @@ mod tests { EmissionType::Incremental, Boundedness::Bounded, ), + children: Vec::default(), } } } @@ -333,14 +336,17 @@ mod tests { } fn children(&self) -> Vec<&Arc> { - vec![] + self.children.iter().collect() } fn with_new_children( self: Arc, - _: Vec>, + children: Vec>, ) -> Result> { - unimplemented!() + Ok(Arc::new(EmptyExec { + props: self.props.clone(), + children, + })) } fn execute( @@ -358,7 +364,6 @@ mod tests { #[test] fn test_round_trip_ffi_execution_plan() -> Result<()> { - use arrow::datatypes::{DataType, Field, Schema}; let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); let ctx = SessionContext::new(); @@ -372,6 +377,49 @@ mod tests { assert!(original_name == foreign_plan.name()); + let display = datafusion::physical_plan::display::DisplayableExecutionPlan::new( + &foreign_plan, + ); + + let buf = display.one_line().to_string(); + assert_eq!(buf.trim(), "FFI_ExecutionPlan(number_of_children=0)"); + + Ok(()) + } + + #[test] + fn test_ffi_execution_plan_children() -> Result<()> { + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); + let ctx = SessionContext::new(); + + // Version 1: Adding child to the foreign plan + let child_plan = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let child_local = FFI_ExecutionPlan::new(child_plan, ctx.task_ctx(), None); + let child_foreign = 
Arc::new(ForeignExecutionPlan::try_from(&child_local)?); + + let parent_plan = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let parent_local = FFI_ExecutionPlan::new(parent_plan, ctx.task_ctx(), None); + let parent_foreign = Arc::new(ForeignExecutionPlan::try_from(&parent_local)?); + + assert_eq!(parent_foreign.children().len(), 0); + assert_eq!(child_foreign.children().len(), 0); + + let parent_foreign = parent_foreign.with_new_children(vec![child_foreign])?; + assert_eq!(parent_foreign.children().len(), 1); + + // Version 2: Adding child to the local plan + let child_plan = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let child_local = FFI_ExecutionPlan::new(child_plan, ctx.task_ctx(), None); + let child_foreign = Arc::new(ForeignExecutionPlan::try_from(&child_local)?); + + let parent_plan = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let parent_plan = parent_plan.with_new_children(vec![child_foreign])?; + let parent_local = FFI_ExecutionPlan::new(parent_plan, ctx.task_ctx(), None); + let parent_foreign = Arc::new(ForeignExecutionPlan::try_from(&parent_local)?); + + assert_eq!(parent_foreign.children().len(), 1); + Ok(()) } } diff --git a/datafusion/ffi/src/insert_op.rs b/datafusion/ffi/src/insert_op.rs index e44262377405..8e8693076cc0 100644 --- a/datafusion/ffi/src/insert_op.rs +++ b/datafusion/ffi/src/insert_op.rs @@ -47,3 +47,26 @@ impl From for FFI_InsertOp { } } } + +#[cfg(test)] +mod tests { + use datafusion::logical_expr::dml::InsertOp; + + use super::FFI_InsertOp; + + fn test_round_trip_insert_op(insert_op: InsertOp) { + let ffi_insert_op: FFI_InsertOp = insert_op.into(); + let round_trip: InsertOp = ffi_insert_op.into(); + + assert_eq!(insert_op, round_trip); + } + + /// This test ensures we have not accidentally mapped the FFI + /// enums to the wrong internal enums values. 
+ #[test] + fn test_all_round_trip_insert_ops() { + test_round_trip_insert_op(InsertOp::Append); + test_round_trip_insert_op(InsertOp::Overwrite); + test_round_trip_insert_op(InsertOp::Replace); + } +} diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs index a59836622ee6..418fdf16a564 100644 --- a/datafusion/ffi/src/table_source.rs +++ b/datafusion/ffi/src/table_source.rs @@ -85,3 +85,43 @@ impl From for FFI_TableType { } } } + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::error::Result; + + fn round_trip_filter_pushdown(pushdown: TableProviderFilterPushDown) -> Result<()> { + let ffi_pushdown: FFI_TableProviderFilterPushDown = (&pushdown).into(); + let round_trip: TableProviderFilterPushDown = (&ffi_pushdown).into(); + + assert_eq!(pushdown, round_trip); + Ok(()) + } + + #[test] + fn round_trip_all_filter_pushdowns() -> Result<()> { + round_trip_filter_pushdown(TableProviderFilterPushDown::Exact)?; + round_trip_filter_pushdown(TableProviderFilterPushDown::Inexact)?; + round_trip_filter_pushdown(TableProviderFilterPushDown::Unsupported)?; + + Ok(()) + } + + fn round_trip_table_type(table_type: TableType) -> Result<()> { + let ffi_type: FFI_TableType = table_type.into(); + let round_trip_type: TableType = ffi_type.into(); + + assert_eq!(table_type, round_trip_type); + Ok(()) + } + + #[test] + fn test_round_all_trip_table_type() -> Result<()> { + round_trip_table_type(TableType::Base)?; + round_trip_table_type(TableType::Temporary)?; + round_trip_table_type(TableType::View)?; + + Ok(()) + } +} diff --git a/datafusion/ffi/src/volatility.rs b/datafusion/ffi/src/volatility.rs index 8b565b91b76d..0aaf68a174cf 100644 --- a/datafusion/ffi/src/volatility.rs +++ b/datafusion/ffi/src/volatility.rs @@ -46,3 +46,24 @@ impl From<&FFI_Volatility> for Volatility { } } } + +#[cfg(test)] +mod tests { + use datafusion::logical_expr::Volatility; + + use super::FFI_Volatility; + + fn test_round_trip_volatility(volatility: Volatility) { + let ffi_volatility: FFI_Volatility = volatility.into(); + let round_trip: Volatility = (&ffi_volatility).into(); + + assert_eq!(volatility, round_trip); + } + + #[test] + fn test_all_round_trip_volatility() { + test_round_trip_volatility(Volatility::Immutable); + test_round_trip_volatility(Volatility::Stable); + test_round_trip_volatility(Volatility::Volatile); + } +} From 22156b2a6862e68495a82bd2579d3ba22c6c5cc0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 11:21:57 -0500 Subject: [PATCH 46/71] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb From e03f9f6767a69f16c5a96d5d3863cefc209497e2 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sat, 22 Feb 2025 07:45:46 +0800 Subject: [PATCH 47/71] Remove CountWildcardRule in Analyzer and move the functionality in ExprPlanner, add `plan_aggregate` and `plan_window` to planner (#14689) * count planner * window * update slt * remove rule * rm rule * doc * fix name * fix name * fix test * tpch test * fix avro * rename * switch to count(*) * use count(*) * rename * doc * rename window funciotn * fmt * rm print * upd logic * count null --- .../src/execution/session_state_defaults.rs | 2 + .../tests/dataframe/dataframe_functions.rs | 34 ++- datafusion/core/tests/dataframe/mod.rs | 30 +- 
datafusion/core/tests/sql/explain_analyze.rs | 2 +- datafusion/expr/src/expr.rs | 1 - datafusion/expr/src/planner.rs | 62 +++- datafusion/expr/src/udaf.rs | 21 +- datafusion/functions-aggregate/src/count.rs | 201 ++++++++++++- datafusion/functions-aggregate/src/lib.rs | 19 +- datafusion/functions-aggregate/src/planner.rs | 63 ++++ datafusion/functions-window/src/lib.rs | 3 + datafusion/functions-window/src/planner.rs | 61 ++++ .../src/analyzer/count_wildcard_rule.rs | 277 ------------------ datafusion/optimizer/src/analyzer/mod.rs | 3 - .../optimizer/tests/optimizer_integration.rs | 28 +- datafusion/sql/src/expr/function.rs | 70 ++++- datafusion/sql/tests/sql_integration.rs | 12 +- .../sqllogictest/test_files/aggregate.slt | 20 ++ datafusion/sqllogictest/test_files/avro.slt | 2 +- .../sqllogictest/test_files/coalesce.slt | 2 +- datafusion/sqllogictest/test_files/copy.slt | 1 - .../test_files/count_star_rule.slt | 32 +- datafusion/sqllogictest/test_files/ddl.slt | 1 - datafusion/sqllogictest/test_files/errors.slt | 2 +- .../sqllogictest/test_files/explain.slt | 3 +- datafusion/sqllogictest/test_files/insert.slt | 6 +- .../test_files/insert_to_external.slt | 4 +- datafusion/sqllogictest/test_files/joins.slt | 4 +- datafusion/sqllogictest/test_files/json.slt | 2 +- datafusion/sqllogictest/test_files/limit.slt | 8 +- .../optimizer_group_by_constant.slt | 16 +- datafusion/sqllogictest/test_files/select.slt | 2 +- .../sqllogictest/test_files/subquery.slt | 42 +-- .../test_files/tpch/plans/q1.slt.part | 2 +- .../test_files/tpch/plans/q13.slt.part | 2 +- .../test_files/tpch/plans/q21.slt.part | 2 +- .../test_files/tpch/plans/q22.slt.part | 2 +- .../test_files/tpch/plans/q4.slt.part | 2 +- datafusion/sqllogictest/test_files/union.slt | 6 +- datafusion/sqllogictest/test_files/window.slt | 12 +- .../tests/cases/consumer_integration.rs | 26 +- .../tests/cases/roundtrip_logical_plan.rs | 4 +- 42 files changed, 652 insertions(+), 442 deletions(-) create mode 100644 datafusion/functions-aggregate/src/planner.rs create mode 100644 datafusion/functions-window/src/planner.rs delete mode 100644 datafusion/optimizer/src/analyzer/count_wildcard_rule.rs diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs index 92f649781cfd..33bf01cf35cd 100644 --- a/datafusion/core/src/execution/session_state_defaults.rs +++ b/datafusion/core/src/execution/session_state_defaults.rs @@ -94,6 +94,8 @@ impl SessionStateDefaults { feature = "unicode_expressions" ))] Arc::new(functions::planner::UserDefinedFunctionPlanner), + Arc::new(functions_aggregate::planner::AggregateFunctionPlanner), + Arc::new(functions_window::planner::WindowFunctionPlanner), ]; expr_planners diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 29c24948fbf0..33f32e8f0f66 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -22,6 +22,7 @@ use arrow::{ array::{Int32Array, StringArray}, record_batch::RecordBatch, }; +use datafusion_functions_aggregate::count::count_all; use std::sync::Arc; use datafusion::error::Result; @@ -31,7 +32,7 @@ use datafusion::prelude::*; use datafusion::assert_batches_eq; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::expr::Alias; -use datafusion_expr::ExprSchemable; +use datafusion_expr::{table_scan, ExprSchemable, LogicalPlanBuilder}; use 
datafusion_functions_aggregate::expr_fn::{approx_median, approx_percentile_cont}; use datafusion_functions_nested::map::map; @@ -1123,3 +1124,34 @@ async fn test_fn_map() -> Result<()> { Ok(()) } + +/// Call count wildcard from dataframe API +#[tokio::test] +async fn test_count_wildcard() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::UInt32, false), + Field::new("c", DataType::UInt32, false), + ]); + + let table_scan = table_scan(Some("test"), &schema, None)?.build()?; + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("b")], vec![count_all()]) + .unwrap() + .project(vec![count_all()]) + .unwrap() + .sort(vec![count_all().sort(true, false)]) + .unwrap() + .build() + .unwrap(); + + let expected = "Sort: count(*) ASC NULLS LAST [count(*):Int64]\ + \n Projection: count(*) [count(*):Int64]\ + \n Aggregate: groupBy=[[test.b]], aggr=[[count(*)]] [b:UInt32, count(*):Int64]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + let formatted_plan = plan.display_indent_schema().to_string(); + assert_eq!(formatted_plan, expected); + + Ok(()) +} diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index d545157607c7..b05029e8e3b1 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -32,7 +32,8 @@ use arrow::datatypes::{ }; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; -use datafusion_functions_aggregate::count::count_udaf; +use datafusion_expr::utils::COUNT_STAR_EXPANSION; +use datafusion_functions_aggregate::count::{count_all, count_udaf}; use datafusion_functions_aggregate::expr_fn::{ array_agg, avg, count, count_distinct, max, median, min, sum, }; @@ -72,7 +73,7 @@ use datafusion_expr::expr::{GroupingSet, Sort, WindowFunction}; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, - scalar_subquery, when, wildcard, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, + scalar_subquery, when, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, ScalarFunctionImplementation, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; @@ -2463,8 +2464,8 @@ async fn test_count_wildcard_on_sort() -> Result<()> { let df_results = ctx .table("t1") .await? - .aggregate(vec![col("b")], vec![count(wildcard())])? - .sort(vec![count(wildcard()).sort(true, false)])? + .aggregate(vec![col("b")], vec![count_all()])? + .sort(vec![count_all().sort(true, false)])? .explain(false, false)? .collect() .await?; @@ -2498,8 +2499,8 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { Arc::new( ctx.table("t2") .await? - .aggregate(vec![], vec![count(wildcard())])? - .select(vec![count(wildcard())])? + .aggregate(vec![], vec![count_all()])? + .select(vec![count_all()])? .into_optimized_plan()?, ), ))? @@ -2532,8 +2533,8 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { .filter(exists(Arc::new( ctx.table("t2") .await? - .aggregate(vec![], vec![count(wildcard())])? - .select(vec![count(wildcard())])? + .aggregate(vec![], vec![count_all()])? + .select(vec![count_all()])? .into_unoptimized_plan(), // Usually, into_optimized_plan() should be used here, but due to // https://github.com/apache/datafusion/issues/5771, @@ -2568,7 +2569,7 @@ async fn test_count_wildcard_on_window() -> Result<()> { .await? 
.select(vec![Expr::WindowFunction(WindowFunction::new( WindowFunctionDefinition::AggregateUDF(count_udaf()), - vec![wildcard()], + vec![Expr::Literal(COUNT_STAR_EXPANSION)], )) .order_by(vec![Sort::new(col("a"), false, true)]) .window_frame(WindowFrame::new_bounds( @@ -2599,17 +2600,16 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { let sql_results = ctx .sql("select count(*) from t1") .await? - .select(vec![col("count(*)")])? .explain(false, false)? .collect() .await?; - // add `.select(vec![count(wildcard())])?` to make sure we can analyze all node instead of just top node. + // add `.select(vec![count_wildcard()])?` to make sure we can analyze all node instead of just top node. let df_results = ctx .table("t1") .await? - .aggregate(vec![], vec![count(wildcard())])? - .select(vec![count(wildcard())])? + .aggregate(vec![], vec![count_all()])? + .select(vec![count_all()])? .explain(false, false)? .collect() .await?; @@ -2646,8 +2646,8 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { ctx.table("t2") .await? .filter(out_ref_col(DataType::UInt32, "t1.a").eq(col("t2.a")))? - .aggregate(vec![], vec![count(wildcard())])? - .select(vec![col(count(wildcard()).to_string())])? + .aggregate(vec![], vec![count_all()])? + .select(vec![col(count_all().to_string())])? .into_unoptimized_plan(), )) .gt(lit(ScalarValue::UInt8(Some(0)))), diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index d4b5ae8b2820..128d1d0aa4b6 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -780,7 +780,7 @@ async fn explain_logical_plan_only() { let expected = vec![ vec![ "logical_plan", - "Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]\ + "Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n SubqueryAlias: t\ \n Projection: \ \n Values: (Utf8(\"a\"), Int64(1), Int64(100)), (Utf8(\"a\"), Int64(2), Int64(150))" diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index df79b3568ce6..f8baf9c94b3c 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -2294,7 +2294,6 @@ impl Display for SchemaDisplay<'_> { | Expr::OuterReferenceColumn(..) | Expr::Placeholder(_) | Expr::Wildcard { .. 
} => write!(f, "{}", self.0), - Expr::AggregateFunction(AggregateFunction { func, params }) => { match func.schema_name(params) { Ok(name) => { diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index 04cc26c910cb..a2ed0592efdb 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -25,9 +25,12 @@ use datafusion_common::{ config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema, Result, TableReference, }; -use sqlparser::ast; +use sqlparser::ast::{self, NullTreatment}; -use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF}; +use crate::{ + AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame, + WindowFunctionDefinition, WindowUDF, +}; /// Provides the `SQL` query planner meta-data about tables and /// functions referenced in SQL statements, without a direct dependency on the @@ -138,7 +141,7 @@ pub trait ExprPlanner: Debug + Send + Sync { /// Plan an array literal, such as `[1, 2, 3]` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_array_literal( &self, exprs: Vec, @@ -149,14 +152,14 @@ pub trait ExprPlanner: Debug + Send + Sync { /// Plan a `POSITION` expression, such as `POSITION( in )` /// - /// returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_position(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } /// Plan a dictionary literal, such as `{ key: value, ...}` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_dictionary_literal( &self, expr: RawDictionaryExpr, @@ -167,14 +170,14 @@ pub trait ExprPlanner: Debug + Send + Sync { /// Plan an extract expression, such as`EXTRACT(month FROM foo)` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_extract(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } /// Plan an substring expression, such as `SUBSTRING( [FROM ] [FOR ])` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_substring(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } @@ -195,14 +198,14 @@ pub trait ExprPlanner: Debug + Send + Sync { /// Plans an overlay expression, such as `overlay(str PLACING substr FROM pos [FOR count])` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_overlay(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } /// Plans a `make_map` expression, such as `make_map(key1, value1, key2, value2, ...)` /// - /// Returns origin expression arguments if not possible + /// Returns original expression arguments if not possible fn plan_make_map(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } @@ -230,6 +233,23 @@ pub trait ExprPlanner: Debug + Send + Sync { fn plan_any(&self, expr: RawBinaryExpr) -> Result> { Ok(PlannerResult::Original(expr)) } + + /// Plans aggregate functions, such as `COUNT()` + /// + /// Returns original expression arguments if not possible + fn plan_aggregate( + &self, + expr: RawAggregateExpr, + ) -> Result> { + Ok(PlannerResult::Original(expr)) + } + + /// Plans window functions, such as `COUNT()` + /// + /// Returns original expression arguments if not possible + 
fn plan_window(&self, expr: RawWindowExpr) -> Result> { + Ok(PlannerResult::Original(expr)) + } } /// An operator with two arguments to plan @@ -266,6 +286,30 @@ pub struct RawDictionaryExpr { pub values: Vec, } +/// This structure is used by `AggregateFunctionPlanner` to plan operators with +/// custom expressions. +#[derive(Debug, Clone)] +pub struct RawAggregateExpr { + pub func: Arc, + pub args: Vec, + pub distinct: bool, + pub filter: Option>, + pub order_by: Option>, + pub null_treatment: Option, +} + +/// This structure is used by `WindowFunctionPlanner` to plan operators with +/// custom expressions. +#[derive(Debug, Clone)] +pub struct RawWindowExpr { + pub func_def: WindowFunctionDefinition, + pub args: Vec, + pub partition_by: Vec, + pub order_by: Vec, + pub window_frame: WindowFrame, + pub null_treatment: Option, +} + /// Result of planning a raw expr with [`ExprPlanner`] #[derive(Debug, Clone)] pub enum PlannerResult { diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 2b9e2bddd184..ae7196c9b10f 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -515,9 +515,9 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { null_treatment, } = params; - let mut schema_name = String::new(); + let mut display_name = String::new(); - schema_name.write_fmt(format_args!( + display_name.write_fmt(format_args!( "{}({}{})", self.name(), if *distinct { "DISTINCT " } else { "" }, @@ -525,17 +525,22 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { ))?; if let Some(nt) = null_treatment { - schema_name.write_fmt(format_args!(" {}", nt))?; + display_name.write_fmt(format_args!(" {}", nt))?; } if let Some(fe) = filter { - schema_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?; + display_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?; } - if let Some(order_by) = order_by { - schema_name - .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?; + if let Some(ob) = order_by { + display_name.write_fmt(format_args!( + " ORDER BY [{}]", + ob.iter() + .map(|o| format!("{o}")) + .collect::>() + .join(", ") + ))?; } - Ok(schema_name) + Ok(display_name) } /// Returns the user-defined display name of function, given the arguments diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index cb59042ef468..c11329d7f5b3 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -17,11 +17,15 @@ use ahash::RandomState; use datafusion_common::stats::Precision; +use datafusion_expr::expr::{ + schema_name_from_exprs, schema_name_from_sorts, AggregateFunctionParams, + WindowFunctionParams, +}; use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator; use datafusion_macros::user_doc; use datafusion_physical_expr::expressions; use std::collections::HashSet; -use std::fmt::Debug; +use std::fmt::{Debug, Write}; use std::mem::{size_of, size_of_val}; use std::ops::BitAnd; use std::sync::Arc; @@ -47,11 +51,11 @@ use datafusion_common::{ downcast_value, internal_err, not_impl_err, Result, ScalarValue, }; use datafusion_expr::function::StateFieldsArgs; +use datafusion_expr::{expr_vec_fmt, Expr, ReversedUDAF, StatisticsArgs, TypeSignature}; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator, SetMonotonicity, Signature, Volatility, }; -use datafusion_expr::{Expr, ReversedUDAF, StatisticsArgs, TypeSignature}; use 
datafusion_functions_aggregate_common::aggregate::count_distinct::{ BytesDistinctCountAccumulator, FloatDistinctCountAccumulator, PrimitiveDistinctCountAccumulator, @@ -79,6 +83,11 @@ pub fn count_distinct(expr: Expr) -> Expr { )) } +/// Creates aggregation to count all rows, equivalent to `COUNT(*)`, `COUNT()`, `COUNT(1)` +pub fn count_all() -> Expr { + count(Expr::Literal(COUNT_STAR_EXPANSION)) +} + #[user_doc( doc_section(label = "General Functions"), description = "Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`.", @@ -139,6 +148,185 @@ impl AggregateUDFImpl for Count { "count" } + fn schema_name(&self, params: &AggregateFunctionParams) -> Result { + let AggregateFunctionParams { + args, + distinct, + filter, + order_by, + null_treatment, + } = params; + + let mut schema_name = String::new(); + + if is_count_wildcard(args) { + schema_name.write_str("count(*)")?; + } else { + schema_name.write_fmt(format_args!( + "{}({}{})", + self.name(), + if *distinct { "DISTINCT " } else { "" }, + schema_name_from_exprs(args)? + ))?; + } + + if let Some(null_treatment) = null_treatment { + schema_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if let Some(filter) = filter { + schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?; + }; + + if let Some(order_by) = order_by { + schema_name.write_fmt(format_args!( + " ORDER BY [{}]", + schema_name_from_sorts(order_by)? + ))?; + }; + + Ok(schema_name) + } + + fn window_function_schema_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; + + let mut schema_name = String::new(); + + if is_count_wildcard(args) { + schema_name.write_str("count(*)")?; + } else { + schema_name.write_fmt(format_args!( + "{}({})", + self.name(), + schema_name_from_exprs(args)? + ))?; + } + + if let Some(null_treatment) = null_treatment { + schema_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if !partition_by.is_empty() { + schema_name.write_fmt(format_args!( + " PARTITION BY [{}]", + schema_name_from_exprs(partition_by)? + ))?; + } + + if !order_by.is_empty() { + schema_name.write_fmt(format_args!( + " ORDER BY [{}]", + schema_name_from_sorts(order_by)? 
+ ))?; + }; + + schema_name.write_fmt(format_args!(" {window_frame}"))?; + + Ok(schema_name) + } + + fn display_name(&self, params: &AggregateFunctionParams) -> Result { + let AggregateFunctionParams { + args, + distinct, + filter, + order_by, + null_treatment, + } = params; + + let mut display_name = String::new(); + + if is_count_wildcard(args) { + display_name.write_str("count(*)")?; + } else { + display_name.write_fmt(format_args!( + "{}({}{})", + self.name(), + if *distinct { "DISTINCT " } else { "" }, + args.iter() + .map(|arg| format!("{arg}")) + .collect::>() + .join(", ") + ))?; + } + + if let Some(nt) = null_treatment { + display_name.write_fmt(format_args!(" {}", nt))?; + } + if let Some(fe) = filter { + display_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?; + } + if let Some(ob) = order_by { + display_name.write_fmt(format_args!( + " ORDER BY [{}]", + ob.iter() + .map(|o| format!("{o}")) + .collect::>() + .join(", ") + ))?; + } + + Ok(display_name) + } + + fn window_function_display_name( + &self, + params: &WindowFunctionParams, + ) -> Result { + let WindowFunctionParams { + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = params; + + let mut display_name = String::new(); + + if is_count_wildcard(args) { + display_name.write_str("count(*)")?; + } else { + display_name.write_fmt(format_args!( + "{}({})", + self.name(), + expr_vec_fmt!(args) + ))?; + } + + if let Some(null_treatment) = null_treatment { + display_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if !partition_by.is_empty() { + display_name.write_fmt(format_args!( + " PARTITION BY [{}]", + expr_vec_fmt!(partition_by) + ))?; + } + + if !order_by.is_empty() { + display_name + .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?; + }; + + display_name.write_fmt(format_args!( + " {} BETWEEN {} AND {}", + window_frame.units, window_frame.start_bound, window_frame.end_bound + ))?; + + Ok(display_name) + } + fn signature(&self) -> &Signature { &self.signature } @@ -359,6 +547,15 @@ impl AggregateUDFImpl for Count { } } +fn is_count_wildcard(args: &[Expr]) -> bool { + match args { + [] => true, // count() + // All const should be coerced to int64 or rejected by the signature + [Expr::Literal(ScalarValue::Int64(Some(_)))] => true, // count(1) + _ => false, // More than one argument or non-matching cases + } +} + #[derive(Debug)] struct CountAccumulator { count: i64, diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index f4bdb53efd55..a5c84298e9d5 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -64,28 +64,29 @@ pub mod macros; pub mod approx_distinct; +pub mod approx_median; +pub mod approx_percentile_cont; +pub mod approx_percentile_cont_with_weight; pub mod array_agg; +pub mod average; +pub mod bit_and_or_xor; +pub mod bool_and_or; pub mod correlation; pub mod count; pub mod covariance; pub mod first_last; +pub mod grouping; pub mod hyperloglog; pub mod median; pub mod min_max; +pub mod nth_value; pub mod regr; pub mod stddev; +pub mod string_agg; pub mod sum; pub mod variance; -pub mod approx_median; -pub mod approx_percentile_cont; -pub mod approx_percentile_cont_with_weight; -pub mod average; -pub mod bit_and_or_xor; -pub mod bool_and_or; -pub mod grouping; -pub mod nth_value; -pub mod string_agg; +pub mod planner; use crate::approx_percentile_cont::approx_percentile_cont_udaf; use 
crate::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight_udaf; diff --git a/datafusion/functions-aggregate/src/planner.rs b/datafusion/functions-aggregate/src/planner.rs new file mode 100644 index 000000000000..1f0a42c4c71b --- /dev/null +++ b/datafusion/functions-aggregate/src/planner.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! SQL planning extensions like [`AggregateFunctionPlanner`] + +use datafusion_common::Result; +use datafusion_expr::{ + expr::AggregateFunction, + lit, + planner::{ExprPlanner, PlannerResult, RawAggregateExpr}, + utils::COUNT_STAR_EXPANSION, + Expr, +}; + +#[derive(Debug)] +pub struct AggregateFunctionPlanner; + +impl ExprPlanner for AggregateFunctionPlanner { + fn plan_aggregate( + &self, + expr: RawAggregateExpr, + ) -> Result> { + if expr.func.name() == "count" + && (expr.args.len() == 1 && matches!(expr.args[0], Expr::Wildcard { .. }) + || expr.args.is_empty()) + { + let RawAggregateExpr { + func, + args: _, + distinct, + filter, + order_by, + null_treatment, + } = expr; + return Ok(PlannerResult::Planned(Expr::AggregateFunction( + AggregateFunction::new_udf( + func, + vec![lit(COUNT_STAR_EXPANSION)], + distinct, + filter, + order_by, + null_treatment, + ), + ))); + } + + Ok(PlannerResult::Original(expr)) + } +} diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs index 0d932bf84725..718b0bf1587b 100644 --- a/datafusion/functions-window/src/lib.rs +++ b/datafusion/functions-window/src/lib.rs @@ -45,6 +45,9 @@ pub mod nth_value; pub mod ntile; pub mod rank; pub mod row_number; + +pub mod planner; + mod utils; /// Fluent-style API for creating `Expr`s diff --git a/datafusion/functions-window/src/planner.rs b/datafusion/functions-window/src/planner.rs new file mode 100644 index 000000000000..8f48ca8b18dc --- /dev/null +++ b/datafusion/functions-window/src/planner.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
SQL planning extensions like [`WindowFunctionPlanner`] + +use datafusion_common::Result; +use datafusion_expr::{ + expr::WindowFunction, + lit, + planner::{ExprPlanner, PlannerResult, RawWindowExpr}, + utils::COUNT_STAR_EXPANSION, + Expr, ExprFunctionExt, +}; + +#[derive(Debug)] +pub struct WindowFunctionPlanner; + +impl ExprPlanner for WindowFunctionPlanner { + fn plan_window(&self, expr: RawWindowExpr) -> Result> { + if expr.func_def.name() == "count" + && (expr.args.len() == 1 && matches!(expr.args[0], Expr::Wildcard { .. }) + || expr.args.is_empty()) + { + let RawWindowExpr { + func_def, + args: _, + partition_by, + order_by, + window_frame, + null_treatment, + } = expr; + return Ok(PlannerResult::Planned( + Expr::WindowFunction(WindowFunction::new( + func_def, + vec![lit(COUNT_STAR_EXPANSION)], + )) + .partition_by(partition_by) + .order_by(order_by) + .window_frame(window_frame) + .null_treatment(null_treatment) + .build()?, + )); + } + + Ok(PlannerResult::Original(expr)) + } +} diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs deleted file mode 100644 index f517761b1e33..000000000000 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ /dev/null @@ -1,277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::analyzer::AnalyzerRule; - -use crate::utils::NamePreserver; -use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::Result; -use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams, WindowFunction}; -use datafusion_expr::utils::COUNT_STAR_EXPANSION; -use datafusion_expr::{lit, Expr, LogicalPlan, WindowFunctionDefinition}; - -/// Rewrite `Count(Expr::Wildcard)` to `Count(Expr::Literal)`. -/// -/// Resolves issue: -#[derive(Default, Debug)] -pub struct CountWildcardRule {} - -impl CountWildcardRule { - pub fn new() -> Self { - Self {} - } -} - -impl AnalyzerRule for CountWildcardRule { - fn analyze(&self, plan: LogicalPlan, _: &ConfigOptions) -> Result { - plan.transform_down_with_subqueries(analyze_internal).data() - } - - fn name(&self) -> &str { - "count_wildcard_rule" - } -} - -fn is_wildcard(expr: &Expr) -> bool { - matches!(expr, Expr::Wildcard { .. }) -} - -fn is_count_star_aggregate(aggregate_function: &AggregateFunction) -> bool { - matches!(aggregate_function, - AggregateFunction { - func, - params: AggregateFunctionParams { args, .. 
}, - } if func.name() == "count" && (args.len() == 1 && is_wildcard(&args[0]) || args.is_empty())) -} - -fn is_count_star_window_aggregate(window_function: &WindowFunction) -> bool { - let args = &window_function.params.args; - matches!(window_function.fun, - WindowFunctionDefinition::AggregateUDF(ref udaf) - if udaf.name() == "count" && (args.len() == 1 && is_wildcard(&args[0]) || args.is_empty())) -} - -fn analyze_internal(plan: LogicalPlan) -> Result> { - let name_preserver = NamePreserver::new(&plan); - plan.map_expressions(|expr| { - let original_name = name_preserver.save(&expr); - let transformed_expr = expr.transform_up(|expr| match expr { - Expr::WindowFunction(mut window_function) - if is_count_star_window_aggregate(&window_function) => - { - window_function.params.args = vec![lit(COUNT_STAR_EXPANSION)]; - Ok(Transformed::yes(Expr::WindowFunction(window_function))) - } - Expr::AggregateFunction(mut aggregate_function) - if is_count_star_aggregate(&aggregate_function) => - { - aggregate_function.params.args = vec![lit(COUNT_STAR_EXPANSION)]; - Ok(Transformed::yes(Expr::AggregateFunction( - aggregate_function, - ))) - } - _ => Ok(Transformed::no(expr)), - })?; - Ok(transformed_expr.update_data(|data| original_name.restore(data))) - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test::*; - use arrow::datatypes::DataType; - use datafusion_common::ScalarValue; - use datafusion_expr::expr::Sort; - use datafusion_expr::ExprFunctionExt; - use datafusion_expr::{ - col, exists, in_subquery, logical_plan::LogicalPlanBuilder, out_ref_col, - scalar_subquery, wildcard, WindowFrame, WindowFrameBound, WindowFrameUnits, - }; - use datafusion_functions_aggregate::count::count_udaf; - use datafusion_functions_aggregate::expr_fn::max; - use std::sync::Arc; - - use datafusion_functions_aggregate::expr_fn::{count, sum}; - - fn assert_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> { - assert_analyzed_plan_eq_display_indent( - Arc::new(CountWildcardRule::new()), - plan, - expected, - ) - } - - #[test] - fn test_count_wildcard_on_sort() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![col("b")], vec![count(wildcard())])? - .project(vec![count(wildcard())])? - .sort(vec![count(wildcard()).sort(true, false)])? - .build()?; - let expected = "Sort: count(*) ASC NULLS LAST [count(*):Int64]\ - \n Projection: count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[test.b]], aggr=[[count(Int64(1)) AS count(*)]] [b:UInt32, count(*):Int64]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_count_wildcard_on_where_in() -> Result<()> { - let table_scan_t1 = test_table_scan_with_name("t1")?; - let table_scan_t2 = test_table_scan_with_name("t2")?; - - let plan = LogicalPlanBuilder::from(table_scan_t1) - .filter(in_subquery( - col("a"), - Arc::new( - LogicalPlanBuilder::from(table_scan_t2) - .aggregate(Vec::::new(), vec![count(wildcard())])? - .project(vec![count(wildcard())])? - .build()?, - ), - ))? 
- .build()?; - - let expected = "Filter: t1.a IN () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [count(*):Int64]\ - \n Projection: count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] [count(*):Int64]\ - \n TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_count_wildcard_on_where_exists() -> Result<()> { - let table_scan_t1 = test_table_scan_with_name("t1")?; - let table_scan_t2 = test_table_scan_with_name("t2")?; - - let plan = LogicalPlanBuilder::from(table_scan_t1) - .filter(exists(Arc::new( - LogicalPlanBuilder::from(table_scan_t2) - .aggregate(Vec::::new(), vec![count(wildcard())])? - .project(vec![count(wildcard())])? - .build()?, - )))? - .build()?; - - let expected = "Filter: EXISTS () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [count(*):Int64]\ - \n Projection: count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] [count(*):Int64]\ - \n TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { - let table_scan_t1 = test_table_scan_with_name("t1")?; - let table_scan_t2 = test_table_scan_with_name("t2")?; - - let plan = LogicalPlanBuilder::from(table_scan_t1) - .filter( - scalar_subquery(Arc::new( - LogicalPlanBuilder::from(table_scan_t2) - .filter(out_ref_col(DataType::UInt32, "t1.a").eq(col("t2.a")))? - .aggregate( - Vec::::new(), - vec![count(lit(COUNT_STAR_EXPANSION))], - )? - .project(vec![count(lit(COUNT_STAR_EXPANSION))])? - .build()?, - )) - .gt(lit(ScalarValue::UInt8(Some(0)))), - )? - .project(vec![col("t1.a"), col("t1.b")])? - .build()?; - - let expected = "Projection: t1.a, t1.b [a:UInt32, b:UInt32]\ - \n Filter: () > UInt8(0) [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [count(Int64(1)):Int64]\ - \n Projection: count(Int64(1)) [count(Int64(1)):Int64]\ - \n Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] [count(Int64(1)):Int64]\ - \n Filter: outer_ref(t1.a) = t2.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - #[test] - fn test_count_wildcard_on_window() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .window(vec![Expr::WindowFunction(WindowFunction::new( - WindowFunctionDefinition::AggregateUDF(count_udaf()), - vec![wildcard()], - )) - .order_by(vec![Sort::new(col("a"), false, true)]) - .window_frame(WindowFrame::new_bounds( - WindowFrameUnits::Range, - WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))), - WindowFrameBound::Following(ScalarValue::UInt32(Some(2))), - )) - .build()?])? - .project(vec![count(wildcard())])? 
- .build()?; - - let expected = "Projection: count(Int64(1)) AS count(*) [count(*):Int64]\ - \n WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [test.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [test.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] [a:UInt32, b:UInt32, c:UInt32, count(*) ORDER BY [test.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING:Int64]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_count_wildcard_on_aggregate() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(Vec::::new(), vec![count(wildcard())])? - .project(vec![count(wildcard())])? - .build()?; - - let expected = "Projection: count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] [count(*):Int64]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_count_wildcard_on_non_count_aggregate() -> Result<()> { - let table_scan = test_table_scan()?; - let res = LogicalPlanBuilder::from(table_scan) - .aggregate(Vec::::new(), vec![sum(wildcard())]); - assert!(res.is_err()); - Ok(()) - } - - #[test] - fn test_count_wildcard_on_nesting() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(Vec::::new(), vec![max(count(wildcard()))])? - .project(vec![count(wildcard())])? - .build()?; - - let expected = "Projection: count(Int64(1)) AS count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[]], aggr=[[max(count(Int64(1))) AS max(count(*))]] [max(count(*)):Int64;N]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } -} diff --git a/datafusion/optimizer/src/analyzer/mod.rs b/datafusion/optimizer/src/analyzer/mod.rs index 9d0ac6b54cf4..c506616d142e 100644 --- a/datafusion/optimizer/src/analyzer/mod.rs +++ b/datafusion/optimizer/src/analyzer/mod.rs @@ -28,7 +28,6 @@ use datafusion_common::Result; use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::{InvariantLevel, LogicalPlan}; -use crate::analyzer::count_wildcard_rule::CountWildcardRule; use crate::analyzer::expand_wildcard_rule::ExpandWildcardRule; use crate::analyzer::inline_table_scan::InlineTableScan; use crate::analyzer::resolve_grouping_function::ResolveGroupingFunction; @@ -37,7 +36,6 @@ use crate::utils::log_plan; use self::function_rewrite::ApplyFunctionRewrites; -pub mod count_wildcard_rule; pub mod expand_wildcard_rule; pub mod function_rewrite; pub mod inline_table_scan; @@ -106,7 +104,6 @@ impl Analyzer { // [Expr::Wildcard] should be expanded before [TypeCoercion] Arc::new(ResolveGroupingFunction::new()), Arc::new(TypeCoercion::new()), - Arc::new(CountWildcardRule::new()), ]; Self::with_rules(rules) } diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index a33ecbc3a1fb..b59acd72a26d 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -23,11 +23,14 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use datafusion_common::config::ConfigOptions; use datafusion_common::{assert_contains, plan_err, Result, TableReference}; +use datafusion_expr::planner::ExprPlanner; use datafusion_expr::sqlparser::dialect::PostgreSqlDialect; use datafusion_expr::test::function_stub::sum_udaf; use 
datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF}; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; +use datafusion_functions_aggregate::planner::AggregateFunctionPlanner; +use datafusion_functions_window::planner::WindowFunctionPlanner; use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter; use datafusion_optimizer::analyzer::Analyzer; use datafusion_optimizer::optimizer::Optimizer; @@ -195,7 +198,7 @@ fn between_date32_plus_interval() -> Result<()> { WHERE col_date32 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]\ + "Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n Projection: \ \n Filter: test.col_date32 >= Date32(\"1998-03-18\") AND test.col_date32 <= Date32(\"1998-06-16\")\ \n TableScan: test projection=[col_date32]"; @@ -209,7 +212,7 @@ fn between_date64_plus_interval() -> Result<()> { WHERE col_date64 between '1998-03-18T00:00:00' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]\ + "Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n Projection: \ \n Filter: test.col_date64 >= Date64(\"1998-03-18\") AND test.col_date64 <= Date64(\"1998-06-16\")\ \n TableScan: test projection=[col_date64]"; @@ -266,7 +269,7 @@ fn push_down_filter_groupby_expr_contains_alias() { let sql = "SELECT * FROM (SELECT (col_int32 + col_uint32) AS c, count(*) FROM test GROUP BY 1) where c > 3"; let plan = test_sql(sql).unwrap(); let expected = "Projection: test.col_int32 + test.col_uint32 AS c, count(*)\ - \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[count(Int64(1)) AS count(*)]]\ + \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[count(*)]]\ \n Filter: test.col_int32 + CAST(test.col_uint32 AS Int32) > Int32(3)\ \n TableScan: test projection=[col_int32, col_uint32]"; assert_eq!(expected, format!("{plan}")); @@ -311,7 +314,7 @@ fn eliminate_redundant_null_check_on_count() { let plan = test_sql(sql).unwrap(); let expected = "\ Projection: test.col_int32, count(*) AS c\ - \n Aggregate: groupBy=[[test.col_int32]], aggr=[[count(Int64(1)) AS count(*)]]\ + \n Aggregate: groupBy=[[test.col_int32]], aggr=[[count(*)]]\ \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan}")); } @@ -422,7 +425,12 @@ fn test_sql(sql: &str) -> Result { let context_provider = MyContextProvider::default() .with_udaf(sum_udaf()) .with_udaf(count_udaf()) - .with_udaf(avg_udaf()); + .with_udaf(avg_udaf()) + .with_expr_planners(vec![ + Arc::new(AggregateFunctionPlanner), + Arc::new(WindowFunctionPlanner), + ]); + let sql_to_rel = SqlToRel::new(&context_provider); let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; @@ -440,6 +448,7 @@ fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} struct MyContextProvider { options: ConfigOptions, udafs: HashMap>, + expr_planners: Vec>, } impl MyContextProvider { @@ -448,6 +457,11 @@ impl MyContextProvider { self.udafs.insert(udaf.name().to_lowercase(), udaf); self } + + fn with_expr_planners(mut self, expr_planners: Vec>) -> Self { + self.expr_planners = expr_planners; + self + } } impl ContextProvider for MyContextProvider { @@ -516,6 +530,10 @@ impl ContextProvider for MyContextProvider { fn udwf_names(&self) -> Vec { Vec::new() } + + fn get_expr_planners(&self) -> 
&[Arc] { + &self.expr_planners + } } struct MyTableSource { diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 1cf3dcb289a6..035749a78941 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -23,7 +23,7 @@ use datafusion_common::{ DFSchema, Dependency, Result, }; use datafusion_expr::expr::{ScalarFunction, Unnest}; -use datafusion_expr::planner::PlannerResult; +use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}; use datafusion_expr::{ expr, qualified_wildcard, wildcard, Expr, ExprFunctionExt, ExprSchemable, WindowFrame, WindowFunctionDefinition, @@ -315,15 +315,38 @@ impl SqlToRel<'_, S> { }; if let Ok(fun) = self.find_window_func(&name) { - return Expr::WindowFunction(expr::WindowFunction::new( - fun, - self.function_args_to_expr(args, schema, planner_context)?, - )) - .partition_by(partition_by) - .order_by(order_by) - .window_frame(window_frame) - .null_treatment(null_treatment) - .build(); + let args = self.function_args_to_expr(args, schema, planner_context)?; + let mut window_expr = RawWindowExpr { + func_def: fun, + args, + partition_by, + order_by, + window_frame, + null_treatment, + }; + + for planner in self.context_provider.get_expr_planners().iter() { + match planner.plan_window(window_expr)? { + PlannerResult::Planned(expr) => return Ok(expr), + PlannerResult::Original(expr) => window_expr = expr, + } + } + + let RawWindowExpr { + func_def, + args, + partition_by, + order_by, + window_frame, + null_treatment, + } = window_expr; + + return Expr::WindowFunction(expr::WindowFunction::new(func_def, args)) + .partition_by(partition_by) + .order_by(order_by) + .window_frame(window_frame) + .null_treatment(null_treatment) + .build(); } } else { // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function @@ -341,8 +364,33 @@ impl SqlToRel<'_, S> { .map(|e| self.sql_expr_to_logical_expr(*e, schema, planner_context)) .transpose()? .map(Box::new); + + let mut aggregate_expr = RawAggregateExpr { + func: fm, + args, + distinct, + filter, + order_by, + null_treatment, + }; + for planner in self.context_provider.get_expr_planners().iter() { + match planner.plan_aggregate(aggregate_expr)? 
{ + PlannerResult::Planned(expr) => return Ok(expr), + PlannerResult::Original(expr) => aggregate_expr = expr, + } + } + + let RawAggregateExpr { + func, + args, + distinct, + filter, + order_by, + null_treatment, + } = aggregate_expr; + return Ok(Expr::AggregateFunction(expr::AggregateFunction::new_udf( - fm, + func, args, distinct, filter, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 1df18302687e..9c0d6316adb2 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -1461,14 +1461,14 @@ fn select_simple_aggregate_with_groupby_and_column_is_in_aggregate_and_groupby() fn select_simple_aggregate_with_groupby_can_use_positions() { quick_test( "SELECT state, age AS b, count(1) FROM person GROUP BY 1, 2", - "Projection: person.state, person.age AS b, count(Int64(1))\ - \n Aggregate: groupBy=[[person.state, person.age]], aggr=[[count(Int64(1))]]\ + "Projection: person.state, person.age AS b, count(*)\ + \n Aggregate: groupBy=[[person.state, person.age]], aggr=[[count(*)]]\ \n TableScan: person", ); quick_test( "SELECT state, age AS b, count(1) FROM person GROUP BY 2, 1", - "Projection: person.state, person.age AS b, count(Int64(1))\ - \n Aggregate: groupBy=[[person.age, person.state]], aggr=[[count(Int64(1))]]\ + "Projection: person.state, person.age AS b, count(*)\ + \n Aggregate: groupBy=[[person.age, person.state]], aggr=[[count(*)]]\ \n TableScan: person", ); } @@ -1630,8 +1630,8 @@ fn test_wildcard() { #[test] fn select_count_one() { let sql = "SELECT count(1) FROM person"; - let expected = "Projection: count(Int64(1))\ - \n Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]\ + let expected = "Projection: count(*)\ + \n Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n TableScan: person"; quick_test(sql, expected); } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 7caa81d64e5b..f175973f92a1 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6276,6 +6276,26 @@ physical_plan 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true +# test count(null) case (null with type) + +statement count 0 +create table t(a int, b int) as values (1, 3), (2, 4), (3, 5); + +query I +select count(null::bigint) from t; +---- +0 + +query TT +explain select count(null::bigint) from t; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[count(Int64(NULL)) AS count(NULL)]] +02)--TableScan: t projection=[] +physical_plan +01)AggregateExec: mode=Single, gby=[], aggr=[count(NULL)] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + ####### # Group median test ####### diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 80bf0bc2dd5a..20179e0c5bdc 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -243,7 +243,7 @@ query TT EXPLAIN SELECT count(*) from alltypes_plain ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--TableScan: alltypes_plain projection=[] physical_plan 01)AggregateExec: mode=Final, gby=[], aggr=[count(*)] diff --git a/datafusion/sqllogictest/test_files/coalesce.slt 
b/datafusion/sqllogictest/test_files/coalesce.slt index 5f2d2f0d1da9..e7cf31dc690b 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -442,4 +442,4 @@ drop table test query T select coalesce(arrow_cast('', 'Utf8View'), arrow_cast('', 'Dictionary(UInt32, Utf8)')); ---- -(empty) \ No newline at end of file +(empty) diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 7dd85b3ae2d8..f39ff56ce449 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -631,4 +631,3 @@ COPY source_table to '/tmp/table.parquet' (row_group_size 55 + 102); # Copy using execution.keep_partition_by_columns with an invalid value query error DataFusion error: Invalid or Unsupported Configuration: provided value for 'execution.keep_partition_by_columns' was not recognized: "invalid_value" COPY source_table to '/tmp/table.parquet' OPTIONS (execution.keep_partition_by_columns invalid_value); - diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt index d660257b609d..0efd9e99889f 100644 --- a/datafusion/sqllogictest/test_files/count_star_rule.slt +++ b/datafusion/sqllogictest/test_files/count_star_rule.slt @@ -31,44 +31,44 @@ query TT EXPLAIN SELECT COUNT() FROM (SELECT 1 AS a, 2 AS b) AS t; ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count()]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--SubqueryAlias: t 03)----EmptyRelation physical_plan -01)ProjectionExec: expr=[1 as count()] +01)ProjectionExec: expr=[1 as count(*)] 02)--PlaceholderRowExec query TT EXPLAIN SELECT t1.a, COUNT() FROM t1 GROUP BY t1.a; ---- logical_plan -01)Aggregate: groupBy=[[t1.a]], aggr=[[count(Int64(1)) AS count()]] +01)Aggregate: groupBy=[[t1.a]], aggr=[[count(*)]] 02)--TableScan: t1 projection=[a] physical_plan -01)AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count()] +01)AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count()] +05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] 06)----------DataSourceExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT t1.a, COUNT() AS cnt FROM t1 GROUP BY t1.a HAVING COUNT() > 0; ---- logical_plan -01)Projection: t1.a, count() AS cnt -02)--Filter: count() > Int64(0) -03)----Aggregate: groupBy=[[t1.a]], aggr=[[count(Int64(1)) AS count()]] +01)Projection: t1.a, count(*) AS cnt +02)--Filter: count(*) > Int64(0) +03)----Aggregate: groupBy=[[t1.a]], aggr=[[count(*)]] 04)------TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[a@0 as a, count()@1 as cnt] +01)ProjectionExec: expr=[a@0 as a, count(*)@1 as cnt] 02)--CoalesceBatchesExec: target_batch_size=8192 -03)----FilterExec: count()@1 > 0 -04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count()] +03)----FilterExec: count(*)@1 > 0 +04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 
-08)--------------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count()] +08)--------------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] 09)----------------DataSourceExec: partitions=1, partition_sizes=[1] query II @@ -80,12 +80,12 @@ query TT EXPLAIN SELECT a, COUNT() OVER (PARTITION BY a) AS count_a FROM t1; ---- logical_plan -01)Projection: t1.a, count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a -02)--WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +01)Projection: t1.a, count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a +02)--WindowAggr: windowExpr=[[count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] 03)----TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[a@0 as a, count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a] -02)--WindowAggExec: wdw=[count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +01)ProjectionExec: expr=[a@0 as a, count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a] +02)--WindowAggExec: wdw=[count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index aefc2672b539..6f75a7d7f8fd 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -827,4 +827,3 @@ drop table table_with_pk; statement ok set datafusion.catalog.information_schema = false; - diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index a35a4d6f28dc..dc7a53adf889 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -184,4 +184,4 @@ query error DataFusion error: Schema error: No field named ammp\. 
Did you mean ' select ammp from a; statement ok -drop table a; \ No newline at end of file +drop table a; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 037565ce05f9..0d5eab6cf56d 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -178,7 +178,6 @@ logical_plan after inline_table_scan SAME TEXT AS ABOVE logical_plan after expand_wildcard_rule SAME TEXT AS ABOVE logical_plan after resolve_grouping_function SAME TEXT AS ABOVE logical_plan after type_coercion SAME TEXT AS ABOVE -logical_plan after count_wildcard_rule SAME TEXT AS ABOVE analyzed_logical_plan SAME TEXT AS ABOVE logical_plan after eliminate_nested_union SAME TEXT AS ABOVE logical_plan after simplify_expressions SAME TEXT AS ABOVE @@ -427,7 +426,7 @@ logical_plan 02)--TableScan: t1 projection=[a] 03)--SubqueryAlias: __correlated_sq_1 04)----Projection: -05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +05)------Aggregate: groupBy=[[]], aggr=[[count(*)]] 06)--------TableScan: t2 projection=[] physical_plan 01)NestedLoopJoinExec: join_type=LeftSemi diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index ee76ee1c5511..32428fdef765 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -61,7 +61,7 @@ logical_plan 02)--Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, aggregate_test_100.c1 -05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 06)----------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan 01)DataSinkExec: sink=MemoryTable (partitions=1) @@ -122,7 +122,7 @@ FROM aggregate_test_100 logical_plan 01)Dml: op=[Insert Into] table=[table_without_values] 02)--Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING 
AS field2 -03)----WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +03)----WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 04)------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan 01)DataSinkExec: sink=MemoryTable (partitions=1) @@ -172,7 +172,7 @@ logical_plan 02)--Projection: a1 AS a1, a2 AS a2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 -05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 06)----------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan 01)DataSinkExec: sink=MemoryTable (partitions=8) diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index ee1d67c5e26d..752e8ce0e4ff 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -352,7 +352,7 @@ logical_plan 02)--Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, aggregate_test_100.c1 -05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY 
[aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 06)----------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan 01)DataSinkExec: sink=ParquetSink(file_groups=[]) @@ -414,7 +414,7 @@ FROM aggregate_test_100 logical_plan 01)Dml: op=[Insert Into] table=[table_without_values] 02)--Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS field2 -03)----WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +03)----WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 04)------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan 01)DataSinkExec: sink=ParquetSink(file_groups=[]) diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 5d311bc43293..5b5368f6b0f4 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1396,7 +1396,7 @@ group by t1_id ---- logical_plan 01)Projection: count(*) -02)--Aggregate: groupBy=[[join_t1.t1_id]], aggr=[[count(Int64(1)) AS count(*)]] +02)--Aggregate: groupBy=[[join_t1.t1_id]], aggr=[[count(*)]] 03)----Projection: join_t1.t1_id 04)------Inner Join: join_t1.t1_id = join_t2.t2_id 05)--------TableScan: join_t1 projection=[t1_id] @@ -4442,7 +4442,7 @@ FROM my_catalog.my_schema.table_with_many_types AS l JOIN my_catalog.my_schema.table_with_many_types AS r ON l.binary_col = r.binary_col ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--Projection: 03)----Inner Join: l.binary_col = r.binary_col 04)------SubqueryAlias: l diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index dd310f7f2bf6..466bba556697 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -54,7 +54,7 @@ query TT EXPLAIN SELECT count(*) from json_test ---- logical_plan -01)Aggregate: 
groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--TableScan: json_test projection=[] physical_plan 01)AggregateExec: mode=Final, gby=[], aggr=[count(*)] diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 4e74b27b875f..b4487be850ac 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -307,7 +307,7 @@ query TT EXPLAIN SELECT COUNT(*) FROM (SELECT a FROM t1 LIMIT 3 OFFSET 11); ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--Limit: skip=11, fetch=3 03)----TableScan: t1 projection=[], fetch=14 physical_plan @@ -325,7 +325,7 @@ query TT EXPLAIN SELECT COUNT(*) FROM (SELECT a FROM t1 LIMIT 3 OFFSET 8); ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--Limit: skip=8, fetch=3 03)----TableScan: t1 projection=[], fetch=11 physical_plan @@ -343,7 +343,7 @@ query TT EXPLAIN SELECT COUNT(*) FROM (SELECT a FROM t1 OFFSET 8); ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--Limit: skip=8, fetch=None 03)----TableScan: t1 projection=[] physical_plan @@ -360,7 +360,7 @@ query TT EXPLAIN SELECT COUNT(*) FROM (SELECT a FROM t1 WHERE a > 3 LIMIT 3 OFFSET 6); ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[]], aggr=[[count(*)]] 02)--Projection: 03)----Limit: skip=6, fetch=3 04)------Filter: t1.a > Int32(3) diff --git a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt index de6a153f58d9..8c87af75ed16 100644 --- a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt +++ b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt @@ -48,8 +48,8 @@ FROM test_table t GROUP BY 1, 2, 3, 4 ---- logical_plan -01)Projection: t.c1, Int64(99999), t.c5 + t.c8, Utf8("test"), count(Int64(1)) -02)--Aggregate: groupBy=[[t.c1, t.c5 + t.c8]], aggr=[[count(Int64(1))]] +01)Projection: t.c1, Int64(99999), t.c5 + t.c8, Utf8("test"), count(*) +02)--Aggregate: groupBy=[[t.c1, t.c5 + t.c8]], aggr=[[count(*)]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[c1, c5, c8] @@ -60,8 +60,8 @@ FROM test_table t group by 1, 2, 3 ---- logical_plan -01)Projection: Int64(123), Int64(456), Int64(789), count(Int64(1)), avg(t.c12) -02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1)), avg(t.c12)]] +01)Projection: Int64(123), Int64(456), Int64(789), count(*), avg(t.c12) +02)--Aggregate: groupBy=[[]], aggr=[[count(*), avg(t.c12)]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[c12] @@ -72,8 +72,8 @@ FROM test_table t GROUP BY 1, 2 ---- logical_plan -01)Projection: Date32("2023-05-04") AS dt, Boolean(true) AS today_filter, count(Int64(1)) -02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] +01)Projection: Date32("2023-05-04") AS dt, Boolean(true) AS today_filter, count(*) +02)--Aggregate: groupBy=[[]], aggr=[[count(*)]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[] @@ -90,8 +90,8 @@ FROM test_table t GROUP BY 1 ---- logical_plan -01)Projection: Boolean(true) AS NOT date_part(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(Int64(1)) -02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 
+01)Projection: Boolean(true) AS NOT date_part(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(*) +02)--Aggregate: groupBy=[[]], aggr=[[count(*)]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[] diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index e12bdca37e6f..dcd373546d79 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1541,7 +1541,7 @@ LIMIT 4) GROUP BY c2; ---- logical_plan -01)Aggregate: groupBy=[[aggregate_test_100.c2]], aggr=[[count(Int64(1)) AS count(*)]] +01)Aggregate: groupBy=[[aggregate_test_100.c2]], aggr=[[count(*)]] 02)--Projection: aggregate_test_100.c2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST, fetch=4 04)------Projection: aggregate_test_100.c2, aggregate_test_100.c1 diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 264392fc1017..c847f433f7fc 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -555,7 +555,7 @@ logical_plan 03)----Subquery: 04)------Projection: count(*) 05)--------Filter: sum(outer_ref(t1.t1_int) + t2.t2_id) > Int64(0) -06)----------Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*), sum(CAST(outer_ref(t1.t1_int) + t2.t2_id AS Int64))]] +06)----------Aggregate: groupBy=[[]], aggr=[[count(*), sum(CAST(outer_ref(t1.t1_int) + t2.t2_id AS Int64))]] 07)------------Filter: outer_ref(t1.t1_name) = t2.t2_name 08)--------------TableScan: t2 09)----TableScan: t1 projection=[t1_id, t1_name, t1_int] @@ -738,7 +738,7 @@ explain select (select count(*) from t1) as b logical_plan 01)Projection: __scalar_sq_1.count(*) AS b 02)--SubqueryAlias: __scalar_sq_1 -03)----Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[]], aggr=[[count(*)]] 04)------TableScan: t1 projection=[] #simple_uncorrelated_scalar_subquery2 @@ -746,13 +746,13 @@ query TT explain select (select count(*) from t1) as b, (select count(1) from t2) ---- logical_plan -01)Projection: __scalar_sq_1.count(*) AS b, __scalar_sq_2.count(Int64(1)) AS count(Int64(1)) +01)Projection: __scalar_sq_1.count(*) AS b, __scalar_sq_2.count(*) AS count(*) 02)--Left Join: 03)----SubqueryAlias: __scalar_sq_1 -04)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +04)------Aggregate: groupBy=[[]], aggr=[[count(*)]] 05)--------TableScan: t1 projection=[] 06)----SubqueryAlias: __scalar_sq_2 -07)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] +07)------Aggregate: groupBy=[[]], aggr=[[count(*)]] 08)--------TableScan: t2 projection=[] statement ok @@ -762,20 +762,20 @@ query TT explain select (select count(*) from t1) as b, (select count(1) from t2) ---- logical_plan -01)Projection: __scalar_sq_1.count(*) AS b, __scalar_sq_2.count(Int64(1)) AS count(Int64(1)) +01)Projection: __scalar_sq_1.count(*) AS b, __scalar_sq_2.count(*) AS count(*) 02)--Left Join: 03)----SubqueryAlias: __scalar_sq_1 -04)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +04)------Aggregate: groupBy=[[]], aggr=[[count(*)]] 05)--------TableScan: t1 projection=[] 06)----SubqueryAlias: __scalar_sq_2 -07)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] +07)------Aggregate: groupBy=[[]], aggr=[[count(*)]] 08)--------TableScan: t2 projection=[] physical_plan -01)ProjectionExec: expr=[count(*)@0 as b, count(Int64(1))@1 as count(Int64(1))] 
+01)ProjectionExec: expr=[count(*)@0 as b, count(*)@1 as count(*)] 02)--NestedLoopJoinExec: join_type=Left 03)----ProjectionExec: expr=[4 as count(*)] 04)------PlaceholderRowExec -05)----ProjectionExec: expr=[4 as count(Int64(1))] +05)----ProjectionExec: expr=[4 as count(*)] 06)------PlaceholderRowExec statement ok @@ -796,7 +796,7 @@ logical_plan 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true -06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 07)----------TableScan: t2 projection=[t2_int] query II rowsort @@ -818,7 +818,7 @@ logical_plan 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true -06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 07)----------TableScan: t2 projection=[t2_int] query II rowsort @@ -839,7 +839,7 @@ logical_plan 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) AS _cnt, t2.t2_int, Boolean(true) AS __always_true -06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 07)----------TableScan: t2 projection=[t2_int] query II rowsort @@ -860,7 +860,7 @@ logical_plan 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) + Int64(2) AS _cnt, t2.t2_int, Boolean(true) AS __always_true -06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 07)----------TableScan: t2 projection=[t2_int] query II rowsort @@ -883,7 +883,7 @@ logical_plan 05)--------TableScan: t1 projection=[t1_id, t1_int] 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*), t2.t2_id, Boolean(true) AS __always_true -08)------------Aggregate: groupBy=[[t2.t2_id]], aggr=[[count(Int64(1)) AS count(*)]] +08)------------Aggregate: groupBy=[[t2.t2_id]], aggr=[[count(*)]] 09)--------------TableScan: t2 projection=[t2_id] query I rowsort @@ -905,7 +905,7 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) + Int64(2) AS cnt_plus_2, t2.t2_int 06)--------Filter: count(*) > Int64(1) -07)----------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +07)----------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 08)------------TableScan: t2 projection=[t2_int] query II rowsort @@ -927,7 +927,7 @@ logical_plan 03)----TableScan: t1 projection=[t1_id, t1_int] 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) + Int64(2) AS cnt_plus_2, t2.t2_int, count(*), Boolean(true) AS __always_true -06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 07)----------TableScan: t2 projection=[t2_int] query II rowsort @@ -951,7 +951,7 @@ logical_plan 06)----------TableScan: t1 projection=[t1_int] 07)--------SubqueryAlias: __scalar_sq_1 08)----------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true -09)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +09)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 
10)--------------TableScan: t2 projection=[t2_int] query I rowsort @@ -972,7 +972,7 @@ logical_plan 05)--------TableScan: t1 projection=[t1_int] 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*) AS cnt, t2.t2_int, Boolean(true) AS __always_true -08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 09)--------------TableScan: t2 projection=[t2_int] @@ -1002,7 +1002,7 @@ logical_plan 05)--------TableScan: t1 projection=[t1_int] 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*) + Int64(1) + Int64(1) AS cnt_plus_two, t2.t2_int, count(*), Boolean(true) AS __always_true -08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 09)--------------TableScan: t2 projection=[t2_int] query I rowsort @@ -1031,7 +1031,7 @@ logical_plan 05)--------TableScan: t1 projection=[t1_int] 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: CASE WHEN count(*) = Int64(1) THEN Int64(NULL) ELSE count(*) END AS cnt, t2.t2_int, Boolean(true) AS __always_true -08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] +08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(*)]] 09)--------------TableScan: t2 projection=[t2_int] diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part index 2616b7b75b30..6a41ecb51bf4 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part @@ -42,7 +42,7 @@ explain select logical_plan 01)Sort: lineitem.l_returnflag ASC NULLS LAST, lineitem.l_linestatus ASC NULLS LAST 02)--Projection: lineitem.l_returnflag, lineitem.l_linestatus, sum(lineitem.l_quantity) AS sum_qty, sum(lineitem.l_extendedprice) AS sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax) AS sum_charge, avg(lineitem.l_quantity) AS avg_qty, avg(lineitem.l_extendedprice) AS avg_price, avg(lineitem.l_discount) AS avg_disc, count(*) AS count_order -03)----Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(__common_expr_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(__common_expr_1 * (Decimal128(Some(1),20,0) + lineitem.l_tax)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(__common_expr_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(__common_expr_1 * (Decimal128(Some(1),20,0) + lineitem.l_tax)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)]] 04)------Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __common_expr_1, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_tax, lineitem.l_returnflag, lineitem.l_linestatus 
05)--------Filter: lineitem.l_shipdate <= Date32("1998-09-02") 06)----------TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], partial_filters=[lineitem.l_shipdate <= Date32("1998-09-02")] diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part index eb41445c3c13..68532733c661 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part @@ -42,7 +42,7 @@ limit 10; logical_plan 01)Sort: custdist DESC NULLS FIRST, c_orders.c_count DESC NULLS FIRST, fetch=10 02)--Projection: c_orders.c_count, count(*) AS custdist -03)----Aggregate: groupBy=[[c_orders.c_count]], aggr=[[count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[c_orders.c_count]], aggr=[[count(*)]] 04)------SubqueryAlias: c_orders 05)--------Projection: count(orders.o_orderkey) AS c_count 06)----------Aggregate: groupBy=[[customer.c_custkey]], aggr=[[count(orders.o_orderkey)]] diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part index 9e39732689da..eb10f4c8d195 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part @@ -60,7 +60,7 @@ order by logical_plan 01)Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST 02)--Projection: supplier.s_name, count(*) AS numwait -03)----Aggregate: groupBy=[[supplier.s_name]], aggr=[[count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[supplier.s_name]], aggr=[[count(*)]] 04)------Projection: supplier.s_name 05)--------LeftAnti Join: l1.l_orderkey = __correlated_sq_2.l_orderkey Filter: __correlated_sq_2.l_suppkey != l1.l_suppkey 06)----------LeftSemi Join: l1.l_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_suppkey != l1.l_suppkey diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part index 9ad99361256c..af8b7948c1cf 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part @@ -58,7 +58,7 @@ order by logical_plan 01)Sort: custsale.cntrycode ASC NULLS LAST 02)--Projection: custsale.cntrycode, count(*) AS numcust, sum(custsale.c_acctbal) AS totacctbal -03)----Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[count(Int64(1)) AS count(*), sum(custsale.c_acctbal)]] +03)----Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[count(*), sum(custsale.c_acctbal)]] 04)------SubqueryAlias: custsale 05)--------Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal 06)----------Inner Join: Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_2.avg(customer.c_acctbal) diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part index fb93850ab095..766b21c22f24 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part @@ -42,7 +42,7 @@ order by logical_plan 01)Sort: orders.o_orderpriority ASC NULLS LAST 02)--Projection: orders.o_orderpriority, count(*) AS order_count -03)----Aggregate: groupBy=[[orders.o_orderpriority]], aggr=[[count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[orders.o_orderpriority]], aggr=[[count(*)]] 04)------Projection: 
orders.o_orderpriority 05)--------LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey 06)----------Projection: orders.o_orderkey, orders.o_orderpriority diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index dfac9c031074..57207f00f7ab 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -449,7 +449,7 @@ SELECT count(*) FROM ( ---- logical_plan 01)Projection: count(*) -02)--Aggregate: groupBy=[[t1.name]], aggr=[[count(Int64(1)) AS count(*)]] +02)--Aggregate: groupBy=[[t1.name]], aggr=[[count(*)]] 03)----Union 04)------Aggregate: groupBy=[[t1.name]], aggr=[[]] 05)--------TableScan: t1 projection=[name] @@ -493,7 +493,7 @@ logical_plan 02)--Union 03)----Projection: count(*) AS cnt 04)------Limit: skip=0, fetch=3 -05)--------Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +05)--------Aggregate: groupBy=[[]], aggr=[[count(*)]] 06)----------SubqueryAlias: a 07)------------Projection: 08)--------------Aggregate: groupBy=[[aggregate_test_100.c1]], aggr=[[]] @@ -651,7 +651,7 @@ select x, y from (select 1 as x , max(10) as y) b logical_plan 01)Union 02)--Projection: count(*) AS count, a.n -03)----Aggregate: groupBy=[[a.n]], aggr=[[count(Int64(1)) AS count(*)]] +03)----Aggregate: groupBy=[[a.n]], aggr=[[count(*)]] 04)------SubqueryAlias: a 05)--------Projection: Int64(5) AS n 06)----------EmptyRelation diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index ca4713e7d516..6c00af879e76 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -1305,7 +1305,7 @@ EXPLAIN SELECT ---- logical_plan 01)Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING -02)--WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] +02)--WindowAggr: windowExpr=[[count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 03)----Projection: aggregate_test_100.c1, aggregate_test_100.c2, sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING 04)------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 05)--------TableScan: aggregate_test_100 projection=[c1, c2, c4] @@ -1765,7 +1765,7 @@ EXPLAIN SELECT count(*) as global_count FROM ---- logical_plan 01)Projection: count(*) AS global_count -02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] +02)--Aggregate: groupBy=[[]], aggr=[[count(*)]] 03)----SubqueryAlias: a 04)------Projection: 05)--------Aggregate: groupBy=[[aggregate_test_100.c1]], aggr=[[]] @@ -2571,10 +2571,10 @@ logical_plan 01)Projection: sum1, sum2, sum3, min1, min2, min3, max1, max2, 
max3, cnt1, cnt2, sumr1, sumr2, sumr3, minr1, minr2, minr3, maxr1, maxr2, maxr3, cntr1, cntr2, sum4, cnt3 02)--Sort: annotated_data_finite.inc_col DESC NULLS FIRST, fetch=5 03)----Projection: sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS sum1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING AS sum2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING AS sum3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS min1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING AS min2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING AS min3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS max1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING AS max2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING AS max3, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING AS cnt1, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS cnt2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING AS sumr1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING AS sumr2, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING AS sumr3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS minr1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING AS minr2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING AS minr3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS maxr1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING AS maxr2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING AS maxr3, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS cntr1, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS cntr2, sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS sum4, count(*) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS cnt3, annotated_data_finite.inc_col -04)------WindowAggr: windowExpr=[[sum(__common_expr_1 AS annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS count(*) ROWS BETWEEN 8 PRECEDING AND 1 
FOLLOWING]] +04)------WindowAggr: windowExpr=[[sum(__common_expr_1 AS annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(*) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]] 05)--------Projection: __common_expr_1, annotated_data_finite.inc_col, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING -06)----------WindowAggr: windowExpr=[[sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) 
ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING AS count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]] -07)------------WindowAggr: windowExpr=[[sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING AS count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]] +06)----------WindowAggr: windowExpr=[[sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]] +07)------------WindowAggr: windowExpr=[[sum(__common_expr_2 AS annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(__common_expr_1 AS annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(*) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]] 08)--------------Projection: CAST(annotated_data_finite.desc_col AS Int64) AS __common_expr_1, CAST(annotated_data_finite.inc_col AS Int64) AS __common_expr_2, annotated_data_finite.ts, annotated_data_finite.inc_col, annotated_data_finite.desc_col 09)----------------TableScan: annotated_data_finite projection=[ts, inc_col, desc_col] physical_plan @@ -4112,7 +4112,7 @@ EXPLAIN select count(*) over (partition by a order by a) from (select * from a w ---- logical_plan 01)Projection: count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW -02)--WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +02)--WindowAggr: windowExpr=[[count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] 03)----Filter: a.a = Int64(1) 04)------TableScan: a projection=[a] physical_plan diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 219f656bb471..086c0858115b 100644 --- 
a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -50,9 +50,9 @@ mod tests { let plan_str = tpch_plan_to_string(1).await?; assert_eq!( plan_str, - "Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, sum(LINEITEM.L_QUANTITY) AS SUM_QTY, sum(LINEITEM.L_EXTENDEDPRICE) AS SUM_BASE_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS SUM_DISC_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX) AS SUM_CHARGE, avg(LINEITEM.L_QUANTITY) AS AVG_QTY, avg(LINEITEM.L_EXTENDEDPRICE) AS AVG_PRICE, avg(LINEITEM.L_DISCOUNT) AS AVG_DISC, count(Int64(1)) AS COUNT_ORDER\ + "Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, sum(LINEITEM.L_QUANTITY) AS SUM_QTY, sum(LINEITEM.L_EXTENDEDPRICE) AS SUM_BASE_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS SUM_DISC_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX) AS SUM_CHARGE, avg(LINEITEM.L_QUANTITY) AS AVG_QTY, avg(LINEITEM.L_EXTENDEDPRICE) AS AVG_PRICE, avg(LINEITEM.L_DISCOUNT) AS AVG_DISC, count(*) AS COUNT_ORDER\ \n Sort: LINEITEM.L_RETURNFLAG ASC NULLS LAST, LINEITEM.L_LINESTATUS ASC NULLS LAST\ - \n Aggregate: groupBy=[[LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS]], aggr=[[sum(LINEITEM.L_QUANTITY), sum(LINEITEM.L_EXTENDEDPRICE), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX), avg(LINEITEM.L_QUANTITY), avg(LINEITEM.L_EXTENDEDPRICE), avg(LINEITEM.L_DISCOUNT), count(Int64(1))]]\ + \n Aggregate: groupBy=[[LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS]], aggr=[[sum(LINEITEM.L_QUANTITY), sum(LINEITEM.L_EXTENDEDPRICE), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX), avg(LINEITEM.L_QUANTITY), avg(LINEITEM.L_EXTENDEDPRICE), avg(LINEITEM.L_DISCOUNT), count(*)]]\ \n Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, LINEITEM.L_QUANTITY, LINEITEM.L_EXTENDEDPRICE, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT), LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT) * (CAST(Int32(1) AS Decimal128(15, 2)) + LINEITEM.L_TAX), LINEITEM.L_DISCOUNT\ \n Filter: LINEITEM.L_SHIPDATE <= Date32(\"1998-12-01\") - IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 10368000 }\")\ \n TableScan: LINEITEM" @@ -119,9 +119,9 @@ mod tests { let plan_str = tpch_plan_to_string(4).await?; assert_eq!( plan_str, - "Projection: ORDERS.O_ORDERPRIORITY, count(Int64(1)) AS ORDER_COUNT\ + "Projection: ORDERS.O_ORDERPRIORITY, count(*) AS ORDER_COUNT\ \n Sort: ORDERS.O_ORDERPRIORITY ASC NULLS LAST\ - \n Aggregate: groupBy=[[ORDERS.O_ORDERPRIORITY]], aggr=[[count(Int64(1))]]\ + \n Aggregate: groupBy=[[ORDERS.O_ORDERPRIORITY]], aggr=[[count(*)]]\ \n Projection: ORDERS.O_ORDERPRIORITY\ \n Filter: ORDERS.O_ORDERDATE >= CAST(Utf8(\"1993-07-01\") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8(\"1993-10-01\") AS Date32) AND EXISTS ()\ \n Subquery:\ @@ -269,10 +269,10 @@ mod tests { let plan_str = tpch_plan_to_string(13).await?; assert_eq!( plan_str, - "Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count(Int64(1)) AS CUSTDIST\ - \n Sort: count(Int64(1)) DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST\ - \n Projection: count(ORDERS.O_ORDERKEY), count(Int64(1))\ - \n Aggregate: 
groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count(Int64(1))]]\ + "Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count(*) AS CUSTDIST\ + \n Sort: count(*) DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST\ + \n Projection: count(ORDERS.O_ORDERKEY), count(*)\ + \n Aggregate: groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count(*)]]\ \n Projection: count(ORDERS.O_ORDERKEY)\ \n Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY]], aggr=[[count(ORDERS.O_ORDERKEY)]]\ \n Projection: CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY\ @@ -410,10 +410,10 @@ mod tests { let plan_str = tpch_plan_to_string(21).await?; assert_eq!( plan_str, - "Projection: SUPPLIER.S_NAME, count(Int64(1)) AS NUMWAIT\ + "Projection: SUPPLIER.S_NAME, count(*) AS NUMWAIT\ \n Limit: skip=0, fetch=100\ - \n Sort: count(Int64(1)) DESC NULLS FIRST, SUPPLIER.S_NAME ASC NULLS LAST\ - \n Aggregate: groupBy=[[SUPPLIER.S_NAME]], aggr=[[count(Int64(1))]]\ + \n Sort: count(*) DESC NULLS FIRST, SUPPLIER.S_NAME ASC NULLS LAST\ + \n Aggregate: groupBy=[[SUPPLIER.S_NAME]], aggr=[[count(*)]]\ \n Projection: SUPPLIER.S_NAME\ \n Filter: SUPPLIER.S_SUPPKEY = LINEITEM.L_SUPPKEY AND ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY AND ORDERS.O_ORDERSTATUS = Utf8(\"F\") AND LINEITEM.L_RECEIPTDATE > LINEITEM.L_COMMITDATE AND EXISTS () AND NOT EXISTS () AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8(\"SAUDI ARABIA\")\ \n Subquery:\ @@ -438,9 +438,9 @@ mod tests { let plan_str = tpch_plan_to_string(22).await?; assert_eq!( plan_str, - "Projection: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) AS CNTRYCODE, count(Int64(1)) AS NUMCUST, sum(CUSTOMER.C_ACCTBAL) AS TOTACCTBAL\ + "Projection: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) AS CNTRYCODE, count(*) AS NUMCUST, sum(CUSTOMER.C_ACCTBAL) AS TOTACCTBAL\ \n Sort: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) ASC NULLS LAST\ - \n Aggregate: groupBy=[[substr(CUSTOMER.C_PHONE,Int32(1),Int32(2))]], aggr=[[count(Int64(1)), sum(CUSTOMER.C_ACCTBAL)]]\ + \n Aggregate: groupBy=[[substr(CUSTOMER.C_PHONE,Int32(1),Int32(2))]], aggr=[[count(*), sum(CUSTOMER.C_ACCTBAL)]]\ \n Projection: substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)), CUSTOMER.C_ACCTBAL\ \n Filter: (substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"13\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"31\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"23\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"29\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"30\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"18\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"17\") AS Utf8)) AND CUSTOMER.C_ACCTBAL > () AND NOT EXISTS ()\ \n Subquery:\ diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 5fb357dfcd23..68856117a38c 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -687,7 +687,7 @@ async fn simple_intersect() -> Result<()> { // Substrait treats both count(*) and count(1) the same assert_expected_plan( "SELECT count(*) FROM (SELECT data.a FROM data INTERSECT SELECT data2.a FROM data2);", - "Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]\ + "Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n Projection: \ \n LeftSemi Join: data.a = data2.a\ \n Aggregate: groupBy=[[data.a]], aggr=[[]]\ @@ -822,7 +822,7 @@ async fn 
simple_intersect_table_reuse() -> Result<()> { // Schema check works because we set aliases to what the Substrait consumer will generate. assert_expected_plan( "SELECT count(1) FROM (SELECT left.a FROM data AS left INTERSECT SELECT right.a FROM data AS right);", - "Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]\ + "Aggregate: groupBy=[[]], aggr=[[count(*)]]\ \n Projection: \ \n LeftSemi Join: left.a = right.a\ \n SubqueryAlias: left\ From bbc6049f1c87eba7e4eac7e6f03a26dc8679310e Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Sat, 22 Feb 2025 17:04:11 +0800 Subject: [PATCH 48/71] fix: normalize column names in table constraints (#14794) * fix: normalize column names in table constraints * newline * Move slt * restore ddl.slt --- datafusion/sql/src/statement.rs | 85 ++++++++++--------- .../test_files/ident_normalization.slt | 41 +++++++++ 2 files changed, 86 insertions(+), 40 deletions(-) diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 74055d979145..fbe6d6501c86 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -441,7 +441,7 @@ impl SqlToRel<'_, S> { plan }; - let constraints = Self::new_constraint_from_table_constraints( + let constraints = self.new_constraint_from_table_constraints( &all_constraints, plan.schema(), )?; @@ -465,7 +465,7 @@ impl SqlToRel<'_, S> { schema, }; let plan = LogicalPlan::EmptyRelation(plan); - let constraints = Self::new_constraint_from_table_constraints( + let constraints = self.new_constraint_from_table_constraints( &all_constraints, plan.schema(), )?; @@ -1434,7 +1434,7 @@ impl SqlToRel<'_, S> { let name = self.object_name_to_table_reference(name)?; let constraints = - Self::new_constraint_from_table_constraints(&all_constraints, &df_schema)?; + self.new_constraint_from_table_constraints(&all_constraints, &df_schema)?; Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( PlanCreateExternalTable { schema: df_schema, @@ -1454,8 +1454,34 @@ impl SqlToRel<'_, S> { ))) } + /// Get the indices of the constraint columns in the schema. + /// If any column is not found, return an error. + fn get_constraint_column_indices( + &self, + df_schema: &DFSchemaRef, + columns: &[Ident], + constraint_name: &str, + ) -> Result> { + let field_names = df_schema.field_names(); + columns + .iter() + .map(|ident| { + let column = self.ident_normalizer.normalize(ident.clone()); + field_names + .iter() + .position(|item| *item == column) + .ok_or_else(|| { + plan_datafusion_err!( + "Column for {constraint_name} not found in schema: {column}" + ) + }) + }) + .collect::>>() + } + /// Convert each [TableConstraint] to corresponding [Constraint] fn new_constraint_from_table_constraints( + &self, constraints: &[TableConstraint], df_schema: &DFSchemaRef, ) -> Result { @@ -1463,46 +1489,25 @@ impl SqlToRel<'_, S> { .iter() .map(|c: &TableConstraint| match c { TableConstraint::Unique { name, columns, .. 
} => { - let field_names = df_schema.field_names(); - // Get unique constraint indices in the schema: - let indices = columns - .iter() - .map(|u| { - let idx = field_names - .iter() - .position(|item| *item == u.value) - .ok_or_else(|| { - let name = name - .as_ref() - .map(|name| format!("with name '{name}' ")) - .unwrap_or("".to_string()); - DataFusionError::Execution( - format!("Column for unique constraint {}not found in schema: {}", name,u.value) - ) - })?; - Ok(idx) - }) - .collect::>>()?; + let constraint_name = match name { + Some(name) => &format!("unique constraint with name '{name}'"), + None => "unique constraint", + }; + // Get unique constraint indices in the schema + let indices = self.get_constraint_column_indices( + df_schema, + columns, + constraint_name, + )?; Ok(Constraint::Unique(indices)) } TableConstraint::PrimaryKey { columns, .. } => { - let field_names = df_schema.field_names(); - // Get primary key indices in the schema: - let indices = columns - .iter() - .map(|pk| { - let idx = field_names - .iter() - .position(|item| *item == pk.value) - .ok_or_else(|| { - DataFusionError::Execution(format!( - "Column for primary key not found in schema: {}", - pk.value - )) - })?; - Ok(idx) - }) - .collect::>>()?; + // Get primary key indices in the schema + let indices = self.get_constraint_column_indices( + df_schema, + columns, + "primary key", + )?; Ok(Constraint::PrimaryKey(indices)) } TableConstraint::ForeignKey { .. } => { diff --git a/datafusion/sqllogictest/test_files/ident_normalization.slt b/datafusion/sqllogictest/test_files/ident_normalization.slt index 996093c3ad9c..ac2f460ebc43 100644 --- a/datafusion/sqllogictest/test_files/ident_normalization.slt +++ b/datafusion/sqllogictest/test_files/ident_normalization.slt @@ -132,3 +132,44 @@ query T SELECT CONCAT('Hello', 'World') ---- HelloWorld + +# Restore default setting +statement ok +set datafusion.sql_parser.enable_ident_normalization = true; + +########## +## Constraint Column Name Normalization +########## + +# Test issue https://github.com/apache/datafusion/issues/14340 +statement ok +create table test_pk_constraint(COLUMN_NAME TEXT NOT NULL, constraint COLUMN_NAME_PK primary key (COLUMN_NAME)) + +statement ok +create table test_unique_constraint(cOlUmn_name TEXT NOT NULL, unique(COLUMN_NAME)) + +statement ok +drop table test_pk_constraint; + +statement ok +drop table test_unique_constraint; + +# Test with normalization disabled +statement ok +set datafusion.sql_parser.enable_ident_normalization = false; + +statement error Error during planning: Column for primary key not found in schema: COLUMN_NAME +create table test_pk_constraint(column_name TEXT NOT NULL, constraint COLUMN_NAME_PK primary key (COLUMN_NAME)) + +statement ok +create table test_pk_constraint(COLUMN_NAME TEXT NOT NULL, constraint COLUMN_NAME_PK primary key (COLUMN_NAME)) + +statement error Error during planning: Column for unique constraint not found in schema: COLUMN_NAME +create table test_unique_constraint(column_name TEXT NOT NULL, unique(COLUMN_NAME)) + +statement ok +drop table test_pk_constraint; + +# Restore default setting +statement ok +set datafusion.sql_parser.enable_ident_normalization = true; From 6bfeb0272ed9093dbb543eab89c18be40aa8bef8 Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Sat, 22 Feb 2025 21:06:09 +0800 Subject: [PATCH 49/71] =?UTF-8?q?fix:=20we=20are=20missing=20the=20unlimit?= =?UTF-8?q?ed=20case=20for=20bounded=20streaming=20when=20usi=E2=80=A6=20(?= =?UTF-8?q?#14815)?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: we are missing the unlimited case for bounded streaming when using datafusion-cli * Address comments --- datafusion-cli/src/exec.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 84664794b7d9..d560dee987f5 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -258,17 +258,19 @@ pub(super) async fn exec_and_print( let mut stream = execute_stream(physical_plan, task_ctx.clone())?; let mut results = vec![]; let mut row_count = 0_usize; + let max_rows = match print_options.maxrows { + MaxRows::Unlimited => usize::MAX, + MaxRows::Limited(n) => n, + }; while let Some(batch) = stream.next().await { let batch = batch?; let curr_num_rows = batch.num_rows(); - if let MaxRows::Limited(max_rows) = print_options.maxrows { - // Stop collecting results if the number of rows exceeds the limit - // results batch should include the last batch that exceeds the limit - if row_count < max_rows + curr_num_rows { - // Try to grow the reservation to accommodate the batch in memory - reservation.try_grow(get_record_batch_memory_size(&batch))?; - results.push(batch); - } + // Stop collecting results if the number of rows exceeds the limit + // results batch should include the last batch that exceeds the limit + if row_count < max_rows + curr_num_rows { + // Try to grow the reservation to accommodate the batch in memory + reservation.try_grow(get_record_batch_memory_size(&batch))?; + results.push(batch); } row_count += curr_num_rows; } From 0bd9083a0b8bff6d261449a92dcd4d110976774a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 22 Feb 2025 15:08:06 -0500 Subject: [PATCH 50/71] Minor: comment in Cargo.toml about MSRV (#14809) - I wasn't able to quickly find where the MSRV was defined when filing https://github.com/apache/datafusion/issues/14808 so I would like to make it easier to find next time --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index adb3ee23d947..45b146e2e21a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,9 @@ homepage = "https://datafusion.apache.org" license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/datafusion" +# Define Minimum Supported Rust Version (MSRV) rust-version = "1.82.0" +# Define DataFusion version version = "45.0.0" [workspace.dependencies] From 1c54b38e4a4012fd8d1b4f48e2c3d6d35016bad0 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sun, 23 Feb 2025 13:20:40 +0000 Subject: [PATCH 51/71] Simplify `FileSource::create_file_opener`'s signature (#14798) * simplify fn signature * .
--- .../examples/csv_json_opener.rs | 2 +- datafusion/core/src/datasource/data_source.rs | 4 ++-- .../datasource/physical_plan/arrow_file.rs | 10 ++++---- .../core/src/datasource/physical_plan/avro.rs | 14 +++++------ .../core/src/datasource/physical_plan/csv.rs | 10 ++++---- .../physical_plan/file_scan_config.rs | 4 ++-- .../core/src/datasource/physical_plan/json.rs | 10 ++++---- .../physical_plan/parquet/source.rs | 23 ++++++++----------- 8 files changed, 36 insertions(+), 41 deletions(-) diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 7613578e8c3a..ef4ff9d51e7f 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -71,7 +71,7 @@ async fn csv_opener() -> Result<()> { .with_batch_size(8192) .with_projection(&scan_config); - let opener = config.create_file_opener(Ok(object_store), &scan_config, 0)?; + let opener = config.create_file_opener(object_store, &scan_config, 0); let mut result = vec![]; let mut stream = diff --git a/datafusion/core/src/datasource/data_source.rs b/datafusion/core/src/datasource/data_source.rs index fcb31194eab1..2db79c5c839d 100644 --- a/datafusion/core/src/datasource/data_source.rs +++ b/datafusion/core/src/datasource/data_source.rs @@ -40,10 +40,10 @@ pub trait FileSource: Send + Sync { /// Creates a `dyn FileOpener` based on given parameters fn create_file_opener( &self, - object_store: datafusion_common::Result>, + object_store: Arc, base_config: &FileScanConfig, partition: usize, - ) -> datafusion_common::Result>; + ) -> Arc; /// Any fn as_any(&self) -> &dyn Any; /// Initialize new type with batch size configuration diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index d0d037924862..e5523063c782 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -211,14 +211,14 @@ pub struct ArrowSource { impl FileSource for ArrowSource { fn create_file_opener( &self, - object_store: Result>, + object_store: Arc, base_config: &FileScanConfig, _partition: usize, - ) -> Result> { - Ok(Arc::new(ArrowOpener { - object_store: object_store?, + ) -> Arc { + Arc::new(ArrowOpener { + object_store, projection: base_config.file_column_projection_indices(), - })) + }) } fn as_any(&self) -> &dyn Any { diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index ae98c19a1615..1674814d76a7 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -194,23 +194,23 @@ impl FileSource for AvroSource { #[cfg(feature = "avro")] fn create_file_opener( &self, - object_store: Result>, + object_store: Arc, _base_config: &FileScanConfig, _partition: usize, - ) -> Result> { - Ok(Arc::new(private::AvroOpener { + ) -> Arc { + Arc::new(private::AvroOpener { config: Arc::new(self.clone()), - object_store: object_store?, - })) + object_store, + }) } #[cfg(not(feature = "avro"))] fn create_file_opener( &self, - _object_store: Result>, + _object_store: Arc, _base_config: &FileScanConfig, _partition: usize, - ) -> Result> { + ) -> Arc { panic!("Avro feature is not enabled in this build") } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 8fcfd6b41e85..629d452064f5 100644 --- 
a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -564,15 +564,15 @@ impl CsvOpener { impl FileSource for CsvSource { fn create_file_opener( &self, - object_store: Result>, + object_store: Arc, base_config: &FileScanConfig, _partition: usize, - ) -> Result> { - Ok(Arc::new(CsvOpener { + ) -> Arc { + Arc::new(CsvOpener { config: Arc::new(self.clone()), file_compression_type: base_config.file_compression_type, - object_store: object_store?, - })) + object_store, + }) } fn as_any(&self) -> &dyn Any { diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 5c882ed75109..6b74f6be79eb 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -162,7 +162,7 @@ impl DataSource for FileScanConfig { partition: usize, context: Arc, ) -> Result { - let object_store = context.runtime_env().object_store(&self.object_store_url); + let object_store = context.runtime_env().object_store(&self.object_store_url)?; let source = self .source @@ -170,7 +170,7 @@ impl DataSource for FileScanConfig { .with_schema(Arc::clone(&self.file_schema)) .with_projection(self); - let opener = source.create_file_opener(object_store, self, partition)?; + let opener = source.create_file_opener(object_store, self, partition); let stream = FileStream::new(self, partition, opener, source.metrics())?; Ok(Box::pin(stream)) diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index f2304ed8a342..d1ae13b083ab 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -262,18 +262,18 @@ impl JsonSource { impl FileSource for JsonSource { fn create_file_opener( &self, - object_store: Result>, + object_store: Arc, base_config: &FileScanConfig, _partition: usize, - ) -> Result> { - Ok(Arc::new(JsonOpener { + ) -> Arc { + Arc::new(JsonOpener { batch_size: self .batch_size .expect("Batch size must set before creating opener"), projected_schema: base_config.projected_file_schema(), file_compression_type: base_config.file_compression_type, - object_store: object_store?, - })) + object_store, + }) } fn as_any(&self) -> &dyn Any { diff --git a/datafusion/core/src/datasource/physical_plan/parquet/source.rs b/datafusion/core/src/datasource/physical_plan/parquet/source.rs index 26a5877e2d38..178de8f51ae4 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/source.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/source.rs @@ -463,10 +463,10 @@ impl ParquetSource { impl FileSource for ParquetSource { fn create_file_opener( &self, - object_store: datafusion_common::Result>, + object_store: Arc, base_config: &FileScanConfig, partition: usize, - ) -> datafusion_common::Result> { + ) -> Arc { let projection = base_config .file_column_projection_indices() .unwrap_or_else(|| (0..base_config.file_schema.fields().len()).collect()); @@ -475,17 +475,12 @@ impl FileSource for ParquetSource { .clone() .unwrap_or_else(|| Arc::new(DefaultSchemaAdapterFactory)); - let parquet_file_reader_factory = self - .parquet_file_reader_factory - .as_ref() - .map(|f| Ok(Arc::clone(f))) - .unwrap_or_else(|| { - object_store.map(|store| { - Arc::new(DefaultParquetFileReaderFactory::new(store)) as _ - }) - })?; - - Ok(Arc::new(ParquetOpener { + let 
parquet_file_reader_factory = + self.parquet_file_reader_factory.clone().unwrap_or_else(|| { + Arc::new(DefaultParquetFileReaderFactory::new(object_store)) as _ + }); + + Arc::new(ParquetOpener { partition_index: partition, projection: Arc::from(projection), batch_size: self @@ -504,7 +499,7 @@ impl FileSource for ParquetSource { enable_page_index: self.enable_page_index(), enable_bloom_filter: self.bloom_filter_on_read(), schema_adapter_factory, - })) + }) } fn as_any(&self) -> &dyn Any { From c0b749da432feaf2e7f953d492c8c27756460063 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Sun, 23 Feb 2025 05:46:52 -0800 Subject: [PATCH 52/71] Remove unused crate dependencies (#14827) * Enable 'extended tests' on forks Allow contributors to run extended tests workflow if they wish to, just like they can run rust tests workflow on their forks, before opening a PR to DataFusion. GitHub allows enabling/disabling workflows in the web UI without needing to change workflow yaml file. * Remove unused crate dependencies Found by `cargo udeps`. Unfortunately there were false positives too. * one on workspace level --- .github/workflows/extended.yml | 5 +---- Cargo.lock | 3 --- Cargo.toml | 1 - datafusion/physical-optimizer/Cargo.toml | 1 - datafusion/wasmtest/Cargo.toml | 2 -- 5 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 0f52d14cdae2..3f882d7a3a82 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -23,16 +23,13 @@ concurrency: # https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#running-your-pull_request-workflow-when-a-pull-request-merges # -# These jobs only run on the `main` branch as they are time consuming +# These jobs are not run as part of PR checks as they are time-consuming # and should not fail often. # # We still run them as they provide important coverage to ensure correctness # in the (very rare) event of a hash failure or sqlite library query failure. on: - # Run on all commits to main push: - branches: - - main jobs: # Check crate compiles and base cargo check passes diff --git a/Cargo.lock b/Cargo.lock index 716e0cf10386..d5edabdf4585 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2232,7 +2232,6 @@ version = "45.0.0" dependencies = [ "arrow", "datafusion-common", - "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2399,13 +2398,11 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-optimizer", - "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-sql", "getrandom 0.2.15", "tokio", "wasm-bindgen", - "wasm-bindgen-futures", "wasm-bindgen-test", ] diff --git a/Cargo.toml b/Cargo.toml index 45b146e2e21a..f7d39aeb3003 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -146,7 +146,6 @@ pbjson = { version = "0.7.0" } pbjson-types = "0.7" # Should match arrow-flight's version of prost. 
prost = "0.13.1" -prost-derive = "0.13.1" rand = "0.8.5" recursive = "0.1.1" regex = "1.8" diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 9b7e6a0f5db0..e8473e6556d1 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -40,7 +40,6 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } datafusion-common = { workspace = true, default-features = true } -datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 6d64a15f4b99..30d5bcaedcb7 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -50,14 +50,12 @@ datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-optimizer = { workspace = true, default-features = true } -datafusion-physical-expr = { workspace = true, default-features = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } # getrandom must be compiled with js feature getrandom = { version = "0.2.8", features = ["js"] } wasm-bindgen = "0.2.99" -wasm-bindgen-futures = "0.4.49" [dev-dependencies] tokio = { workspace = true } From c92982c393c69cbc4f5d90e003a763f5b0eb8885 Mon Sep 17 00:00:00 2001 From: dentiny Date: Sun, 23 Feb 2025 13:55:19 -0800 Subject: [PATCH 53/71] docs: Add instruction to build (#14694) * Add instruction to build * format readme doc and rename getting started * Add instruction for devcontainer * remove dev container and vscode part --- .../{getting_started.md => development_environment.md} | 2 +- docs/source/contributor-guide/index.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) rename docs/source/contributor-guide/{getting_started.md => development_environment.md} (99%) diff --git a/docs/source/contributor-guide/getting_started.md b/docs/source/contributor-guide/development_environment.md similarity index 99% rename from docs/source/contributor-guide/getting_started.md rename to docs/source/contributor-guide/development_environment.md index a4938ead433b..cd1b8ea35642 100644 --- a/docs/source/contributor-guide/getting_started.md +++ b/docs/source/contributor-guide/development_environment.md @@ -17,7 +17,7 @@ under the License. --> -# Getting Started +# Development Environment This section describes how you can get started at developing DataFusion. diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 0fe86c174a31..65d92e7d7926 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -30,6 +30,10 @@ In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. +## Development Environment + +You can find how to setup build and testing environment [here](https://datafusion.apache.org/user-guide/example-usage.html) + ## Finding and Creating Issues to Work On You can find a curated [good-first-issue] list to help you get started. 
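For context on the dependency cleanup in the `Remove unused crate dependencies` patch above: its commit message notes that the unused crates were found with `cargo udeps`. A typical invocation is sketched below; this is an illustrative assumption rather than part of the patch series, and it presumes a nightly toolchain is installed, since cargo-udeps requires one:

    cargo install cargo-udeps --locked
    cargo +nightly udeps --workspace

Because the tool can report false positives (as that commit message also notes), reported candidates are usually verified by hand, for example with a full `cargo check --workspace --all-targets`, before actually removing them.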
From 7299d0e566caa1e10f47a74b8ae817b6fb146fdf Mon Sep 17 00:00:00 2001 From: oznur-synnada Date: Mon, 24 Feb 2025 10:48:24 +0300 Subject: [PATCH 54/71] Update website links (#14846) --- docs/source/contributor-guide/gsoc_application_guidelines.md | 2 +- docs/source/contributor-guide/gsoc_project_ideas.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/gsoc_application_guidelines.md b/docs/source/contributor-guide/gsoc_application_guidelines.md index fddd0b5e1805..e8ca9703a5dd 100644 --- a/docs/source/contributor-guide/gsoc_application_guidelines.md +++ b/docs/source/contributor-guide/gsoc_application_guidelines.md @@ -25,7 +25,7 @@ To apply, follow these steps: - Review the list of proposed GSoC projects for Apache DataFusion. - If you have your own project idea, discuss it with potential mentors before submitting your proposal. 2. **Engage with the Community** - - Join our [mailing list](mailto:dev@datafusion.apache.org) and [Discord](https://discord.gg/Q9eh6S2T) to introduce yourself and ask questions. + - Join our [mailing list](mailto:dev@datafusion.apache.org) and [Discord](https://discord.gg/jHzkpK4em5) to introduce yourself and ask questions. - Optional: Submit a small pull request (PR) for an issue marked with the **good first issue** tag to understand/test whether you enjoy working on Apache DataFusion, get comfortable with navigating the codebase and demonstrate your ability. 3. **Write a Clear Proposal** - You can use the template below to structure your proposal. diff --git a/docs/source/contributor-guide/gsoc_project_ideas.md b/docs/source/contributor-guide/gsoc_project_ideas.md index 3feaba559a48..da6c24e2921b 100644 --- a/docs/source/contributor-guide/gsoc_project_ideas.md +++ b/docs/source/contributor-guide/gsoc_project_ideas.md @@ -109,4 +109,4 @@ Welcome to the Apache DataFusion Google Summer of Code (GSoC) 2025 project ideas ## Contact Us -You can join our [mailing list](mailto:dev%40datafusion.apache.org) and [Discord](https://discord.gg/Q9eh6S2T) to introduce yourself and ask questions. +You can join our [mailing list](mailto:dev%40datafusion.apache.org) and [Discord](https://discord.gg/jHzkpK4em5) to introduce yourself and ask questions. From a5cc0313499d7c4b795af449f6a63e557d8c44ec Mon Sep 17 00:00:00 2001 From: Rohan Krishnaswamy <47869999+rkrishn7@users.noreply.github.com> Date: Mon, 24 Feb 2025 03:23:47 -0800 Subject: [PATCH 55/71] fix(physical-expr): Remove empty constants check when ordering is satisfied (#14829) * fix: Remove empty constants check when ordering is satisfied * fix: Update failing UNION [ALL] BY NAME SLTs --- .../src/equivalence/properties.rs | 2 +- .../sqllogictest/test_files/union_by_name.slt | 43 ++++++++++--------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 838cb26807a9..1ad4093b1f93 100755 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -2259,7 +2259,7 @@ impl UnionEquivalentOrderingBuilder { ) -> AddedOrdering { if ordering.is_empty() { AddedOrdering::Yes - } else if constants.is_empty() && properties.ordering_satisfy(ordering.as_ref()) { + } else if properties.ordering_satisfy(ordering.as_ref()) { // If the ordering satisfies the target properties, no need to // augment it with constants. 
self.orderings.push(ordering); diff --git a/datafusion/sqllogictest/test_files/union_by_name.slt b/datafusion/sqllogictest/test_files/union_by_name.slt index 0ba4c32ee5be..63a43a36ff16 100644 --- a/datafusion/sqllogictest/test_files/union_by_name.slt +++ b/datafusion/sqllogictest/test_files/union_by_name.slt @@ -92,15 +92,16 @@ NULL 1 NULL 3 5 NULL -# TODO: This should pass, but the sanity checker isn't allowing it. -# Commenting out the ordering check in the sanity checker produces the correct result. -query error +query II (SELECT x FROM t1 UNION ALL SELECT x FROM t1) UNION ALL BY NAME SELECT 5 ORDER BY x; ---- -DataFusion error: SanityCheckPlan -caused by -Error during planning: Plan: ["SortPreservingMergeExec: [x@1 ASC NULLS LAST]", " UnionExec", " SortExec: expr=[x@1 ASC NULLS LAST], preserve_partitioning=[true]", " ProjectionExec: expr=[NULL as Int64(5), x@0 as x]", " UnionExec", " DataSourceExec: partitions=1, partition_sizes=[1]", " DataSourceExec: partitions=1, partition_sizes=[1]", " ProjectionExec: expr=[5 as Int64(5), NULL as x]", " PlaceholderRowExec"] does not satisfy order requirements: [x@1 ASC NULLS LAST]. Child-0 order: [] - +NULL 1 +NULL 1 +NULL 3 +NULL 3 +NULL 3 +NULL 3 +5 NULL query II (SELECT x FROM t1 UNION ALL SELECT y FROM t1) UNION BY NAME SELECT 5 ORDER BY x; @@ -109,15 +110,16 @@ NULL 1 NULL 3 5 NULL -# TODO: This should pass, but the sanity checker isn't allowing it. -# Commenting out the ordering check in the sanity checker produces the correct result. -query error +query II (SELECT x FROM t1 UNION ALL SELECT y FROM t1) UNION ALL BY NAME SELECT 5 ORDER BY x; ---- -DataFusion error: SanityCheckPlan -caused by -Error during planning: Plan: ["SortPreservingMergeExec: [x@1 ASC NULLS LAST]", " UnionExec", " SortExec: expr=[x@1 ASC NULLS LAST], preserve_partitioning=[true]", " ProjectionExec: expr=[NULL as Int64(5), x@0 as x]", " UnionExec", " DataSourceExec: partitions=1, partition_sizes=[1]", " ProjectionExec: expr=[y@0 as x]", " DataSourceExec: partitions=1, partition_sizes=[1]", " ProjectionExec: expr=[5 as Int64(5), NULL as x]", " PlaceholderRowExec"] does not satisfy order requirements: [x@1 ASC NULLS LAST]. Child-0 order: [] - +NULL 1 +NULL 1 +NULL 3 +NULL 3 +NULL 3 +NULL 3 +5 NULL # Ambiguous name @@ -158,15 +160,14 @@ NULL NULL 4 NULL NULL 5 NULL NULL 6 -# TODO: This should pass, but the sanity checker isn't allowing it. -# Commenting out the ordering check in the sanity checker produces the correct result. 
-query error +query III SELECT 1 UNION ALL BY NAME SELECT * FROM unnest(range(2, 100)) UNION ALL BY NAME SELECT 999 ORDER BY 3, 1 LIMIT 5; ---- -DataFusion error: SanityCheckPlan -caused by -Error during planning: Plan: ["SortPreservingMergeExec: [UNNEST(range(Int64(2),Int64(100)))@2 ASC NULLS LAST, Int64(1)@0 ASC NULLS LAST], fetch=5", " UnionExec", " SortExec: TopK(fetch=5), expr=[UNNEST(range(Int64(2),Int64(100)))@2 ASC NULLS LAST], preserve_partitioning=[true]", " ProjectionExec: expr=[Int64(1)@0 as Int64(1), NULL as Int64(999), UNNEST(range(Int64(2),Int64(100)))@1 as UNNEST(range(Int64(2),Int64(100)))]", " UnionExec", " ProjectionExec: expr=[1 as Int64(1), NULL as UNNEST(range(Int64(2),Int64(100)))]", " PlaceholderRowExec", " ProjectionExec: expr=[NULL as Int64(1), __unnest_placeholder(range(Int64(2),Int64(100)),depth=1)@0 as UNNEST(range(Int64(2),Int64(100)))]", " UnnestExec", " ProjectionExec: expr=[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] as __unnest_placeholder(range(Int64(2),Int64(100)))]", " PlaceholderRowExec", " ProjectionExec: expr=[NULL as Int64(1), 999 as Int64(999), NULL as UNNEST(range(Int64(2),Int64(100)))]", " PlaceholderRowExec"] does not satisfy order requirements: [UNNEST(range(Int64(2),Int64(100)))@2 ASC NULLS LAST, Int64(1)@0 ASC NULLS LAST]. Child-0 order: [] - +NULL NULL 2 +NULL NULL 3 +NULL NULL 4 +NULL NULL 5 +NULL NULL 6 # Order by From 6c5f214643fe81affcad5f3aa9031f02c4390bf0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 19:33:48 +0800 Subject: [PATCH 56/71] chore(deps): bump log from 0.4.25 to 0.4.26 (#14847) Bumps [log](https://github.com/rust-lang/log) from 0.4.25 to 0.4.26. - [Release notes](https://github.com/rust-lang/log/releases) - [Changelog](https://github.com/rust-lang/log/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/log/compare/0.4.25...0.4.26) --- updated-dependencies: - dependency-name: log dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d5edabdf4585..22f06f9932a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3705,9 +3705,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.25" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "lz4_flex" From c58a812d4d1005b8f9221fa76580a8df8f53cc19 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 24 Feb 2025 12:55:25 +0000 Subject: [PATCH 57/71] Minor: Ignore examples output directory (#14840) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 6196385144ab..4ae32925d908 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,6 @@ datafusion/core/benches/data/* # rat filtered_rat.txt rat.txt + +# data generated by examples +datafusion-examples/examples/datafusion-examples/ From 0fbd20c3f403f769cc291342b085f935440a7a37 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Mon, 24 Feb 2025 16:18:06 +0200 Subject: [PATCH 58/71] StatisticsV2: initial statistics framework redesign (#14699) * StatisticsV2: initial definition and validation method implementation * Implement mean, median and standard deviation extraction for StatsV2 * Move stats_v2 to `physical-expr` package * Introduce `ExprStatisticGraph` and `ExprStatisticGraphNode` * Split the StatisticsV2 and statistics graph locations, prepare the infrastructure for stats top-down propagation and final bottom-up calculation * Calculate variance instead of std_dev * Create a skeleton for statistics bottom-up evaluation * Introduce high-level test for 'evaluate_statistics()' * Refactor result distribution computation during the statistics evaluation phase; add compute_range function * Always produce Unknown distribution in non-mentioned combination cases, todos for the future * Introduce Bernoulli distribution to be used as result of comparisons and inequations distribution combinations * Implement initial statistics propagation of Uniform and Unknown distributions with known ranges * Implement evaluate_statistics for logical not and unary negation operator * Fix and add tests; make fmt happy * Add integration test, implement conversion into Bernoulli distribution for Eq and NotEq * Finish test, small cleanup * minor improvements * Update stats.rs * Addressing review comments * Implement median computation for Gaussian-Gaussian pair * Update stats_v2.rs * minor improvements * Addressing second review comments, part 1 * Return true in other cases * Finish addressing review requests, part 2 * final clean-up * bug fix * final clean-up * apply reverse logic in stats framework as well * Update cp_solver.rs * revert data.parquet * Apply suggestions from code review * Update datafusion/physical-expr-common/src/stats_v2.rs * Update datafusion/physical-expr-common/src/stats_v2.rs * Apply suggestions from code review Fix links * Fix compilation issue * Fix mean/median formula for exponential distribution * casting + exp dir + remove opt's + is_valid refactor * Update stats_v2_graph.rs * remove inner mod * last todo: bernoulli propagation * Apply suggestions from code review * Apply suggestions from code review * prop_stats in binary * Update binary.rs * 
rename intervals * block explicit construction * test updates * Update binary.rs * revert renaming * impl range methods as well * Apply suggestions from code review * Apply suggestions from code review * Update datafusion/physical-expr-common/src/stats_v2.rs * Update stats_v2.rs * fmt * fix bernoulli or eval * fmt * Review * Review Part 2 * not propagate * clean-up * Review Part 3 * Review Part 4 * Review Part 5 * Review Part 6 * Review Part 7 * Review Part 8 * Review Part 9 * Review Part 10 * Review Part 11 * Review Part 12 * Review Part 13 * Review Part 14 * Review Part 15 | Fix equality comparisons between uniform distributions * Review Part 16 | Remove unnecessary temporary file * Review Part 17 | Leave TODOs for real-valued summary statistics * Review Part 18 * Review Part 19 | Fix variance calculations * Review Part 20 | Fix range calculations * Review Part 21 * Review Part 22 * Review Part 23 * Review Part 24 | Add default implementations for evaluate_statistics and propagate_statistics * Review Part 25 | Improve docs, refactor statistics graph code * Review Part 26 * Review Part 27 * Review Part 28 | Remove get_zero/get_one, simplify propagation in statistics graph * Review Part 29 * Review Part 30 | Move statistics-combining functions to core module, polish tests * Review Part 31 * Review Part 32 | Module reorganization * Review Part 33 * Add tests for bernoulli and gaussians combination * Incorporate community feedback * Fix merge issue --------- Co-authored-by: Sasha Syrotenko Co-authored-by: berkaysynnada Co-authored-by: Mehmet Ozan Kabak --- datafusion/common/src/spans.rs | 2 +- .../expr-common/src/interval_arithmetic.rs | 300 ++- datafusion/expr-common/src/lib.rs | 1 + datafusion/expr-common/src/statistics.rs | 1620 +++++++++++++++++ datafusion/expr/src/lib.rs | 4 +- .../physical-expr-common/src/physical_expr.rs | 130 +- .../physical-expr/src/expressions/binary.rs | 295 ++- .../physical-expr/src/expressions/negative.rs | 139 +- .../physical-expr/src/expressions/not.rs | 153 +- .../physical-expr/src/intervals/cp_solver.rs | 276 +-- datafusion/physical-expr/src/lib.rs | 1 + .../physical-expr/src/statistics/mod.rs | 20 + .../src/statistics/stats_solver.rs | 287 +++ test-utils/src/array_gen/string.rs | 2 +- 14 files changed, 3059 insertions(+), 171 deletions(-) create mode 100644 datafusion/expr-common/src/statistics.rs create mode 100644 datafusion/physical-expr/src/statistics/mod.rs create mode 100644 datafusion/physical-expr/src/statistics/stats_solver.rs diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 40ebdeffb601..5111e264123c 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -140,7 +140,7 @@ impl Span { /// the column a that comes from SELECT 1 AS a UNION ALL SELECT 2 AS a you'll /// need two spans. #[derive(Debug, Clone)] -// Store teh first [`Span`] on the stack because that is by far the most common +// Store the first [`Span`] on the stack because that is by far the most common // case. More will spill onto the heap. pub struct Spans(pub Vec); diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 7f20020c3457..9d00b45962bc 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -17,12 +17,13 @@ //! 
Interval arithmetic library -use crate::operator::Operator; -use crate::type_coercion::binary::BinaryTypeCoercer; use std::borrow::Borrow; use std::fmt::{self, Display, Formatter}; use std::ops::{AddAssign, SubAssign}; +use crate::operator::Operator; +use crate::type_coercion::binary::{comparison_coercion_numeric, BinaryTypeCoercer}; + use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{ DataType, IntervalDayTime, IntervalMonthDayNano, IntervalUnit, TimeUnit, @@ -168,7 +169,7 @@ macro_rules! value_transition { /// limits after any operation, they either become unbounded or they are fixed /// to the maximum/minimum value of the datatype, depending on the direction /// of the overflowing endpoint, opting for the safer choice. -/// +/// /// 4. **Floating-point special cases**: /// - `INF` values are converted to `NULL`s while constructing an interval to /// ensure consistency, with other data types. @@ -405,13 +406,18 @@ impl Interval { // There must be no way to create an interval whose endpoints have // different types. - assert!( + debug_assert!( lower_type == upper_type, "Interval bounds have different types: {lower_type} != {upper_type}" ); lower_type } + /// Checks if the interval is unbounded (on either side). + pub fn is_unbounded(&self) -> bool { + self.lower.is_null() || self.upper.is_null() + } + /// Casts this interval to `data_type` using `cast_options`. pub fn cast_to( &self, @@ -645,7 +651,7 @@ impl Interval { let upper = min_of_bounds(&self.upper, &rhs.upper); // New lower and upper bounds must always construct a valid interval. - assert!( + debug_assert!( (lower.is_null() || upper.is_null() || (lower <= upper)), "The intersection of two intervals can not be an invalid interval" ); @@ -653,26 +659,70 @@ impl Interval { Ok(Some(Self { lower, upper })) } - /// Decide if this interval certainly contains, possibly contains, or can't - /// contain a [`ScalarValue`] (`other`) by returning `[true, true]`, - /// `[false, true]` or `[false, false]` respectively. + /// Compute the union of this interval with the given interval. /// /// NOTE: This function only works with intervals of the same data type. /// Attempting to compare intervals of different data types will lead /// to an error. - pub fn contains_value>(&self, other: T) -> Result { + pub fn union>(&self, other: T) -> Result { let rhs = other.borrow(); if self.data_type().ne(&rhs.data_type()) { + return internal_err!( + "Cannot calculate the union of intervals with different data types, lhs:{}, rhs:{}", + self.data_type(), + rhs.data_type() + ); + }; + + let lower = if self.lower.is_null() + || (!rhs.lower.is_null() && self.lower <= rhs.lower) + { + self.lower.clone() + } else { + rhs.lower.clone() + }; + let upper = if self.upper.is_null() + || (!rhs.upper.is_null() && self.upper >= rhs.upper) + { + self.upper.clone() + } else { + rhs.upper.clone() + }; + + // New lower and upper bounds must always construct a valid interval. + debug_assert!( + (lower.is_null() || upper.is_null() || (lower <= upper)), + "The union of two intervals can not be an invalid interval" + ); + + Ok(Self { lower, upper }) + } + + /// Decide if this interval contains a [`ScalarValue`] (`other`) by returning `true` or `false`. 
+ pub fn contains_value>(&self, other: T) -> Result { + let rhs = other.borrow(); + + let (lhs_lower, lhs_upper, rhs) = if self.data_type().eq(&rhs.data_type()) { + (&self.lower, &self.upper, rhs) + } else if let Some(common_type) = + comparison_coercion_numeric(&self.data_type(), &rhs.data_type()) + { + ( + &self.lower.cast_to(&common_type)?, + &self.upper.cast_to(&common_type)?, + &rhs.cast_to(&common_type)?, + ) + } else { return internal_err!( "Data types must be compatible for containment checks, lhs:{}, rhs:{}", self.data_type(), rhs.data_type() ); - } + }; // We only check the upper bound for a `None` value because `None` // values are less than `Some` values according to Rust. - Ok(&self.lower <= rhs && (self.upper.is_null() || rhs <= &self.upper)) + Ok(lhs_lower <= rhs && (lhs_upper.is_null() || rhs <= lhs_upper)) } /// Decide if this interval is a superset of, overlaps with, or @@ -825,6 +875,17 @@ impl Interval { } } + /// Computes the width of this interval; i.e. the difference between its + /// bounds. For unbounded intervals, this function will return a `NULL` + /// `ScalarValue` If the underlying data type doesn't support subtraction, + /// this function will return an error. + pub fn width(&self) -> Result { + let dt = self.data_type(); + let width_dt = + BinaryTypeCoercer::new(&dt, &Operator::Minus, &dt).get_result_type()?; + Ok(sub_bounds::(&width_dt, &self.upper, &self.lower)) + } + /// Returns the cardinality of this interval, which is the number of all /// distinct points inside it. This function returns `None` if: /// - The interval is unbounded from either side, or @@ -874,10 +935,10 @@ impl Interval { /// This method computes the arithmetic negation of the interval, reflecting /// it about the origin of the number line. This operation swaps and negates /// the lower and upper bounds of the interval. - pub fn arithmetic_negate(self) -> Result { + pub fn arithmetic_negate(&self) -> Result { Ok(Self { - lower: self.upper().clone().arithmetic_negate()?, - upper: self.lower().clone().arithmetic_negate()?, + lower: self.upper.arithmetic_negate()?, + upper: self.lower.arithmetic_negate()?, }) } } @@ -1119,11 +1180,11 @@ fn next_value_helper(value: ScalarValue) -> ScalarValue { match value { // f32/f64::NEG_INF/INF and f32/f64::NaN values should not emerge at this point. 
Float32(Some(val)) => { - assert!(val.is_finite(), "Non-standardized floating point usage"); + debug_assert!(val.is_finite(), "Non-standardized floating point usage"); Float32(Some(if INC { next_up(val) } else { next_down(val) })) } Float64(Some(val)) => { - assert!(val.is_finite(), "Non-standardized floating point usage"); + debug_assert!(val.is_finite(), "Non-standardized floating point usage"); Float64(Some(if INC { next_up(val) } else { next_down(val) })) } Int8(Some(val)) => Int8(Some(increment_decrement::(val))), @@ -1275,7 +1336,7 @@ pub fn satisfy_greater( } else { right.upper.clone() }; - + // No possibility to create an invalid interval: Ok(Some(( Interval::new(new_left_lower, left.upper.clone()), Interval::new(right.lower.clone(), new_right_upper), @@ -1868,6 +1929,7 @@ mod tests { }; use arrow::datatypes::DataType; + use datafusion_common::rounding::{next_down, next_up}; use datafusion_common::{Result, ScalarValue}; #[test] @@ -2532,6 +2594,126 @@ mod tests { Ok(()) } + #[test] + fn union_test() -> Result<()> { + let possible_cases = vec![ + ( + Interval::make(Some(1000_i64), None)?, + Interval::make::(None, None)?, + Interval::make_unbounded(&DataType::Int64)?, + ), + ( + Interval::make(Some(1000_i64), None)?, + Interval::make(None, Some(1000_i64))?, + Interval::make_unbounded(&DataType::Int64)?, + ), + ( + Interval::make(Some(1000_i64), None)?, + Interval::make(None, Some(2000_i64))?, + Interval::make_unbounded(&DataType::Int64)?, + ), + ( + Interval::make(Some(1000_i64), Some(2000_i64))?, + Interval::make(Some(1000_i64), None)?, + Interval::make(Some(1000_i64), None)?, + ), + ( + Interval::make(Some(1000_i64), Some(2000_i64))?, + Interval::make(Some(1000_i64), Some(1500_i64))?, + Interval::make(Some(1000_i64), Some(2000_i64))?, + ), + ( + Interval::make(Some(1000_i64), Some(2000_i64))?, + Interval::make(Some(500_i64), Some(1500_i64))?, + Interval::make(Some(500_i64), Some(2000_i64))?, + ), + ( + Interval::make::(None, None)?, + Interval::make::(None, None)?, + Interval::make::(None, None)?, + ), + ( + Interval::make(Some(1000_i64), None)?, + Interval::make(None, Some(0_i64))?, + Interval::make_unbounded(&DataType::Int64)?, + ), + ( + Interval::make(Some(1000_i64), None)?, + Interval::make(None, Some(999_i64))?, + Interval::make_unbounded(&DataType::Int64)?, + ), + ( + Interval::make(Some(1500_i64), Some(2000_i64))?, + Interval::make(Some(1000_i64), Some(1499_i64))?, + Interval::make(Some(1000_i64), Some(2000_i64))?, + ), + ( + Interval::make(Some(0_i64), Some(1000_i64))?, + Interval::make(Some(2000_i64), Some(3000_i64))?, + Interval::make(Some(0_i64), Some(3000_i64))?, + ), + ( + Interval::make(None, Some(2000_u64))?, + Interval::make(Some(500_u64), None)?, + Interval::make(Some(0_u64), None)?, + ), + ( + Interval::make(Some(0_u64), Some(0_u64))?, + Interval::make(Some(0_u64), None)?, + Interval::make(Some(0_u64), None)?, + ), + ( + Interval::make(Some(1000.0_f32), None)?, + Interval::make(None, Some(1000.0_f32))?, + Interval::make_unbounded(&DataType::Float32)?, + ), + ( + Interval::make(Some(1000.0_f32), Some(1500.0_f32))?, + Interval::make(Some(0.0_f32), Some(1500.0_f32))?, + Interval::make(Some(0.0_f32), Some(1500.0_f32))?, + ), + ( + Interval::try_new( + prev_value(ScalarValue::Float32(Some(1.0))), + prev_value(ScalarValue::Float32(Some(1.0))), + )?, + Interval::make(Some(1.0_f32), Some(1.0_f32))?, + Interval::try_new( + prev_value(ScalarValue::Float32(Some(1.0))), + ScalarValue::Float32(Some(1.0)), + )?, + ), + ( + Interval::try_new( + 
next_value(ScalarValue::Float32(Some(1.0))), + next_value(ScalarValue::Float32(Some(1.0))), + )?, + Interval::make(Some(1.0_f32), Some(1.0_f32))?, + Interval::try_new( + ScalarValue::Float32(Some(1.0)), + next_value(ScalarValue::Float32(Some(1.0))), + )?, + ), + ( + Interval::make(Some(-1000.0_f64), Some(1500.0_f64))?, + Interval::make(Some(-1500.0_f64), Some(2000.0_f64))?, + Interval::make(Some(-1500.0_f64), Some(2000.0_f64))?, + ), + ( + Interval::make(Some(16.0_f64), Some(32.0_f64))?, + Interval::make(Some(32.0_f64), Some(64.0_f64))?, + Interval::make(Some(16.0_f64), Some(64.0_f64))?, + ), + ]; + for (first, second, expected) in possible_cases { + println!("{}", first); + println!("{}", second); + assert_eq!(first.union(second)?, expected) + } + + Ok(()) + } + #[test] fn test_contains() -> Result<()> { let possible_cases = vec![ @@ -2594,6 +2776,43 @@ mod tests { Ok(()) } + #[test] + fn test_contains_value() -> Result<()> { + let possible_cases = vec![ + ( + Interval::make(Some(0), Some(100))?, + ScalarValue::Int32(Some(50)), + true, + ), + ( + Interval::make(Some(0), Some(100))?, + ScalarValue::Int32(Some(150)), + false, + ), + ( + Interval::make(Some(0), Some(100))?, + ScalarValue::Float64(Some(50.)), + true, + ), + ( + Interval::make(Some(0), Some(100))?, + ScalarValue::Float64(Some(next_down(100.))), + true, + ), + ( + Interval::make(Some(0), Some(100))?, + ScalarValue::Float64(Some(next_up(100.))), + false, + ), + ]; + + for (interval, value, expected) in possible_cases { + assert_eq!(interval.contains_value(value)?, expected) + } + + Ok(()) + } + #[test] fn test_add() -> Result<()> { let cases = vec![ @@ -3208,6 +3427,53 @@ mod tests { Ok(()) } + #[test] + fn test_width_of_intervals() -> Result<()> { + let intervals = [ + ( + Interval::make(Some(0.25_f64), Some(0.50_f64))?, + ScalarValue::from(0.25_f64), + ), + ( + Interval::make(Some(0.5_f64), Some(1.0_f64))?, + ScalarValue::from(0.5_f64), + ), + ( + Interval::make(Some(1.0_f64), Some(2.0_f64))?, + ScalarValue::from(1.0_f64), + ), + ( + Interval::make(Some(32.0_f64), Some(64.0_f64))?, + ScalarValue::from(32.0_f64), + ), + ( + Interval::make(Some(-0.50_f64), Some(-0.25_f64))?, + ScalarValue::from(0.25_f64), + ), + ( + Interval::make(Some(-32.0_f64), Some(-16.0_f64))?, + ScalarValue::from(16.0_f64), + ), + ( + Interval::make(Some(-0.50_f64), Some(0.25_f64))?, + ScalarValue::from(0.75_f64), + ), + ( + Interval::make(Some(-32.0_f64), Some(16.0_f64))?, + ScalarValue::from(48.0_f64), + ), + ( + Interval::make(Some(-32_i64), Some(16_i64))?, + ScalarValue::from(48_i64), + ), + ]; + for (interval, expected) in intervals { + assert_eq!(interval.width()?, expected); + } + + Ok(()) + } + #[test] fn test_cardinality_of_intervals() -> Result<()> { // In IEEE 754 standard for floating-point arithmetic, if we keep the sign and exponent fields same, diff --git a/datafusion/expr-common/src/lib.rs b/datafusion/expr-common/src/lib.rs index fede0bb8e57e..ee40038beb21 100644 --- a/datafusion/expr-common/src/lib.rs +++ b/datafusion/expr-common/src/lib.rs @@ -38,4 +38,5 @@ pub mod interval_arithmetic; pub mod operator; pub mod signature; pub mod sort_properties; +pub mod statistics; pub mod type_coercion; diff --git a/datafusion/expr-common/src/statistics.rs b/datafusion/expr-common/src/statistics.rs new file mode 100644 index 000000000000..7e0bc88087ef --- /dev/null +++ b/datafusion/expr-common/src/statistics.rs @@ -0,0 +1,1620 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::f64::consts::LN_2; + +use crate::interval_arithmetic::{apply_operator, Interval}; +use crate::operator::Operator; +use crate::type_coercion::binary::binary_numeric_coercion; + +use arrow::array::ArrowNativeTypeOp; +use arrow::datatypes::DataType; +use datafusion_common::rounding::alter_fp_rounding_mode; +use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue}; + +/// This object defines probabilistic distributions that encode uncertain +/// information about a single, scalar value. Currently, we support five core +/// statistical distributions. New variants will be added over time. +/// +/// This object is the lowest-level object in the statistics hierarchy, and it +/// is the main unit of calculus when evaluating expressions in a statistical +/// context. Notions like column and table statistics are built on top of this +/// object and the operations it supports. +#[derive(Clone, Debug, PartialEq)] +pub enum Distribution { + Uniform(UniformDistribution), + Exponential(ExponentialDistribution), + Gaussian(GaussianDistribution), + Bernoulli(BernoulliDistribution), + Generic(GenericDistribution), +} + +use Distribution::{Bernoulli, Exponential, Gaussian, Generic, Uniform}; + +impl Distribution { + /// Constructs a new [`Uniform`] distribution from the given [`Interval`]. + pub fn new_uniform(interval: Interval) -> Result { + UniformDistribution::try_new(interval).map(Uniform) + } + + /// Constructs a new [`Exponential`] distribution from the given rate/offset + /// pair, and validates the given parameters. + pub fn new_exponential( + rate: ScalarValue, + offset: ScalarValue, + positive_tail: bool, + ) -> Result { + ExponentialDistribution::try_new(rate, offset, positive_tail).map(Exponential) + } + + /// Constructs a new [`Gaussian`] distribution from the given mean/variance + /// pair, and validates the given parameters. + pub fn new_gaussian(mean: ScalarValue, variance: ScalarValue) -> Result { + GaussianDistribution::try_new(mean, variance).map(Gaussian) + } + + /// Constructs a new [`Bernoulli`] distribution from the given success + /// probability, and validates the given parameters. + pub fn new_bernoulli(p: ScalarValue) -> Result { + BernoulliDistribution::try_new(p).map(Bernoulli) + } + + /// Constructs a new [`Generic`] distribution from the given mean, median, + /// variance, and range values after validating the given parameters. + pub fn new_generic( + mean: ScalarValue, + median: ScalarValue, + variance: ScalarValue, + range: Interval, + ) -> Result { + GenericDistribution::try_new(mean, median, variance, range).map(Generic) + } + + /// Constructs a new [`Generic`] distribution from the given range. Other + /// parameters (mean, median and variance) are initialized with null values. 
+ pub fn new_from_interval(range: Interval) -> Result { + let null = ScalarValue::try_from(range.data_type())?; + Distribution::new_generic(null.clone(), null.clone(), null, range) + } + + /// Extracts the mean value of this uncertain quantity, depending on its + /// distribution: + /// - A [`Uniform`] distribution's interval determines its mean value, which + /// is the arithmetic average of the interval endpoints. + /// - An [`Exponential`] distribution's mean is calculable by the formula + /// `offset + 1 / λ`, where `λ` is the (non-negative) rate. + /// - A [`Gaussian`] distribution contains the mean explicitly. + /// - A [`Bernoulli`] distribution's mean is equal to its success probability `p`. + /// - A [`Generic`] distribution _may_ have it explicitly, or this information + /// may be absent. + pub fn mean(&self) -> Result { + match &self { + Uniform(u) => u.mean(), + Exponential(e) => e.mean(), + Gaussian(g) => Ok(g.mean().clone()), + Bernoulli(b) => Ok(b.mean().clone()), + Generic(u) => Ok(u.mean().clone()), + } + } + + /// Extracts the median value of this uncertain quantity, depending on its + /// distribution: + /// - A [`Uniform`] distribution's interval determines its median value, which + /// is the arithmetic average of the interval endpoints. + /// - An [`Exponential`] distribution's median is calculable by the formula + /// `offset + ln(2) / λ`, where `λ` is the (non-negative) rate. + /// - A [`Gaussian`] distribution's median is equal to its mean, which is + /// specified explicitly. + /// - A [`Bernoulli`] distribution's median is `1` if `p > 0.5` and `0` + /// otherwise, where `p` is the success probability. + /// - A [`Generic`] distribution _may_ have it explicitly, or this information + /// may be absent. + pub fn median(&self) -> Result { + match &self { + Uniform(u) => u.median(), + Exponential(e) => e.median(), + Gaussian(g) => Ok(g.median().clone()), + Bernoulli(b) => b.median(), + Generic(u) => Ok(u.median().clone()), + } + } + + /// Extracts the variance value of this uncertain quantity, depending on + /// its distribution: + /// - A [`Uniform`] distribution's interval determines its variance value, which + /// is calculable by the formula `(upper - lower) ^ 2 / 12`. + /// - An [`Exponential`] distribution's variance is calculable by the formula + /// `1 / (λ ^ 2)`, where `λ` is the (non-negative) rate. + /// - A [`Gaussian`] distribution's variance is specified explicitly. + /// - A [`Bernoulli`] distribution's variance is given by the formula `p * (1 - p)` + /// where `p` is the success probability. + /// - A [`Generic`] distribution _may_ have it explicitly, or this information + /// may be absent. + pub fn variance(&self) -> Result { + match &self { + Uniform(u) => u.variance(), + Exponential(e) => e.variance(), + Gaussian(g) => Ok(g.variance.clone()), + Bernoulli(b) => b.variance(), + Generic(u) => Ok(u.variance.clone()), + } + } + + /// Extracts the range of this uncertain quantity, depending on its + /// distribution: + /// - A [`Uniform`] distribution's range is simply its interval. + /// - An [`Exponential`] distribution's range is `[offset, +∞)`. + /// - A [`Gaussian`] distribution's range is unbounded. + /// - A [`Bernoulli`] distribution's range is [`Interval::UNCERTAIN`], if + /// `p` is neither `0` nor `1`. Otherwise, it is [`Interval::CERTAINLY_FALSE`] + /// and [`Interval::CERTAINLY_TRUE`], respectively. + /// - A [`Generic`] distribution is unbounded by default, but more information + /// may be present. 
+ pub fn range(&self) -> Result { + match &self { + Uniform(u) => Ok(u.range().clone()), + Exponential(e) => e.range(), + Gaussian(g) => g.range(), + Bernoulli(b) => Ok(b.range()), + Generic(u) => Ok(u.range().clone()), + } + } + + /// Returns the data type of the statistical parameters comprising this + /// distribution. + pub fn data_type(&self) -> DataType { + match &self { + Uniform(u) => u.data_type(), + Exponential(e) => e.data_type(), + Gaussian(g) => g.data_type(), + Bernoulli(b) => b.data_type(), + Generic(u) => u.data_type(), + } + } + + pub fn target_type(args: &[&ScalarValue]) -> Result { + let mut arg_types = args + .iter() + .filter(|&&arg| (arg != &ScalarValue::Null)) + .map(|&arg| arg.data_type()); + + let Some(dt) = arg_types.next().map_or_else( + || Some(DataType::Null), + |first| { + arg_types + .try_fold(first, |target, arg| binary_numeric_coercion(&target, &arg)) + }, + ) else { + return internal_err!("Can only evaluate statistics for numeric types"); + }; + Ok(dt) + } +} + +/// Uniform distribution, represented by its range. If the given range extends +/// towards infinity, the distribution will be improper -- which is OK. For a +/// more in-depth discussion, see: +/// +/// +/// +#[derive(Clone, Debug, PartialEq)] +pub struct UniformDistribution { + interval: Interval, +} + +/// Exponential distribution with an optional shift. The probability density +/// function (PDF) is defined as follows: +/// +/// For a positive tail (when `positive_tail` is `true`): +/// +/// `f(x; λ, offset) = λ exp(-λ (x - offset)) for x ≥ offset` +/// +/// For a negative tail (when `positive_tail` is `false`): +/// +/// `f(x; λ, offset) = λ exp(-λ (offset - x)) for x ≤ offset` +/// +/// +/// In both cases, the PDF is `0` outside the specified domain. +/// +/// For more information, see: +/// +/// +#[derive(Clone, Debug, PartialEq)] +pub struct ExponentialDistribution { + rate: ScalarValue, + offset: ScalarValue, + /// Indicates whether the exponential distribution has a positive tail; + /// i.e. it extends towards positive infinity. + positive_tail: bool, +} + +/// Gaussian (normal) distribution, represented by its mean and variance. +/// For a more in-depth discussion, see: +/// +/// +#[derive(Clone, Debug, PartialEq)] +pub struct GaussianDistribution { + mean: ScalarValue, + variance: ScalarValue, +} + +/// Bernoulli distribution with success probability `p`. If `p` has a null value, +/// the success probability is unknown. For a more in-depth discussion, see: +/// +/// +#[derive(Clone, Debug, PartialEq)] +pub struct BernoulliDistribution { + p: ScalarValue, +} + +/// A generic distribution whose functional form is not available, which is +/// approximated via some summary statistics. For a more in-depth discussion, see: +/// +/// +#[derive(Clone, Debug, PartialEq)] +pub struct GenericDistribution { + mean: ScalarValue, + median: ScalarValue, + variance: ScalarValue, + range: Interval, +} + +impl UniformDistribution { + fn try_new(interval: Interval) -> Result { + if interval.data_type().eq(&DataType::Boolean) { + return internal_err!( + "Construction of a boolean `Uniform` distribution is prohibited, create a `Bernoulli` distribution instead." + ); + } + + Ok(Self { interval }) + } + + pub fn data_type(&self) -> DataType { + self.interval.data_type() + } + + /// Computes the mean value of this distribution. In case of improper + /// distributions (i.e. when the range is unbounded), the function returns + /// a `NULL` `ScalarValue`. 
+ pub fn mean(&self) -> Result { + // TODO: Should we ensure that this always returns a real number data type? + let dt = self.data_type(); + let two = ScalarValue::from(2).cast_to(&dt)?; + let result = self + .interval + .lower() + .add_checked(self.interval.upper())? + .div(two); + debug_assert!( + !self.interval.is_unbounded() || result.as_ref().is_ok_and(|r| r.is_null()) + ); + result + } + + pub fn median(&self) -> Result { + self.mean() + } + + /// Computes the variance value of this distribution. In case of improper + /// distributions (i.e. when the range is unbounded), the function returns + /// a `NULL` `ScalarValue`. + pub fn variance(&self) -> Result { + // TODO: Should we ensure that this always returns a real number data type? + let width = self.interval.width()?; + let dt = width.data_type(); + let twelve = ScalarValue::from(12).cast_to(&dt)?; + let result = width.mul_checked(&width)?.div(twelve); + debug_assert!( + !self.interval.is_unbounded() || result.as_ref().is_ok_and(|r| r.is_null()) + ); + result + } + + pub fn range(&self) -> &Interval { + &self.interval + } +} + +impl ExponentialDistribution { + fn try_new( + rate: ScalarValue, + offset: ScalarValue, + positive_tail: bool, + ) -> Result { + let dt = rate.data_type(); + if offset.data_type() != dt { + internal_err!("Rate and offset must have the same data type") + } else if offset.is_null() { + internal_err!("Offset of an `ExponentialDistribution` cannot be null") + } else if rate.is_null() { + internal_err!("Rate of an `ExponentialDistribution` cannot be null") + } else if rate.le(&ScalarValue::new_zero(&dt)?) { + internal_err!("Rate of an `ExponentialDistribution` must be positive") + } else { + Ok(Self { + rate, + offset, + positive_tail, + }) + } + } + + pub fn data_type(&self) -> DataType { + self.rate.data_type() + } + + pub fn rate(&self) -> &ScalarValue { + &self.rate + } + + pub fn offset(&self) -> &ScalarValue { + &self.offset + } + + pub fn positive_tail(&self) -> bool { + self.positive_tail + } + + pub fn mean(&self) -> Result { + // TODO: Should we ensure that this always returns a real number data type? + let one = ScalarValue::new_one(&self.data_type())?; + let tail_mean = one.div(&self.rate)?; + if self.positive_tail { + self.offset.add_checked(tail_mean) + } else { + self.offset.sub_checked(tail_mean) + } + } + + pub fn median(&self) -> Result { + // TODO: Should we ensure that this always returns a real number data type? + let ln_two = ScalarValue::from(LN_2).cast_to(&self.data_type())?; + let tail_median = ln_two.div(&self.rate)?; + if self.positive_tail { + self.offset.add_checked(tail_median) + } else { + self.offset.sub_checked(tail_median) + } + } + + pub fn variance(&self) -> Result { + // TODO: Should we ensure that this always returns a real number data type? 
+ let one = ScalarValue::new_one(&self.data_type())?; + let rate_squared = self.rate.mul_checked(&self.rate)?; + one.div(rate_squared) + } + + pub fn range(&self) -> Result { + let end = ScalarValue::try_from(&self.data_type())?; + if self.positive_tail { + Interval::try_new(self.offset.clone(), end) + } else { + Interval::try_new(end, self.offset.clone()) + } + } +} + +impl GaussianDistribution { + fn try_new(mean: ScalarValue, variance: ScalarValue) -> Result { + let dt = mean.data_type(); + if variance.data_type() != dt { + internal_err!("Mean and variance must have the same data type") + } else if variance.is_null() { + internal_err!("Variance of a `GaussianDistribution` cannot be null") + } else if variance.lt(&ScalarValue::new_zero(&dt)?) { + internal_err!("Variance of a `GaussianDistribution` must be positive") + } else { + Ok(Self { mean, variance }) + } + } + + pub fn data_type(&self) -> DataType { + self.mean.data_type() + } + + pub fn mean(&self) -> &ScalarValue { + &self.mean + } + + pub fn variance(&self) -> &ScalarValue { + &self.variance + } + + pub fn median(&self) -> &ScalarValue { + self.mean() + } + + pub fn range(&self) -> Result { + Interval::make_unbounded(&self.data_type()) + } +} + +impl BernoulliDistribution { + fn try_new(p: ScalarValue) -> Result { + if p.is_null() { + Ok(Self { p }) + } else { + let dt = p.data_type(); + let zero = ScalarValue::new_zero(&dt)?; + let one = ScalarValue::new_one(&dt)?; + if p.ge(&zero) && p.le(&one) { + Ok(Self { p }) + } else { + internal_err!( + "Success probability of a `BernoulliDistribution` must be in [0, 1]" + ) + } + } + } + + pub fn data_type(&self) -> DataType { + self.p.data_type() + } + + pub fn p_value(&self) -> &ScalarValue { + &self.p + } + + pub fn mean(&self) -> &ScalarValue { + &self.p + } + + /// Computes the median value of this distribution. In case of an unknown + /// success probability, the function returns a `NULL` `ScalarValue`. + pub fn median(&self) -> Result { + let dt = self.data_type(); + if self.p.is_null() { + ScalarValue::try_from(&dt) + } else { + let one = ScalarValue::new_one(&dt)?; + if one.sub_checked(&self.p)?.lt(&self.p) { + ScalarValue::new_one(&dt) + } else { + ScalarValue::new_zero(&dt) + } + } + } + + /// Computes the variance value of this distribution. In case of an unknown + /// success probability, the function returns a `NULL` `ScalarValue`. + pub fn variance(&self) -> Result { + let dt = self.data_type(); + let one = ScalarValue::new_one(&dt)?; + let result = one.sub_checked(&self.p)?.mul_checked(&self.p); + debug_assert!(!self.p.is_null() || result.as_ref().is_ok_and(|r| r.is_null())); + result + } + + pub fn range(&self) -> Interval { + let dt = self.data_type(); + // Unwraps are safe as the constructor guarantees that the data type + // supports zero and one values. + if ScalarValue::new_zero(&dt).unwrap().eq(&self.p) { + Interval::CERTAINLY_FALSE + } else if ScalarValue::new_one(&dt).unwrap().eq(&self.p) { + Interval::CERTAINLY_TRUE + } else { + Interval::UNCERTAIN + } + } +} + +impl GenericDistribution { + fn try_new( + mean: ScalarValue, + median: ScalarValue, + variance: ScalarValue, + range: Interval, + ) -> Result { + if range.data_type().eq(&DataType::Boolean) { + return internal_err!( + "Construction of a boolean `Generic` distribution is prohibited, create a `Bernoulli` distribution instead." + ); + } + + let validate_location = |m: &ScalarValue| -> Result { + // Checks whether the given location estimate is within the range. 
+ if m.is_null() { + Ok(true) + } else { + range.contains_value(m) + } + }; + + if !validate_location(&mean)? + || !validate_location(&median)? + || (!variance.is_null() + && variance.lt(&ScalarValue::new_zero(&variance.data_type())?)) + { + internal_err!("Tried to construct an invalid `GenericDistribution` instance") + } else { + Ok(Self { + mean, + median, + variance, + range, + }) + } + } + + pub fn data_type(&self) -> DataType { + self.mean.data_type() + } + + pub fn mean(&self) -> &ScalarValue { + &self.mean + } + + pub fn median(&self) -> &ScalarValue { + &self.median + } + + pub fn variance(&self) -> &ScalarValue { + &self.variance + } + + pub fn range(&self) -> &Interval { + &self.range + } +} + +/// This function takes a logical operator and two Bernoulli distributions, +/// and it returns a new Bernoulli distribution that represents the result of +/// the operation. Currently, only `AND` and `OR` operations are supported. +pub fn combine_bernoullis( + op: &Operator, + left: &BernoulliDistribution, + right: &BernoulliDistribution, +) -> Result { + let left_p = left.p_value(); + let right_p = right.p_value(); + match op { + Operator::And => match (left_p.is_null(), right_p.is_null()) { + (false, false) => { + BernoulliDistribution::try_new(left_p.mul_checked(right_p)?) + } + (false, true) if left_p.eq(&ScalarValue::new_zero(&left_p.data_type())?) => { + Ok(left.clone()) + } + (true, false) + if right_p.eq(&ScalarValue::new_zero(&right_p.data_type())?) => + { + Ok(right.clone()) + } + _ => { + let dt = Distribution::target_type(&[left_p, right_p])?; + BernoulliDistribution::try_new(ScalarValue::try_from(&dt)?) + } + }, + Operator::Or => match (left_p.is_null(), right_p.is_null()) { + (false, false) => { + let sum = left_p.add_checked(right_p)?; + let product = left_p.mul_checked(right_p)?; + let or_success = sum.sub_checked(product)?; + BernoulliDistribution::try_new(or_success) + } + (false, true) if left_p.eq(&ScalarValue::new_one(&left_p.data_type())?) => { + Ok(left.clone()) + } + (true, false) if right_p.eq(&ScalarValue::new_one(&right_p.data_type())?) => { + Ok(right.clone()) + } + _ => { + let dt = Distribution::target_type(&[left_p, right_p])?; + BernoulliDistribution::try_new(ScalarValue::try_from(&dt)?) + } + }, + _ => { + not_impl_err!("Statistical evaluation only supports AND and OR operators") + } + } +} + +/// Applies the given operation to the given Gaussian distributions. Currently, +/// this function handles only addition and subtraction operations. If the +/// result is not a Gaussian random variable, it returns `None`. For details, +/// see: +/// +/// +pub fn combine_gaussians( + op: &Operator, + left: &GaussianDistribution, + right: &GaussianDistribution, +) -> Result> { + match op { + Operator::Plus => GaussianDistribution::try_new( + left.mean().add_checked(right.mean())?, + left.variance().add_checked(right.variance())?, + ) + .map(Some), + Operator::Minus => GaussianDistribution::try_new( + left.mean().sub_checked(right.mean())?, + left.variance().add_checked(right.variance())?, + ) + .map(Some), + _ => Ok(None), + } +} + +/// Creates a new `Bernoulli` distribution by computing the resulting probability. +/// Expects `op` to be a comparison operator, with `left` and `right` having +/// numeric distributions. The resulting distribution has the `Float64` data +/// type. 
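+///
+/// # Example
+///
+/// A rough sketch of the cardinality-based equality probability (not compiled
+/// as a doc-test); all names below come from this module:
+///
+/// ```ignore
+/// // Integer ranges [0, 7] and [4, 11] intersect over [4, 7], so the
+/// // probability of equality is 4 / (8 * 8) = 1 / 16.
+/// let left = Distribution::new_uniform(Interval::make(Some(0), Some(7))?)?;
+/// let right = Distribution::new_uniform(Interval::make(Some(4), Some(11))?)?;
+/// let result = create_bernoulli_from_comparison(&Operator::Eq, &left, &right)?;
+/// assert_eq!(result, Distribution::new_bernoulli(ScalarValue::from(1.0 / 16.0))?);
+/// ```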
+pub fn create_bernoulli_from_comparison( + op: &Operator, + left: &Distribution, + right: &Distribution, +) -> Result { + match (left, right) { + (Uniform(left), Uniform(right)) => { + match op { + Operator::Eq | Operator::NotEq => { + let (li, ri) = (left.range(), right.range()); + if let Some(intersection) = li.intersect(ri)? { + // If the ranges are not disjoint, calculate the probability + // of equality using cardinalities: + if let (Some(lc), Some(rc), Some(ic)) = ( + li.cardinality(), + ri.cardinality(), + intersection.cardinality(), + ) { + // Avoid overflow by widening the type temporarily: + let pairs = ((lc as u128) * (rc as u128)) as f64; + let p = (ic as f64).div_checked(pairs)?; + // Alternative approach that may be more stable: + // let p = (ic as f64) + // .div_checked(lc as f64)? + // .div_checked(rc as f64)?; + + let mut p_value = ScalarValue::from(p); + if op == &Operator::NotEq { + let one = ScalarValue::from(1.0); + p_value = alter_fp_rounding_mode::( + &one, + &p_value, + |lhs, rhs| lhs.sub_checked(rhs), + )?; + }; + return Distribution::new_bernoulli(p_value); + } + } else if op == &Operator::Eq { + // If the ranges are disjoint, probability of equality is 0. + return Distribution::new_bernoulli(ScalarValue::from(0.0)); + } else { + // If the ranges are disjoint, probability of not-equality is 1. + return Distribution::new_bernoulli(ScalarValue::from(1.0)); + } + } + Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => { + // TODO: We can handle inequality operators and calculate a + // `p` value instead of falling back to an unknown Bernoulli + // distribution. Note that the strict and non-strict inequalities + // may require slightly different logic in case of real vs. + // integral data types. + } + _ => {} + } + } + (Gaussian(_), Gaussian(_)) => { + // TODO: We can handle Gaussian comparisons and calculate a `p` value + // instead of falling back to an unknown Bernoulli distribution. + } + _ => {} + } + let (li, ri) = (left.range()?, right.range()?); + let range_evaluation = apply_operator(op, &li, &ri)?; + if range_evaluation.eq(&Interval::CERTAINLY_FALSE) { + Distribution::new_bernoulli(ScalarValue::from(0.0)) + } else if range_evaluation.eq(&Interval::CERTAINLY_TRUE) { + Distribution::new_bernoulli(ScalarValue::from(1.0)) + } else if range_evaluation.eq(&Interval::UNCERTAIN) { + Distribution::new_bernoulli(ScalarValue::try_from(&DataType::Float64)?) + } else { + internal_err!("This function must be called with a comparison operator") + } +} + +/// Creates a new [`Generic`] distribution that represents the result of the +/// given binary operation on two unknown quantities represented by their +/// [`Distribution`] objects. The function computes the mean, median and +/// variance if possible. +pub fn new_generic_from_binary_op( + op: &Operator, + left: &Distribution, + right: &Distribution, +) -> Result { + Distribution::new_generic( + compute_mean(op, left, right)?, + compute_median(op, left, right)?, + compute_variance(op, left, right)?, + apply_operator(op, &left.range()?, &right.range()?)?, + ) +} + +/// Computes the mean value for the result of the given binary operation on +/// two unknown quantities represented by their [`Distribution`] objects. 
+pub fn compute_mean( + op: &Operator, + left: &Distribution, + right: &Distribution, +) -> Result { + let (left_mean, right_mean) = (left.mean()?, right.mean()?); + + match op { + Operator::Plus => return left_mean.add_checked(right_mean), + Operator::Minus => return left_mean.sub_checked(right_mean), + // Note the independence assumption below: + Operator::Multiply => return left_mean.mul_checked(right_mean), + // TODO: We can calculate the mean for division when we support reciprocals, + // or know the distributions of the operands. For details, see: + // + // + // + // + // Fall back to an unknown mean value for division: + Operator::Divide => {} + // Fall back to an unknown mean value for other cases: + _ => {} + } + let target_type = Distribution::target_type(&[&left_mean, &right_mean])?; + ScalarValue::try_from(target_type) +} + +/// Computes the median value for the result of the given binary operation on +/// two unknown quantities represented by its [`Distribution`] objects. Currently, +/// the median is calculable only for addition and subtraction operations on: +/// - [`Uniform`] and [`Uniform`] distributions, and +/// - [`Gaussian`] and [`Gaussian`] distributions. +pub fn compute_median( + op: &Operator, + left: &Distribution, + right: &Distribution, +) -> Result { + match (left, right) { + (Uniform(lu), Uniform(ru)) => { + let (left_median, right_median) = (lu.median()?, ru.median()?); + // Under the independence assumption, the result is a symmetric + // triangular distribution, so we can simply add/subtract the + // median values: + match op { + Operator::Plus => return left_median.add_checked(right_median), + Operator::Minus => return left_median.sub_checked(right_median), + // Fall back to an unknown median value for other cases: + _ => {} + } + } + // Under the independence assumption, the result is another Gaussian + // distribution, so we can simply add/subtract the median values: + (Gaussian(lg), Gaussian(rg)) => match op { + Operator::Plus => return lg.mean().add_checked(rg.mean()), + Operator::Minus => return lg.mean().sub_checked(rg.mean()), + // Fall back to an unknown median value for other cases: + _ => {} + }, + // Fall back to an unknown median value for other cases: + _ => {} + } + + let (left_median, right_median) = (left.median()?, right.median()?); + let target_type = Distribution::target_type(&[&left_median, &right_median])?; + ScalarValue::try_from(target_type) +} + +/// Computes the variance value for the result of the given binary operation on +/// two unknown quantities represented by their [`Distribution`] objects. 
+pub fn compute_variance( + op: &Operator, + left: &Distribution, + right: &Distribution, +) -> Result { + let (left_variance, right_variance) = (left.variance()?, right.variance()?); + + match op { + // Note the independence assumption below: + Operator::Plus => return left_variance.add_checked(right_variance), + // Note the independence assumption below: + Operator::Minus => return left_variance.add_checked(right_variance), + // Note the independence assumption below: + Operator::Multiply => { + // For more details, along with an explanation of the formula below, see: + // + // + let (left_mean, right_mean) = (left.mean()?, right.mean()?); + let left_mean_sq = left_mean.mul_checked(&left_mean)?; + let right_mean_sq = right_mean.mul_checked(&right_mean)?; + let left_sos = left_variance.add_checked(&left_mean_sq)?; + let right_sos = right_variance.add_checked(&right_mean_sq)?; + let pos = left_mean_sq.mul_checked(right_mean_sq)?; + return left_sos.mul_checked(right_sos)?.sub_checked(pos); + } + // TODO: We can calculate the variance for division when we support reciprocals, + // or know the distributions of the operands. For details, see: + // + // + // + // + // Fall back to an unknown variance value for division: + Operator::Divide => {} + // Fall back to an unknown variance value for other cases: + _ => {} + } + let target_type = Distribution::target_type(&[&left_variance, &right_variance])?; + ScalarValue::try_from(target_type) +} + +#[cfg(test)] +mod tests { + use super::{ + combine_bernoullis, combine_gaussians, compute_mean, compute_median, + compute_variance, create_bernoulli_from_comparison, new_generic_from_binary_op, + BernoulliDistribution, Distribution, GaussianDistribution, UniformDistribution, + }; + use crate::interval_arithmetic::{apply_operator, Interval}; + use crate::operator::Operator; + + use arrow::datatypes::DataType; + use datafusion_common::{HashSet, Result, ScalarValue}; + + #[test] + fn uniform_dist_is_valid_test() -> Result<()> { + assert_eq!( + Distribution::new_uniform(Interval::make_zero(&DataType::Int8)?)?, + Distribution::Uniform(UniformDistribution { + interval: Interval::make_zero(&DataType::Int8)?, + }) + ); + + assert!(Distribution::new_uniform(Interval::UNCERTAIN).is_err()); + Ok(()) + } + + #[test] + fn exponential_dist_is_valid_test() { + // This array collects test cases of the form (distribution, validity). + let exponentials = vec![ + ( + Distribution::new_exponential(ScalarValue::Null, ScalarValue::Null, true), + false, + ), + ( + Distribution::new_exponential( + ScalarValue::from(0_f32), + ScalarValue::from(1_f32), + true, + ), + false, + ), + ( + Distribution::new_exponential( + ScalarValue::from(100_f32), + ScalarValue::from(1_f32), + true, + ), + true, + ), + ( + Distribution::new_exponential( + ScalarValue::from(-100_f32), + ScalarValue::from(1_f32), + true, + ), + false, + ), + ]; + for case in exponentials { + assert_eq!(case.0.is_ok(), case.1); + } + } + + #[test] + fn gaussian_dist_is_valid_test() { + // This array collects test cases of the form (distribution, validity). 
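+ // A valid Gaussian requires a non-null, non-negative variance whose data
+ // type matches that of the mean.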
+ let gaussians = vec![ + ( + Distribution::new_gaussian(ScalarValue::Null, ScalarValue::Null), + false, + ), + ( + Distribution::new_gaussian( + ScalarValue::from(0_f32), + ScalarValue::from(0_f32), + ), + true, + ), + ( + Distribution::new_gaussian( + ScalarValue::from(0_f32), + ScalarValue::from(0.5_f32), + ), + true, + ), + ( + Distribution::new_gaussian( + ScalarValue::from(0_f32), + ScalarValue::from(-0.5_f32), + ), + false, + ), + ]; + for case in gaussians { + assert_eq!(case.0.is_ok(), case.1); + } + } + + #[test] + fn bernoulli_dist_is_valid_test() { + // This array collects test cases of the form (distribution, validity). + let bernoullis = vec![ + (Distribution::new_bernoulli(ScalarValue::Null), true), + (Distribution::new_bernoulli(ScalarValue::from(0.)), true), + (Distribution::new_bernoulli(ScalarValue::from(0.25)), true), + (Distribution::new_bernoulli(ScalarValue::from(1.)), true), + (Distribution::new_bernoulli(ScalarValue::from(11.)), false), + (Distribution::new_bernoulli(ScalarValue::from(-11.)), false), + (Distribution::new_bernoulli(ScalarValue::from(0_i64)), true), + (Distribution::new_bernoulli(ScalarValue::from(1_i64)), true), + ( + Distribution::new_bernoulli(ScalarValue::from(11_i64)), + false, + ), + ( + Distribution::new_bernoulli(ScalarValue::from(-11_i64)), + false, + ), + ]; + for case in bernoullis { + assert_eq!(case.0.is_ok(), case.1); + } + } + + #[test] + fn generic_dist_is_valid_test() -> Result<()> { + // This array collects test cases of the form (distribution, validity). + let generic_dists = vec![ + // Using a boolean range to construct a Generic distribution is prohibited. + ( + Distribution::new_generic( + ScalarValue::Null, + ScalarValue::Null, + ScalarValue::Null, + Interval::UNCERTAIN, + ), + false, + ), + ( + Distribution::new_generic( + ScalarValue::Null, + ScalarValue::Null, + ScalarValue::Null, + Interval::make_zero(&DataType::Float32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(0_f32), + ScalarValue::Float32(None), + ScalarValue::Float32(None), + Interval::make_zero(&DataType::Float32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::Float64(None), + ScalarValue::from(0.), + ScalarValue::Float64(None), + Interval::make_zero(&DataType::Float32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(-10_f32), + ScalarValue::Float32(None), + ScalarValue::Float32(None), + Interval::make_zero(&DataType::Float32)?, + ), + false, + ), + ( + Distribution::new_generic( + ScalarValue::Float32(None), + ScalarValue::from(10_f32), + ScalarValue::Float32(None), + Interval::make_zero(&DataType::Float32)?, + ), + false, + ), + ( + Distribution::new_generic( + ScalarValue::Null, + ScalarValue::Null, + ScalarValue::Null, + Interval::make_zero(&DataType::Float32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(0), + ScalarValue::from(0), + ScalarValue::Int32(None), + Interval::make_zero(&DataType::Int32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(0_f32), + ScalarValue::from(0_f32), + ScalarValue::Float32(None), + Interval::make_zero(&DataType::Float32)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(50.), + ScalarValue::from(50.), + ScalarValue::Float64(None), + Interval::make(Some(0.), Some(100.))?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::from(50.), + ScalarValue::from(50.), + ScalarValue::Float64(None), + Interval::make(Some(-100.), Some(0.))?, + ), + false, + ), + ( + 
Distribution::new_generic( + ScalarValue::Float64(None), + ScalarValue::Float64(None), + ScalarValue::from(1.), + Interval::make_zero(&DataType::Float64)?, + ), + true, + ), + ( + Distribution::new_generic( + ScalarValue::Float64(None), + ScalarValue::Float64(None), + ScalarValue::from(-1.), + Interval::make_zero(&DataType::Float64)?, + ), + false, + ), + ]; + for case in generic_dists { + assert_eq!(case.0.is_ok(), case.1, "{:?}", case.0); + } + + Ok(()) + } + + #[test] + fn mean_extraction_test() -> Result<()> { + // This array collects test cases of the form (distribution, mean value). + let dists = vec![ + ( + Distribution::new_uniform(Interval::make_zero(&DataType::Int64)?), + ScalarValue::from(0_i64), + ), + ( + Distribution::new_uniform(Interval::make_zero(&DataType::Float64)?), + ScalarValue::from(0.), + ), + ( + Distribution::new_uniform(Interval::make(Some(1), Some(100))?), + ScalarValue::from(50), + ), + ( + Distribution::new_uniform(Interval::make(Some(-100), Some(-1))?), + ScalarValue::from(-50), + ), + ( + Distribution::new_uniform(Interval::make(Some(-100), Some(100))?), + ScalarValue::from(0), + ), + ( + Distribution::new_exponential( + ScalarValue::from(2.), + ScalarValue::from(0.), + true, + ), + ScalarValue::from(0.5), + ), + ( + Distribution::new_exponential( + ScalarValue::from(2.), + ScalarValue::from(1.), + true, + ), + ScalarValue::from(1.5), + ), + ( + Distribution::new_gaussian(ScalarValue::from(0.), ScalarValue::from(1.)), + ScalarValue::from(0.), + ), + ( + Distribution::new_gaussian( + ScalarValue::from(-2.), + ScalarValue::from(0.5), + ), + ScalarValue::from(-2.), + ), + ( + Distribution::new_bernoulli(ScalarValue::from(0.5)), + ScalarValue::from(0.5), + ), + ( + Distribution::new_generic( + ScalarValue::from(42.), + ScalarValue::from(42.), + ScalarValue::Float64(None), + Interval::make(Some(25.), Some(50.))?, + ), + ScalarValue::from(42.), + ), + ]; + + for case in dists { + assert_eq!(case.0?.mean()?, case.1); + } + + Ok(()) + } + + #[test] + fn median_extraction_test() -> Result<()> { + // This array collects test cases of the form (distribution, median value). + let dists = vec![ + ( + Distribution::new_uniform(Interval::make_zero(&DataType::Int64)?), + ScalarValue::from(0_i64), + ), + ( + Distribution::new_uniform(Interval::make(Some(25.), Some(75.))?), + ScalarValue::from(50.), + ), + ( + Distribution::new_exponential( + ScalarValue::from(2_f64.ln()), + ScalarValue::from(0.), + true, + ), + ScalarValue::from(1.), + ), + ( + Distribution::new_gaussian(ScalarValue::from(2.), ScalarValue::from(1.)), + ScalarValue::from(2.), + ), + ( + Distribution::new_bernoulli(ScalarValue::from(0.25)), + ScalarValue::from(0.), + ), + ( + Distribution::new_bernoulli(ScalarValue::from(0.75)), + ScalarValue::from(1.), + ), + ( + Distribution::new_gaussian(ScalarValue::from(2.), ScalarValue::from(1.)), + ScalarValue::from(2.), + ), + ( + Distribution::new_generic( + ScalarValue::from(12.), + ScalarValue::from(12.), + ScalarValue::Float64(None), + Interval::make(Some(0.), Some(25.))?, + ), + ScalarValue::from(12.), + ), + ]; + + for case in dists { + assert_eq!(case.0?.median()?, case.1); + } + + Ok(()) + } + + #[test] + fn variance_extraction_test() -> Result<()> { + // This array collects test cases of the form (distribution, variance value). 
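+ // For instance, `Uniform[0, 12]` has variance `12^2 / 12 = 12`, and an
+ // exponential with rate `10` has variance `1 / 10^2 = 0.01`.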
+ let dists = vec![ + ( + Distribution::new_uniform(Interval::make(Some(0.), Some(12.))?), + ScalarValue::from(12.), + ), + ( + Distribution::new_exponential( + ScalarValue::from(10.), + ScalarValue::from(0.), + true, + ), + ScalarValue::from(0.01), + ), + ( + Distribution::new_gaussian(ScalarValue::from(0.), ScalarValue::from(1.)), + ScalarValue::from(1.), + ), + ( + Distribution::new_bernoulli(ScalarValue::from(0.5)), + ScalarValue::from(0.25), + ), + ( + Distribution::new_generic( + ScalarValue::Float64(None), + ScalarValue::Float64(None), + ScalarValue::from(0.02), + Interval::make_zero(&DataType::Float64)?, + ), + ScalarValue::from(0.02), + ), + ]; + + for case in dists { + assert_eq!(case.0?.variance()?, case.1); + } + + Ok(()) + } + + #[test] + fn test_calculate_generic_properties_gauss_gauss() -> Result<()> { + let dist_a = + Distribution::new_gaussian(ScalarValue::from(10.), ScalarValue::from(0.0))?; + let dist_b = + Distribution::new_gaussian(ScalarValue::from(20.), ScalarValue::from(0.0))?; + + let test_data = vec![ + // Mean: + ( + compute_mean(&Operator::Plus, &dist_a, &dist_b)?, + ScalarValue::from(30.), + ), + ( + compute_mean(&Operator::Minus, &dist_a, &dist_b)?, + ScalarValue::from(-10.), + ), + // Median: + ( + compute_median(&Operator::Plus, &dist_a, &dist_b)?, + ScalarValue::from(30.), + ), + ( + compute_median(&Operator::Minus, &dist_a, &dist_b)?, + ScalarValue::from(-10.), + ), + ]; + for (actual, expected) in test_data { + assert_eq!(actual, expected); + } + + Ok(()) + } + + #[test] + fn test_combine_bernoullis_and_op() -> Result<()> { + let op = Operator::And; + let left = BernoulliDistribution::try_new(ScalarValue::from(0.5))?; + let right = BernoulliDistribution::try_new(ScalarValue::from(0.4))?; + let left_null = BernoulliDistribution::try_new(ScalarValue::Null)?; + let right_null = BernoulliDistribution::try_new(ScalarValue::Null)?; + + assert_eq!( + combine_bernoullis(&op, &left, &right)?.p_value(), + &ScalarValue::from(0.5 * 0.4) + ); + assert_eq!( + combine_bernoullis(&op, &left_null, &right)?.p_value(), + &ScalarValue::Float64(None) + ); + assert_eq!( + combine_bernoullis(&op, &left, &right_null)?.p_value(), + &ScalarValue::Float64(None) + ); + assert_eq!( + combine_bernoullis(&op, &left_null, &left_null)?.p_value(), + &ScalarValue::Null + ); + + Ok(()) + } + + #[test] + fn test_combine_bernoullis_or_op() -> Result<()> { + let op = Operator::Or; + let left = BernoulliDistribution::try_new(ScalarValue::from(0.6))?; + let right = BernoulliDistribution::try_new(ScalarValue::from(0.4))?; + let left_null = BernoulliDistribution::try_new(ScalarValue::Null)?; + let right_null = BernoulliDistribution::try_new(ScalarValue::Null)?; + + assert_eq!( + combine_bernoullis(&op, &left, &right)?.p_value(), + &ScalarValue::from(0.6 + 0.4 - (0.6 * 0.4)) + ); + assert_eq!( + combine_bernoullis(&op, &left_null, &right)?.p_value(), + &ScalarValue::Float64(None) + ); + assert_eq!( + combine_bernoullis(&op, &left, &right_null)?.p_value(), + &ScalarValue::Float64(None) + ); + assert_eq!( + combine_bernoullis(&op, &left_null, &left_null)?.p_value(), + &ScalarValue::Null + ); + + Ok(()) + } + + #[test] + fn test_combine_bernoullis_unsupported_ops() -> Result<()> { + let mut operator_set = operator_set(); + operator_set.remove(&Operator::And); + operator_set.remove(&Operator::Or); + + let left = BernoulliDistribution::try_new(ScalarValue::from(0.6))?; + let right = BernoulliDistribution::try_new(ScalarValue::from(0.4))?; + for op in operator_set { + assert!( + combine_bernoullis(&op, 
&left, &right).is_err(), + "Operator {op} should not be supported for Bernoulli distributions" + ); + } + + Ok(()) + } + + #[test] + fn test_combine_gaussians_addition() -> Result<()> { + let op = Operator::Plus; + let left = GaussianDistribution::try_new( + ScalarValue::from(3.0), + ScalarValue::from(2.0), + )?; + let right = GaussianDistribution::try_new( + ScalarValue::from(4.0), + ScalarValue::from(1.0), + )?; + + let result = combine_gaussians(&op, &left, &right)?.unwrap(); + + assert_eq!(result.mean(), &ScalarValue::from(7.0)); // 3.0 + 4.0 + assert_eq!(result.variance(), &ScalarValue::from(3.0)); // 2.0 + 1.0 + Ok(()) + } + + #[test] + fn test_combine_gaussians_subtraction() -> Result<()> { + let op = Operator::Minus; + let left = GaussianDistribution::try_new( + ScalarValue::from(7.0), + ScalarValue::from(2.0), + )?; + let right = GaussianDistribution::try_new( + ScalarValue::from(4.0), + ScalarValue::from(1.0), + )?; + + let result = combine_gaussians(&op, &left, &right)?.unwrap(); + + assert_eq!(result.mean(), &ScalarValue::from(3.0)); // 7.0 - 4.0 + assert_eq!(result.variance(), &ScalarValue::from(3.0)); // 2.0 + 1.0 + + Ok(()) + } + + #[test] + fn test_combine_gaussians_unsupported_ops() -> Result<()> { + let mut operator_set = operator_set(); + operator_set.remove(&Operator::Plus); + operator_set.remove(&Operator::Minus); + + let left = GaussianDistribution::try_new( + ScalarValue::from(7.0), + ScalarValue::from(2.0), + )?; + let right = GaussianDistribution::try_new( + ScalarValue::from(4.0), + ScalarValue::from(1.0), + )?; + for op in operator_set { + assert!( + combine_gaussians(&op, &left, &right)?.is_none(), + "Operator {op} should not be supported for Gaussian distributions" + ); + } + + Ok(()) + } + + // Expected test results were calculated in Wolfram Mathematica, by using: + // + // *METHOD_NAME*[TransformedDistribution[ + // x *op* y, + // {x ~ *DISTRIBUTION_X*[..], y ~ *DISTRIBUTION_Y*[..]} + // ]] + #[test] + fn test_calculate_generic_properties_uniform_uniform() -> Result<()> { + let dist_a = Distribution::new_uniform(Interval::make(Some(0.), Some(12.))?)?; + let dist_b = Distribution::new_uniform(Interval::make(Some(12.), Some(36.))?)?; + + let test_data = vec![ + // Mean: + ( + compute_mean(&Operator::Plus, &dist_a, &dist_b)?, + ScalarValue::from(30.), + ), + ( + compute_mean(&Operator::Minus, &dist_a, &dist_b)?, + ScalarValue::from(-18.), + ), + ( + compute_mean(&Operator::Multiply, &dist_a, &dist_b)?, + ScalarValue::from(144.), + ), + // Median: + ( + compute_median(&Operator::Plus, &dist_a, &dist_b)?, + ScalarValue::from(30.), + ), + ( + compute_median(&Operator::Minus, &dist_a, &dist_b)?, + ScalarValue::from(-18.), + ), + // Variance: + ( + compute_variance(&Operator::Plus, &dist_a, &dist_b)?, + ScalarValue::from(60.), + ), + ( + compute_variance(&Operator::Minus, &dist_a, &dist_b)?, + ScalarValue::from(60.), + ), + ( + compute_variance(&Operator::Multiply, &dist_a, &dist_b)?, + ScalarValue::from(9216.), + ), + ]; + for (actual, expected) in test_data { + assert_eq!(actual, expected); + } + + Ok(()) + } + + /// Test for `Uniform`-`Uniform`, `Uniform`-`Generic`, `Generic`-`Uniform`, + /// `Generic`-`Generic` pairs, where range is always present. 
+ #[test] + fn test_compute_range_where_present() -> Result<()> { + let a = &Interval::make(Some(0.), Some(12.0))?; + let b = &Interval::make(Some(0.), Some(12.0))?; + let mean = ScalarValue::from(6.0); + for (dist_a, dist_b) in [ + ( + Distribution::new_uniform(a.clone())?, + Distribution::new_uniform(b.clone())?, + ), + ( + Distribution::new_generic( + mean.clone(), + mean.clone(), + ScalarValue::Float64(None), + a.clone(), + )?, + Distribution::new_uniform(b.clone())?, + ), + ( + Distribution::new_uniform(a.clone())?, + Distribution::new_generic( + mean.clone(), + mean.clone(), + ScalarValue::Float64(None), + b.clone(), + )?, + ), + ( + Distribution::new_generic( + mean.clone(), + mean.clone(), + ScalarValue::Float64(None), + a.clone(), + )?, + Distribution::new_generic( + mean.clone(), + mean.clone(), + ScalarValue::Float64(None), + b.clone(), + )?, + ), + ] { + use super::Operator::{ + Divide, Eq, Gt, GtEq, Lt, LtEq, Minus, Multiply, NotEq, Plus, + }; + for op in [Plus, Minus, Multiply, Divide] { + assert_eq!( + new_generic_from_binary_op(&op, &dist_a, &dist_b)?.range()?, + apply_operator(&op, a, b)?, + "Failed for {:?} {op} {:?}", + dist_a, + dist_b + ); + } + for op in [Gt, GtEq, Lt, LtEq, Eq, NotEq] { + assert_eq!( + create_bernoulli_from_comparison(&op, &dist_a, &dist_b)?.range()?, + apply_operator(&op, a, b)?, + "Failed for {:?} {op} {:?}", + dist_a, + dist_b + ); + } + } + + Ok(()) + } + + fn operator_set() -> HashSet { + use super::Operator::*; + + let all_ops = vec![ + And, + Or, + Eq, + NotEq, + Gt, + GtEq, + Lt, + LtEq, + Plus, + Minus, + Multiply, + Divide, + Modulo, + IsDistinctFrom, + IsNotDistinctFrom, + RegexMatch, + RegexIMatch, + RegexNotMatch, + RegexNotIMatch, + LikeMatch, + ILikeMatch, + NotLikeMatch, + NotILikeMatch, + BitwiseAnd, + BitwiseOr, + BitwiseXor, + BitwiseShiftRight, + BitwiseShiftLeft, + StringConcat, + AtArrow, + ArrowAt, + ]; + + all_ops.into_iter().collect() + } +} diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index d2ea6e809150..02684928bac7 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -51,7 +51,6 @@ pub mod function; pub mod groups_accumulator { pub use datafusion_expr_common::groups_accumulator::*; } - pub mod interval_arithmetic { pub use datafusion_expr_common::interval_arithmetic::*; } @@ -62,6 +61,9 @@ pub mod simplify; pub mod sort_properties { pub use datafusion_expr_common::sort_properties::*; } +pub mod statistics { + pub use datafusion_expr_common::statistics::*; +} pub mod test; pub mod tree_node; pub mod type_coercion; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index b1b889136b35..cc2ff2f24790 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -26,10 +26,13 @@ use arrow::array::BooleanArray; use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; -use datafusion_common::{internal_err, not_impl_err, Result}; +use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; use datafusion_expr_common::sort_properties::ExprProperties; +use datafusion_expr_common::statistics::Distribution; + +use itertools::izip; /// Shared [`PhysicalExpr`]. 
pub type PhysicalExprRef = Arc; @@ -98,11 +101,16 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { /// Computes the output interval for the expression, given the input /// intervals. /// - /// # Arguments + /// # Parameters /// /// * `children` are the intervals for the children (inputs) of this /// expression. /// + /// # Returns + /// + /// A `Result` containing the output interval for the expression in + /// case of success, or an error object in case of failure. + /// /// # Example /// /// If the expression is `a + b`, and the input intervals are `a: [1, 2]` @@ -116,19 +124,20 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { /// /// This is used to propagate constraints down through an expression tree. /// - /// # Arguments + /// # Parameters /// /// * `interval` is the currently known interval for this expression. /// * `children` are the current intervals for the children of this expression. /// /// # Returns /// - /// A `Vec` of new intervals for the children, in order. + /// A `Result` containing a `Vec` of new intervals for the children (in order) + /// in case of success, or an error object in case of failure. /// /// If constraint propagation reveals an infeasibility for any child, returns - /// [`None`]. If none of the children intervals change as a result of propagation, - /// may return an empty vector instead of cloning `children`. This is the default - /// (and conservative) return value. + /// [`None`]. If none of the children intervals change as a result of + /// propagation, may return an empty vector instead of cloning `children`. + /// This is the default (and conservative) return value. /// /// # Example /// @@ -144,6 +153,111 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { Ok(Some(vec![])) } + /// Computes the output statistics for the expression, given the input + /// statistics. + /// + /// # Parameters + /// + /// * `children` are the statistics for the children (inputs) of this + /// expression. + /// + /// # Returns + /// + /// A `Result` containing the output statistics for the expression in + /// case of success, or an error object in case of failure. + /// + /// Expressions (should) implement this function and utilize the independence + /// assumption, match on children distribution types and compute the output + /// statistics accordingly. The default implementation simply creates an + /// unknown output distribution by combining input ranges. This logic loses + /// distribution information, but is a safe default. + fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { + let children_ranges = children + .iter() + .map(|c| c.range()) + .collect::>>()?; + let children_ranges_refs = children_ranges.iter().collect::>(); + let output_interval = self.evaluate_bounds(children_ranges_refs.as_slice())?; + let dt = output_interval.data_type(); + if dt.eq(&DataType::Boolean) { + let p = if output_interval.eq(&Interval::CERTAINLY_TRUE) { + ScalarValue::new_one(&dt) + } else if output_interval.eq(&Interval::CERTAINLY_FALSE) { + ScalarValue::new_zero(&dt) + } else { + ScalarValue::try_from(&dt) + }?; + Distribution::new_bernoulli(p) + } else { + Distribution::new_from_interval(output_interval) + } + } + + /// Updates children statistics using the given parent statistic for this + /// expression. + /// + /// This is used to propagate statistics down through an expression tree. 
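+ /// For example, if a comparison such as `a > b` is known to evaluate to
+ /// `true` (i.e. the parent is a `Bernoulli` distribution with `p = 1`),
+ /// the ranges of `a` and `b` can often be narrowed accordingly.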
+ /// + /// # Parameters + /// + /// * `parent` is the currently known statistics for this expression. + /// * `children` are the current statistics for the children of this expression. + /// + /// # Returns + /// + /// A `Result` containing a `Vec` of new statistics for the children (in order) + /// in case of success, or an error object in case of failure. + /// + /// If statistics propagation reveals an infeasibility for any child, returns + /// [`None`]. If none of the children statistics change as a result of + /// propagation, may return an empty vector instead of cloning `children`. + /// This is the default (and conservative) return value. + /// + /// Expressions (should) implement this function and apply Bayes rule to + /// reconcile and update parent/children statistics. This involves utilizing + /// the independence assumption, and matching on distribution types. The + /// default implementation simply creates an unknown distribution if it can + /// narrow the range by propagating ranges. This logic loses distribution + /// information, but is a safe default. + fn propagate_statistics( + &self, + parent: &Distribution, + children: &[&Distribution], + ) -> Result>> { + let children_ranges = children + .iter() + .map(|c| c.range()) + .collect::>>()?; + let children_ranges_refs = children_ranges.iter().collect::>(); + let parent_range = parent.range()?; + let Some(propagated_children) = + self.propagate_constraints(&parent_range, children_ranges_refs.as_slice())? + else { + return Ok(None); + }; + izip!(propagated_children.into_iter(), children_ranges, children) + .map(|(new_interval, old_interval, child)| { + if new_interval == old_interval { + // We weren't able to narrow the range, preserve the old statistics. + Ok((*child).clone()) + } else if new_interval.data_type().eq(&DataType::Boolean) { + let dt = old_interval.data_type(); + let p = if new_interval.eq(&Interval::CERTAINLY_TRUE) { + ScalarValue::new_one(&dt) + } else if new_interval.eq(&Interval::CERTAINLY_FALSE) { + ScalarValue::new_zero(&dt) + } else { + unreachable!("Given that we have a range reduction for a boolean interval, we should have certainty") + }?; + Distribution::new_bernoulli(p) + } else { + Distribution::new_from_interval(new_interval) + } + }) + .collect::>() + .map(Some) + } + /// Calculates the properties of this [`PhysicalExpr`] based on its /// children's properties (i.e. order and range), recursively aggregating /// the information from its children. In cases where the [`PhysicalExpr`] @@ -155,7 +269,7 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { } /// [`PhysicalExpr`] can't be constrained by [`Eq`] directly because it must remain object -/// safe. To ease implementation blanket implementation is provided for [`Eq`] types. +/// safe. To ease implementation, blanket implementation is provided for [`Eq`] types. 
pub trait DynEq { fn dyn_eq(&self, other: &dyn Any) -> bool; } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 052054bad6c1..1f16c5471ed7 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -20,6 +20,7 @@ mod kernels; use std::hash::Hash; use std::{any::Any, sync::Arc}; +use crate::expressions::binary::kernels::concat_elements_utf8view; use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; use crate::PhysicalExpr; @@ -36,10 +37,14 @@ use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::binary::BinaryTypeCoercer; use datafusion_expr::interval_arithmetic::{apply_operator, Interval}; use datafusion_expr::sort_properties::ExprProperties; +use datafusion_expr::statistics::Distribution::{Bernoulli, Gaussian}; +use datafusion_expr::statistics::{ + combine_bernoullis, combine_gaussians, create_bernoulli_from_comparison, + new_generic_from_binary_op, Distribution, +}; use datafusion_expr::{ColumnarValue, Operator}; use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested}; -use crate::expressions::binary::kernels::concat_elements_utf8view; use kernels::{ bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar, bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn, @@ -486,6 +491,37 @@ impl PhysicalExpr for BinaryExpr { } } + fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { + let (left, right) = (children[0], children[1]); + + if self.op.is_numerical_operators() { + // We might be able to construct the output statistics more accurately, + // without falling back to an unknown distribution, if we are dealing + // with Gaussian distributions and numerical operations. + if let (Gaussian(left), Gaussian(right)) = (left, right) { + if let Some(result) = combine_gaussians(&self.op, left, right)? { + return Ok(Gaussian(result)); + } + } + } else if self.op.is_logic_operator() { + // If we are dealing with logical operators, we expect (and can only + // operate on) Bernoulli distributions. + return if let (Bernoulli(left), Bernoulli(right)) = (left, right) { + combine_bernoullis(&self.op, left, right).map(Bernoulli) + } else { + internal_err!( + "Logical operators are only compatible with `Bernoulli` distributions" + ) + }; + } else if self.op.supports_propagation() { + // If we are handling comparison operators, we expect (and can only + // operate on) numeric distributions. + return create_bernoulli_from_comparison(&self.op, left, right); + } + // Fall back to an unknown distribution with only summary statistics: + new_generic_from_binary_op(&self.op, left, right) + } + /// For each operator, [`BinaryExpr`] has distinct rules. /// TODO: There may be rules specific to some data types and expression ranges. fn get_properties(&self, children: &[ExprProperties]) -> Result { @@ -732,6 +768,7 @@ pub fn similar_to( mod tests { use super::*; use crate::expressions::{col, lit, try_cast, Column, Literal}; + use datafusion_common::plan_datafusion_err; /// Performs a binary operation, applying any type coercion necessary @@ -4379,4 +4416,260 @@ mod tests { ) .unwrap(); } + + pub fn binary_expr( + left: Arc, + op: Operator, + right: Arc, + schema: &Schema, + ) -> Result { + Ok(binary_op(left, op, right, schema)? 
+ .as_any() + .downcast_ref::() + .unwrap() + .clone()) + } + + /// Test for Uniform-Uniform, Unknown-Uniform, Uniform-Unknown and Unknown-Unknown evaluation. + #[test] + fn test_evaluate_statistics_combination_of_range_holders() -> Result<()> { + let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); + let a = Arc::new(Column::new("a", 0)) as _; + let b = lit(ScalarValue::from(12.0)); + + let left_interval = Interval::make(Some(0.0), Some(12.0))?; + let right_interval = Interval::make(Some(12.0), Some(36.0))?; + let (left_mean, right_mean) = (ScalarValue::from(6.0), ScalarValue::from(24.0)); + let (left_med, right_med) = (ScalarValue::from(6.0), ScalarValue::from(24.0)); + + for children in [ + vec![ + &Distribution::new_uniform(left_interval.clone())?, + &Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + &Distribution::new_generic( + left_mean.clone(), + left_med.clone(), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + &Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + &Distribution::new_uniform(right_interval.clone())?, + &Distribution::new_generic( + right_mean.clone(), + right_med.clone(), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + vec![ + &Distribution::new_generic( + left_mean.clone(), + left_med.clone(), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + &Distribution::new_generic( + right_mean.clone(), + right_med.clone(), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + ] { + let ops = vec![ + Operator::Plus, + Operator::Minus, + Operator::Multiply, + Operator::Divide, + ]; + + for op in ops { + let expr = binary_expr(Arc::clone(&a), op, Arc::clone(&b), schema)?; + assert_eq!( + expr.evaluate_statistics(&children)?, + new_generic_from_binary_op(&op, children[0], children[1])? + ); + } + } + Ok(()) + } + + #[test] + fn test_evaluate_statistics_bernoulli() -> Result<()> { + let schema = &Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + ]); + let a = Arc::new(Column::new("a", 0)) as _; + let b = Arc::new(Column::new("b", 1)) as _; + let eq = Arc::new(binary_expr( + Arc::clone(&a), + Operator::Eq, + Arc::clone(&b), + schema, + )?); + let neq = Arc::new(binary_expr(a, Operator::NotEq, b, schema)?); + + let left_stat = &Distribution::new_uniform(Interval::make(Some(0), Some(7))?)?; + let right_stat = &Distribution::new_uniform(Interval::make(Some(4), Some(11))?)?; + + // Intervals: [0, 7], [4, 11]. + // The intersection is [4, 7], so the probability of equality is 4 / 64 = 1 / 16. + assert_eq!( + eq.evaluate_statistics(&[left_stat, right_stat])?, + Distribution::new_bernoulli(ScalarValue::from(1.0 / 16.0))? + ); + + // The probability of being distinct is 1 - 1 / 16 = 15 / 16. + assert_eq!( + neq.evaluate_statistics(&[left_stat, right_stat])?, + Distribution::new_bernoulli(ScalarValue::from(15.0 / 16.0))? 
+ ); + + Ok(()) + } + + #[test] + fn test_propagate_statistics_combination_of_range_holders_arithmetic() -> Result<()> { + let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); + let a = Arc::new(Column::new("a", 0)) as _; + let b = lit(ScalarValue::from(12.0)); + + let left_interval = Interval::make(Some(0.0), Some(12.0))?; + let right_interval = Interval::make(Some(12.0), Some(36.0))?; + + let parent = Distribution::new_uniform(Interval::make(Some(-432.), Some(432.))?)?; + let children = vec![ + vec![ + Distribution::new_uniform(left_interval.clone())?, + Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + Distribution::new_generic( + ScalarValue::from(6.), + ScalarValue::from(6.), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + Distribution::new_uniform(left_interval.clone())?, + Distribution::new_generic( + ScalarValue::from(12.), + ScalarValue::from(12.), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + vec![ + Distribution::new_generic( + ScalarValue::from(6.), + ScalarValue::from(6.), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + Distribution::new_generic( + ScalarValue::from(12.), + ScalarValue::from(12.), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + ]; + + let ops = vec![ + Operator::Plus, + Operator::Minus, + Operator::Multiply, + Operator::Divide, + ]; + + for child_view in children { + let child_refs = child_view.iter().collect::>(); + for op in &ops { + let expr = binary_expr(Arc::clone(&a), *op, Arc::clone(&b), schema)?; + assert_eq!( + expr.propagate_statistics(&parent, child_refs.as_slice())?, + Some(child_view.clone()) + ); + } + } + Ok(()) + } + + #[test] + fn test_propagate_statistics_combination_of_range_holders_comparison() -> Result<()> { + let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); + let a = Arc::new(Column::new("a", 0)) as _; + let b = lit(ScalarValue::from(12.0)); + + let left_interval = Interval::make(Some(0.0), Some(12.0))?; + let right_interval = Interval::make(Some(6.0), Some(18.0))?; + + let one = ScalarValue::from(1.0); + let parent = Distribution::new_bernoulli(one)?; + let children = vec![ + vec![ + Distribution::new_uniform(left_interval.clone())?, + Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + Distribution::new_generic( + ScalarValue::from(6.), + ScalarValue::from(6.), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + Distribution::new_uniform(right_interval.clone())?, + ], + vec![ + Distribution::new_uniform(left_interval.clone())?, + Distribution::new_generic( + ScalarValue::from(12.), + ScalarValue::from(12.), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + vec![ + Distribution::new_generic( + ScalarValue::from(6.), + ScalarValue::from(6.), + ScalarValue::Float64(None), + left_interval.clone(), + )?, + Distribution::new_generic( + ScalarValue::from(12.), + ScalarValue::from(12.), + ScalarValue::Float64(None), + right_interval.clone(), + )?, + ], + ]; + + let ops = vec![ + Operator::Eq, + Operator::Gt, + Operator::GtEq, + Operator::Lt, + Operator::LtEq, + ]; + + for child_view in children { + let child_refs = child_view.iter().collect::>(); + for op in &ops { + let expr = binary_expr(Arc::clone(&a), *op, Arc::clone(&b), schema)?; + assert!(expr + .propagate_statistics(&parent, child_refs.as_slice())? 
+ .is_some()); + } + } + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index dc863ccff511..8795545274a2 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -28,9 +28,12 @@ use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use datafusion_common::{plan_err, Result}; +use datafusion_common::{internal_err, plan_err, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::ExprProperties; +use datafusion_expr::statistics::Distribution::{ + self, Bernoulli, Exponential, Gaussian, Generic, Uniform, +}; use datafusion_expr::{ type_coercion::{is_interval, is_null, is_signed_numeric, is_timestamp}, ColumnarValue, @@ -89,14 +92,13 @@ impl PhysicalExpr for NegativeExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let arg = self.arg.evaluate(batch)?; - match arg { + match self.arg.evaluate(batch)? { ColumnarValue::Array(array) => { let result = neg_wrapping(array.as_ref())?; Ok(ColumnarValue::Array(result)) } ColumnarValue::Scalar(scalar) => { - Ok(ColumnarValue::Scalar((scalar.arithmetic_negate())?)) + Ok(ColumnarValue::Scalar(scalar.arithmetic_negate()?)) } } } @@ -116,10 +118,7 @@ impl PhysicalExpr for NegativeExpr { /// It replaces the upper and lower bounds after multiplying them with -1. /// Ex: `(a, b]` => `[-b, -a)` fn evaluate_bounds(&self, children: &[&Interval]) -> Result { - Interval::try_new( - children[0].upper().arithmetic_negate()?, - children[0].lower().arithmetic_negate()?, - ) + children[0].arithmetic_negate() } /// Returns a new [`Interval`] of a NegativeExpr that has the existing `interval` given that @@ -129,17 +128,37 @@ impl PhysicalExpr for NegativeExpr { interval: &Interval, children: &[&Interval], ) -> Result>> { - let child_interval = children[0]; - let negated_interval = Interval::try_new( - interval.upper().arithmetic_negate()?, - interval.lower().arithmetic_negate()?, - )?; + let negated_interval = interval.arithmetic_negate()?; - Ok(child_interval + Ok(children[0] .intersect(negated_interval)? .map(|result| vec![result])) } + fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { + match children[0] { + Uniform(u) => Distribution::new_uniform(u.range().arithmetic_negate()?), + Exponential(e) => Distribution::new_exponential( + e.rate().clone(), + e.offset().arithmetic_negate()?, + !e.positive_tail(), + ), + Gaussian(g) => Distribution::new_gaussian( + g.mean().arithmetic_negate()?, + g.variance().clone(), + ), + Bernoulli(_) => { + internal_err!("NegativeExpr cannot operate on Boolean datatypes") + } + Generic(u) => Distribution::new_generic( + u.mean().arithmetic_negate()?, + u.median().arithmetic_negate()?, + u.variance().clone(), + u.range().arithmetic_negate()?, + ), + } + } + /// The ordering of a [`NegativeExpr`] is simply the reverse of its child. 
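+ /// For example, a child that is monotonically increasing over `[1, 5]`
+ /// yields a negated expression that is monotonically decreasing over `[-5, -1]`.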
fn get_properties(&self, children: &[ExprProperties]) -> Result { Ok(ExprProperties { @@ -181,7 +200,7 @@ mod tests { use arrow::datatypes::DataType::{Float32, Float64, Int16, Int32, Int64, Int8}; use arrow::datatypes::*; use datafusion_common::cast::as_primitive_array; - use datafusion_common::DataFusionError; + use datafusion_common::{DataFusionError, ScalarValue}; use paste::paste; @@ -233,6 +252,67 @@ mod tests { Ok(()) } + #[test] + fn test_evaluate_statistics() -> Result<()> { + let negative_expr = NegativeExpr::new(Arc::new(Column::new("a", 0))); + + // Uniform + assert_eq!( + negative_expr.evaluate_statistics(&[&Distribution::new_uniform( + Interval::make(Some(-2.), Some(3.))? + )?])?, + Distribution::new_uniform(Interval::make(Some(-3.), Some(2.))?)? + ); + + // Bernoulli + assert!(negative_expr + .evaluate_statistics(&[&Distribution::new_bernoulli(ScalarValue::from( + 0.75 + ))?]) + .is_err()); + + // Exponential + assert_eq!( + negative_expr.evaluate_statistics(&[&Distribution::new_exponential( + ScalarValue::from(1.), + ScalarValue::from(1.), + true + )?])?, + Distribution::new_exponential( + ScalarValue::from(1.), + ScalarValue::from(-1.), + false + )? + ); + + // Gaussian + assert_eq!( + negative_expr.evaluate_statistics(&[&Distribution::new_gaussian( + ScalarValue::from(15), + ScalarValue::from(225), + )?])?, + Distribution::new_gaussian(ScalarValue::from(-15), ScalarValue::from(225),)? + ); + + // Unknown + assert_eq!( + negative_expr.evaluate_statistics(&[&Distribution::new_generic( + ScalarValue::from(15), + ScalarValue::from(15), + ScalarValue::from(10), + Interval::make(Some(10), Some(20))? + )?])?, + Distribution::new_generic( + ScalarValue::from(-15), + ScalarValue::from(-15), + ScalarValue::from(10), + Interval::make(Some(-20), Some(-10))? + )? 
+ ); + + Ok(()) + } + #[test] fn test_propagate_constraints() -> Result<()> { let negative_expr = NegativeExpr::new(Arc::new(Column::new("a", 0))); @@ -249,6 +329,35 @@ mod tests { Ok(()) } + #[test] + fn test_propagate_statistics_range_holders() -> Result<()> { + let negative_expr = NegativeExpr::new(Arc::new(Column::new("a", 0))); + let original_child_interval = Interval::make(Some(-2), Some(3))?; + let after_propagation = Interval::make(Some(-2), Some(0))?; + + let parent = Distribution::new_uniform(Interval::make(Some(0), Some(4))?)?; + let children: Vec> = vec![ + vec![Distribution::new_uniform(original_child_interval.clone())?], + vec![Distribution::new_generic( + ScalarValue::from(0), + ScalarValue::from(0), + ScalarValue::Int32(None), + original_child_interval.clone(), + )?], + ]; + + for child_view in children { + let child_refs: Vec<_> = child_view.iter().collect(); + let actual = negative_expr.propagate_statistics(&parent, &child_refs)?; + let expected = Some(vec![Distribution::new_from_interval( + after_propagation.clone(), + )?]); + assert_eq!(actual, expected); + } + + Ok(()) + } + #[test] fn test_negation_valid_types() -> Result<()> { let negatable_types = [ diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr/src/expressions/not.rs index 440c4e9557bd..ddf7c739b692 100644 --- a/datafusion/physical-expr/src/expressions/not.rs +++ b/datafusion/physical-expr/src/expressions/not.rs @@ -23,10 +23,12 @@ use std::hash::Hash; use std::sync::Arc; use crate::PhysicalExpr; + use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; -use datafusion_common::{cast::as_boolean_array, Result, ScalarValue}; +use datafusion_common::{cast::as_boolean_array, internal_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; +use datafusion_expr::statistics::Distribution::{self, Bernoulli}; use datafusion_expr::ColumnarValue; /// Not expression @@ -82,8 +84,7 @@ impl PhysicalExpr for NotExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let evaluate_arg = self.arg.evaluate(batch)?; - match evaluate_arg { + match self.arg.evaluate(batch)? { ColumnarValue::Array(array) => { let array = as_boolean_array(&array)?; Ok(ColumnarValue::Array(Arc::new( @@ -95,9 +96,7 @@ impl PhysicalExpr for NotExpr { return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); } let bool_value: bool = scalar.try_into()?; - Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some( - !bool_value, - )))) + Ok(ColumnarValue::Scalar(ScalarValue::from(!bool_value))) } } } @@ -112,9 +111,70 @@ impl PhysicalExpr for NotExpr { ) -> Result> { Ok(Arc::new(NotExpr::new(Arc::clone(&children[0])))) } + fn evaluate_bounds(&self, children: &[&Interval]) -> Result { children[0].not() } + + fn propagate_constraints( + &self, + interval: &Interval, + children: &[&Interval], + ) -> Result>> { + let complemented_interval = interval.not()?; + + Ok(children[0] + .intersect(complemented_interval)? + .map(|result| vec![result])) + } + + fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { + match children[0] { + Bernoulli(b) => { + let p_value = b.p_value(); + if p_value.is_null() { + Ok(children[0].clone()) + } else { + let one = ScalarValue::new_one(&p_value.data_type())?; + Distribution::new_bernoulli(one.sub_checked(p_value)?) 
+ } + } + _ => internal_err!("NotExpr can only operate on Boolean datatypes"), + } + } + + fn propagate_statistics( + &self, + parent: &Distribution, + children: &[&Distribution], + ) -> Result>> { + match (parent, children[0]) { + (Bernoulli(parent), Bernoulli(child)) => { + let parent_range = parent.range(); + let result = if parent_range == Interval::CERTAINLY_TRUE { + if child.range() == Interval::CERTAINLY_TRUE { + None + } else { + Some(vec![Distribution::new_bernoulli(ScalarValue::new_zero( + &child.data_type(), + )?)?]) + } + } else if parent_range == Interval::CERTAINLY_FALSE { + if child.range() == Interval::CERTAINLY_FALSE { + None + } else { + Some(vec![Distribution::new_bernoulli(ScalarValue::new_one( + &child.data_type(), + )?)?]) + } + } else { + Some(vec![]) + }; + Ok(result) + } + _ => internal_err!("NotExpr can only operate on Boolean datatypes"), + } + } } /// Creates a unary expression NOT @@ -124,10 +184,12 @@ pub fn not(arg: Arc) -> Result> { #[cfg(test)] mod tests { + use std::sync::LazyLock; + use super::*; - use crate::expressions::col; + use crate::expressions::{col, Column}; + use arrow::{array::BooleanArray, datatypes::*}; - use std::sync::LazyLock; #[test] fn neg_op() -> Result<()> { @@ -182,10 +244,81 @@ mod tests { expected_interval: Interval, ) -> Result<()> { let not_expr = not(col("a", &schema())?)?; + assert_eq!(not_expr.evaluate_bounds(&[&interval])?, expected_interval); + Ok(()) + } + + #[test] + fn test_evaluate_statistics() -> Result<()> { + let _schema = &Schema::new(vec![Field::new("a", DataType::Boolean, false)]); + let a = Arc::new(Column::new("a", 0)) as _; + let expr = not(a)?; + + // Uniform with non-boolean bounds + assert!(expr + .evaluate_statistics(&[&Distribution::new_uniform( + Interval::make_unbounded(&DataType::Float64)? + )?]) + .is_err()); + + // Exponential + assert!(expr + .evaluate_statistics(&[&Distribution::new_exponential( + ScalarValue::from(1.0), + ScalarValue::from(1.0), + true + )?]) + .is_err()); + + // Gaussian + assert!(expr + .evaluate_statistics(&[&Distribution::new_gaussian( + ScalarValue::from(1.0), + ScalarValue::from(1.0), + )?]) + .is_err()); + + // Bernoulli assert_eq!( - not_expr.evaluate_bounds(&[&interval]).unwrap(), - expected_interval + expr.evaluate_statistics(&[&Distribution::new_bernoulli( + ScalarValue::from(0.0), + )?])?, + Distribution::new_bernoulli(ScalarValue::from(1.))? ); + + assert_eq!( + expr.evaluate_statistics(&[&Distribution::new_bernoulli( + ScalarValue::from(1.0), + )?])?, + Distribution::new_bernoulli(ScalarValue::from(0.))? + ); + + assert_eq!( + expr.evaluate_statistics(&[&Distribution::new_bernoulli( + ScalarValue::from(0.25), + )?])?, + Distribution::new_bernoulli(ScalarValue::from(0.75))? + ); + + assert!(expr + .evaluate_statistics(&[&Distribution::new_generic( + ScalarValue::Null, + ScalarValue::Null, + ScalarValue::Null, + Interval::make_unbounded(&DataType::UInt8)? + )?]) + .is_err()); + + // Unknown with non-boolean interval as range + assert!(expr + .evaluate_statistics(&[&Distribution::new_generic( + ScalarValue::Null, + ScalarValue::Null, + ScalarValue::Null, + Interval::make_unbounded(&DataType::Float64)? 
+ )?]) + .is_err()); + Ok(()) } diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index cb29109684fe..a53814c3ad2b 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -15,7 +15,130 @@ // specific language governing permissions and limitations // under the License. -//! Constraint propagator/solver for custom PhysicalExpr graphs. +//! Constraint propagator/solver for custom [`PhysicalExpr`] graphs. +//! +//! The constraint propagator/solver in DataFusion uses interval arithmetic to +//! perform mathematical operations on intervals, which represent a range of +//! possible values rather than a single point value. This allows for the +//! propagation of ranges through mathematical operations, and can be used to +//! compute bounds for a complicated expression. The key idea is that by +//! breaking down a complicated expression into simpler terms, and then +//! combining the bounds for those simpler terms, one can obtain bounds for the +//! overall expression. +//! +//! This way of using interval arithmetic to compute bounds for a complex +//! expression by combining the bounds for the constituent terms within the +//! original expression allows us to reason about the range of possible values +//! of the expression. This information later can be used in range pruning of +//! the provably unnecessary parts of `RecordBatch`es. +//! +//! # Example +//! +//! For example, consider a mathematical expression such as `x^2 + y = 4` \[1\]. +//! Since this expression would be a binary tree in [`PhysicalExpr`] notation, +//! this type of an hierarchical computation is well-suited for a graph based +//! implementation. In such an implementation, an equation system `f(x) = 0` is +//! represented by a directed acyclic expression graph (DAEG). +//! +//! In order to use interval arithmetic to compute bounds for this expression, +//! one would first determine intervals that represent the possible values of +//! `x` and `y`` Let's say that the interval for `x` is `[1, 2]` and the interval +//! for `y` is `[-3, 1]`. In the chart below, you can see how the computation +//! takes place. +//! +//! # References +//! +//! 1. Kabak, Mehmet Ozan. Analog Circuit Start-Up Behavior Analysis: An Interval +//! Arithmetic Based Approach, Chapter 4. Stanford University, 2015. +//! 2. Moore, Ramon E. Interval analysis. Vol. 4. Englewood Cliffs: Prentice-Hall, 1966. +//! 3. F. Messine, "Deterministic global optimization using interval constraint +//! propagation techniques," RAIRO-Operations Research, vol. 38, no. 04, +//! pp. 277-293, 2004. +//! +//! # Illustration +//! +//! ## Computing bounds for an expression using interval arithmetic +//! +//! ```text +//! +-----+ +-----+ +//! +----| + |----+ +----| + |----+ +//! | | | | | | | | +//! | +-----+ | | +-----+ | +//! | | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | 2 | | y | | 2 | [1, 4] | y | +//! |[.] | | | |[.] | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | | +//! | | +//! +---+ +---+ +//! | x | [1, 2] | x | [1, 2] +//! +---+ +---+ +//! +//! (a) Bottom-up evaluation: Step 1 (b) Bottom up evaluation: Step 2 +//! +//! [1 - 3, 4 + 1] = [-2, 5] +//! +-----+ +-----+ +//! +----| + |----+ +----| + |----+ +//! | | | | | | | | +//! | +-----+ | | +-----+ | +//! | | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | 2 |[1, 4] | y | | 2 |[1, 4] | y | +//! |[.] | | | |[.] | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | [-3, 1] | [-3, 1] +//! 
| | +//! +---+ +---+ +//! | x | [1, 2] | x | [1, 2] +//! +---+ +---+ +//! +//! (c) Bottom-up evaluation: Step 3 (d) Bottom-up evaluation: Step 4 +//! ``` +//! +//! ## Top-down constraint propagation using inverse semantics +//! +//! ```text +//! [-2, 5] ∩ [4, 4] = [4, 4] [4, 4] +//! +-----+ +-----+ +//! +----| + |----+ +----| + |----+ +//! | | | | | | | | +//! | +-----+ | | +-----+ | +//! | | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | 2 | [1, 4] | y | | 2 | [1, 4] | y | [0, 1]* +//! |[.] | | | |[.] | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | [-3, 1] | +//! | | +//! +---+ +---+ +//! | x | [1, 2] | x | [1, 2] +//! +---+ +---+ +//! +//! (a) Top-down propagation: Step 1 (b) Top-down propagation: Step 2 +//! +//! [1 - 3, 4 + 1] = [-2, 5] +//! +-----+ +-----+ +//! +----| + |----+ +----| + |----+ +//! | | | | | | | | +//! | +-----+ | | +-----+ | +//! | | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | 2 |[3, 4]** | y | | 2 |[3, 4] | y | +//! |[.] | | | |[.] | | | +//! +-----+ +-----+ +-----+ +-----+ +//! | [0, 1] | [-3, 1] +//! | | +//! +---+ +---+ +//! | x | [1, 2] | x | [sqrt(3), 2]*** +//! +---+ +---+ +//! +//! (c) Top-down propagation: Step 3 (d) Top-down propagation: Step 4 +//! +//! * [-3, 1] ∩ ([4, 4] - [1, 4]) = [0, 1] +//! ** [1, 4] ∩ ([4, 4] - [0, 1]) = [3, 4] +//! *** [1, 2] ∩ [sqrt(3), sqrt(4)] = [sqrt(3), 2] +//! ``` use std::collections::HashSet; use std::fmt::{Display, Formatter}; @@ -39,84 +162,6 @@ use petgraph::stable_graph::{DefaultIx, StableGraph}; use petgraph::visit::{Bfs, Dfs, DfsPostOrder, EdgeRef}; use petgraph::Outgoing; -// Interval arithmetic provides a way to perform mathematical operations on -// intervals, which represent a range of possible values rather than a single -// point value. This allows for the propagation of ranges through mathematical -// operations, and can be used to compute bounds for a complicated expression. -// The key idea is that by breaking down a complicated expression into simpler -// terms, and then combining the bounds for those simpler terms, one can -// obtain bounds for the overall expression. -// -// For example, consider a mathematical expression such as x^2 + y = 4. Since -// it would be a binary tree in [PhysicalExpr] notation, this type of an -// hierarchical computation is well-suited for a graph based implementation. -// In such an implementation, an equation system f(x) = 0 is represented by a -// directed acyclic expression graph (DAEG). -// -// In order to use interval arithmetic to compute bounds for this expression, -// one would first determine intervals that represent the possible values of x -// and y. Let's say that the interval for x is [1, 2] and the interval for y -// is [-3, 1]. In the chart below, you can see how the computation takes place. -// -// This way of using interval arithmetic to compute bounds for a complex -// expression by combining the bounds for the constituent terms within the -// original expression allows us to reason about the range of possible values -// of the expression. This information later can be used in range pruning of -// the provably unnecessary parts of `RecordBatch`es. -// -// References -// 1 - Kabak, Mehmet Ozan. Analog Circuit Start-Up Behavior Analysis: An Interval -// Arithmetic Based Approach, Chapter 4. Stanford University, 2015. -// 2 - Moore, Ramon E. Interval analysis. Vol. 4. Englewood Cliffs: Prentice-Hall, 1966. -// 3 - F. Messine, "Deterministic global optimization using interval constraint -// propagation techniques," RAIRO-Operations Research, vol. 38, no. 
04, -// pp. 277{293, 2004. -// -// ``` text -// Computing bounds for an expression using interval arithmetic. Constraint propagation through a top-down evaluation of the expression -// graph using inverse semantics. -// -// [-2, 5] ∩ [4, 4] = [4, 4] [4, 4] -// +-----+ +-----+ +-----+ +-----+ -// +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ -// | | | | | | | | | | | | | | | | -// | +-----+ | | +-----+ | | +-----+ | | +-----+ | -// | | | | | | | | -// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -// | 2 | | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | [0, 1]* -// |[.] | | | |[.] | | | |[.] | | | |[.] | | | -// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -// | | | [-3, 1] | -// | | | | -// +---+ +---+ +---+ +---+ -// | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [1, 2] -// +---+ +---+ +---+ +---+ -// -// (a) Bottom-up evaluation: Step1 (b) Bottom up evaluation: Step2 (a) Top-down propagation: Step1 (b) Top-down propagation: Step2 -// -// [1 - 3, 4 + 1] = [-2, 5] [1 - 3, 4 + 1] = [-2, 5] -// +-----+ +-----+ +-----+ +-----+ -// +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ -// | | | | | | | | | | | | | | | | -// | +-----+ | | +-----+ | | +-----+ | | +-----+ | -// | | | | | | | | -// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -// | 2 |[1, 4] | y | | 2 |[1, 4] | y | | 2 |[3, 4]** | y | | 2 |[1, 4] | y | -// |[.] | | | |[.] | | | |[.] | | | |[.] | | | -// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -// | [-3, 1] | [-3, 1] | [0, 1] | [-3, 1] -// | | | | -// +---+ +---+ +---+ +---+ -// | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [sqrt(3), 2]*** -// +---+ +---+ +---+ +---+ -// -// (c) Bottom-up evaluation: Step3 (d) Bottom-up evaluation: Step4 (c) Top-down propagation: Step3 (d) Top-down propagation: Step4 -// -// * [-3, 1] ∩ ([4, 4] - [1, 4]) = [0, 1] -// ** [1, 4] ∩ ([4, 4] - [0, 1]) = [3, 4] -// *** [1, 2] ∩ [sqrt(3), sqrt(4)] = [sqrt(3), 2] -// ``` - /// This object implements a directed acyclic expression graph (DAEG) that /// is used to compute ranges for expressions through interval arithmetic. #[derive(Clone, Debug)] @@ -125,18 +170,6 @@ pub struct ExprIntervalGraph { root: NodeIndex, } -impl ExprIntervalGraph { - /// Estimate size of bytes including `Self`. - pub fn size(&self) -> usize { - let node_memory_usage = self.graph.node_count() - * (size_of::() + size_of::()); - let edge_memory_usage = - self.graph.edge_count() * (size_of::() + size_of::() * 2); - - size_of_val(self) + node_memory_usage + edge_memory_usage - } -} - /// This object encapsulates all possible constraint propagation results. #[derive(PartialEq, Debug)] pub enum PropagationResult { @@ -153,6 +186,12 @@ pub struct ExprIntervalGraphNode { interval: Interval, } +impl PartialEq for ExprIntervalGraphNode { + fn eq(&self, other: &Self) -> bool { + self.expr.eq(&other.expr) + } +} + impl Display for ExprIntervalGraphNode { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.expr) @@ -160,7 +199,7 @@ impl Display for ExprIntervalGraphNode { } impl ExprIntervalGraphNode { - /// Constructs a new DAEG node with an [-∞, ∞] range. + /// Constructs a new DAEG node with an `[-∞, ∞]` range. pub fn new_unbounded(expr: Arc, dt: &DataType) -> Result { Interval::make_unbounded(dt) .map(|interval| ExprIntervalGraphNode { expr, interval }) @@ -178,7 +217,7 @@ impl ExprIntervalGraphNode { /// This function creates a DAEG node from DataFusion's [`ExprTreeNode`] /// object. 
Literals are created with definite, singleton intervals while - /// any other expression starts with an indefinite interval ([-∞, ∞]). + /// any other expression starts with an indefinite interval (`[-∞, ∞]`). pub fn make_node(node: &ExprTreeNode, schema: &Schema) -> Result { let expr = Arc::clone(&node.expr); if let Some(literal) = expr.as_any().downcast_ref::() { @@ -192,30 +231,24 @@ impl ExprIntervalGraphNode { } } -impl PartialEq for ExprIntervalGraphNode { - fn eq(&self, other: &Self) -> bool { - self.expr.eq(&other.expr) - } -} - /// This function refines intervals `left_child` and `right_child` by applying /// constraint propagation through `parent` via operation. The main idea is /// that we can shrink ranges of variables x and y using parent interval p. /// -/// Assuming that x,y and p has ranges [xL, xU], [yL, yU], and [pL, pU], we +/// Assuming that x,y and p has ranges `[xL, xU]`, `[yL, yU]`, and `[pL, pU]`, we /// apply the following operations: /// - For plus operation, specifically, we would first do -/// - [xL, xU] <- ([pL, pU] - [yL, yU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([pL, pU] - [xL, xU]) ∩ [yL, yU]. +/// - `[xL, xU]` <- (`[pL, pU]` - `[yL, yU]`) ∩ `[xL, xU]`, and then +/// - `[yL, yU]` <- (`[pL, pU]` - `[xL, xU]`) ∩ `[yL, yU]`. /// - For minus operation, specifically, we would first do -/// - [xL, xU] <- ([yL, yU] + [pL, pU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([xL, xU] - [pL, pU]) ∩ [yL, yU]. +/// - `[xL, xU]` <- (`[yL, yU]` + `[pL, pU]`) ∩ `[xL, xU]`, and then +/// - `[yL, yU]` <- (`[xL, xU]` - `[pL, pU]`) ∩ `[yL, yU]`. /// - For multiplication operation, specifically, we would first do -/// - [xL, xU] <- ([pL, pU] / [yL, yU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([pL, pU] / [xL, xU]) ∩ [yL, yU]. +/// - `[xL, xU]` <- (`[pL, pU]` / `[yL, yU]`) ∩ `[xL, xU]`, and then +/// - `[yL, yU]` <- (`[pL, pU]` / `[xL, xU]`) ∩ `[yL, yU]`. /// - For division operation, specifically, we would first do -/// - [xL, xU] <- ([yL, yU] * [pL, pU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([xL, xU] / [pL, pU]) ∩ [yL, yU]. +/// - `[xL, xU]` <- (`[yL, yU]` * `[pL, pU]`) ∩ `[xL, xU]`, and then +/// - `[yL, yU]` <- (`[xL, xU]` / `[pL, pU]`) ∩ `[yL, yU]`. pub fn propagate_arithmetic( op: &Operator, parent: &Interval, @@ -361,18 +394,30 @@ impl ExprIntervalGraph { self.graph.node_count() } + /// Estimate size of bytes including `Self`. + pub fn size(&self) -> usize { + let node_memory_usage = self.graph.node_count() + * (size_of::() + size_of::()); + let edge_memory_usage = + self.graph.edge_count() * (size_of::() + size_of::() * 2); + + size_of_val(self) + node_memory_usage + edge_memory_usage + } + // Sometimes, we do not want to calculate and/or propagate intervals all // way down to leaf expressions. For example, assume that we have a // `SymmetricHashJoin` which has a child with an output ordering like: // + // ```text // PhysicalSortExpr { // expr: BinaryExpr('a', +, 'b'), // sort_option: .. // } + // ``` // - // i.e. its output order comes from a clause like "ORDER BY a + b". In such - // a case, we must calculate the interval for the BinaryExpr('a', +, 'b') - // instead of the columns inside this BinaryExpr, because this interval + // i.e. its output order comes from a clause like `ORDER BY a + b`. In such + // a case, we must calculate the interval for the `BinaryExpr(a, +, b)` + // instead of the columns inside this `BinaryExpr`, because this interval // decides whether we prune or not. 
Therefore, children `PhysicalExpr`s of // this `BinaryExpr` may be pruned for performance. The figure below // explains this example visually. @@ -510,9 +555,6 @@ impl ExprIntervalGraph { /// Computes bounds for an expression using interval arithmetic via a /// bottom-up traversal. /// - /// # Arguments - /// * `leaf_bounds` - &[(usize, Interval)]. Provide NodeIndex, Interval tuples for leaf variables. - /// /// # Examples /// /// ``` @@ -570,7 +612,7 @@ impl ExprIntervalGraph { self.graph[node].expr.evaluate_bounds(&children_intervals)?; } } - Ok(&self.graph[self.root].interval) + Ok(self.graph[self.root].interval()) } /// Updates/shrinks bounds for leaf expressions using interval arithmetic @@ -579,8 +621,6 @@ impl ExprIntervalGraph { &mut self, given_range: Interval, ) -> Result { - let mut bfs = Bfs::new(&self.graph, self.root); - // Adjust the root node with the given range: if let Some(interval) = self.graph[self.root].interval.intersect(given_range)? { self.graph[self.root].interval = interval; @@ -588,6 +628,8 @@ impl ExprIntervalGraph { return Ok(PropagationResult::Infeasible); } + let mut bfs = Bfs::new(&self.graph, self.root); + while let Some(node) = bfs.next(&self.graph) { let neighbors = self.graph.neighbors_directed(node, Outgoing); let mut children = neighbors.collect::>(); diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index b68d10905cab..0a448fa6a2e9 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -40,6 +40,7 @@ pub mod udf { #[allow(deprecated)] pub use crate::scalar_function::create_physical_expr; } +pub mod statistics; pub mod utils; pub mod window; diff --git a/datafusion/physical-expr/src/statistics/mod.rs b/datafusion/physical-expr/src/statistics/mod.rs new file mode 100644 index 000000000000..02897e059457 --- /dev/null +++ b/datafusion/physical-expr/src/statistics/mod.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Statistics and constraint propagation library + +pub mod stats_solver; diff --git a/datafusion/physical-expr/src/statistics/stats_solver.rs b/datafusion/physical-expr/src/statistics/stats_solver.rs new file mode 100644 index 000000000000..ec58076caf3b --- /dev/null +++ b/datafusion/physical-expr/src/statistics/stats_solver.rs @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use crate::expressions::Literal; +use crate::intervals::cp_solver::PropagationResult; +use crate::physical_expr::PhysicalExpr; +use crate::utils::{build_dag, ExprTreeNode}; + +use arrow::datatypes::{DataType, Schema}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::statistics::Distribution; +use datafusion_expr_common::interval_arithmetic::Interval; + +use petgraph::adj::DefaultIx; +use petgraph::prelude::Bfs; +use petgraph::stable_graph::{NodeIndex, StableGraph}; +use petgraph::visit::DfsPostOrder; +use petgraph::Outgoing; + +/// This object implements a directed acyclic expression graph (DAEG) that +/// is used to compute statistics/distributions for expressions hierarchically. +#[derive(Clone, Debug)] +pub struct ExprStatisticsGraph { + graph: StableGraph, + root: NodeIndex, +} + +/// This is a node in the DAEG; it encapsulates a reference to the actual +/// [`PhysicalExpr`] as well as its statistics/distribution. +#[derive(Clone, Debug)] +pub struct ExprStatisticsGraphNode { + expr: Arc, + dist: Distribution, +} + +impl ExprStatisticsGraphNode { + /// Constructs a new DAEG node based on the given interval with a + /// `Uniform` distribution. + fn new_uniform(expr: Arc, interval: Interval) -> Result { + Distribution::new_uniform(interval) + .map(|dist| ExprStatisticsGraphNode { expr, dist }) + } + + /// Constructs a new DAEG node with a `Bernoulli` distribution having an + /// unknown success probability. + fn new_bernoulli(expr: Arc) -> Result { + Distribution::new_bernoulli(ScalarValue::Float64(None)) + .map(|dist| ExprStatisticsGraphNode { expr, dist }) + } + + /// Constructs a new DAEG node with a `Generic` distribution having no + /// definite summary statistics. + fn new_generic(expr: Arc, dt: &DataType) -> Result { + let interval = Interval::make_unbounded(dt)?; + let dist = Distribution::new_from_interval(interval)?; + Ok(ExprStatisticsGraphNode { expr, dist }) + } + + /// Get the [`Distribution`] object representing the statistics of the + /// expression. + pub fn distribution(&self) -> &Distribution { + &self.dist + } + + /// This function creates a DAEG node from DataFusion's [`ExprTreeNode`] + /// object. Literals are created with `Uniform` distributions with a + /// definite, singleton interval. Expressions with a `Boolean` data type + /// result in a`Bernoulli` distribution with an unknown success probability. + /// Any other expression starts with an `Unknown` distribution with an + /// indefinite range (i.e. `[-∞, ∞]`). 
+ pub fn make_node(node: &ExprTreeNode, schema: &Schema) -> Result { + let expr = Arc::clone(&node.expr); + if let Some(literal) = expr.as_any().downcast_ref::() { + let value = literal.value(); + Interval::try_new(value.clone(), value.clone()) + .and_then(|interval| Self::new_uniform(expr, interval)) + } else { + expr.data_type(schema).and_then(|dt| { + if dt.eq(&DataType::Boolean) { + Self::new_bernoulli(expr) + } else { + Self::new_generic(expr, &dt) + } + }) + } + } +} + +impl ExprStatisticsGraph { + pub fn try_new(expr: Arc, schema: &Schema) -> Result { + // Build the full graph: + let (root, graph) = build_dag(expr, &|node| { + ExprStatisticsGraphNode::make_node(node, schema) + })?; + Ok(Self { graph, root }) + } + + /// This function assigns given distributions to expressions in the DAEG. + /// The argument `assignments` associates indices of sought expressions + /// with their corresponding new distributions. + pub fn assign_statistics(&mut self, assignments: &[(usize, Distribution)]) { + for (index, stats) in assignments { + let node_index = NodeIndex::from(*index as DefaultIx); + self.graph[node_index].dist = stats.clone(); + } + } + + /// Computes statistics/distributions for an expression via a bottom-up + /// traversal. + pub fn evaluate_statistics(&mut self) -> Result<&Distribution> { + let mut dfs = DfsPostOrder::new(&self.graph, self.root); + while let Some(idx) = dfs.next(&self.graph) { + let neighbors = self.graph.neighbors_directed(idx, Outgoing); + let mut children_statistics = neighbors + .map(|child| self.graph[child].distribution()) + .collect::>(); + // Note that all distributions are assumed to be independent. + if !children_statistics.is_empty() { + // Reverse to align with `PhysicalExpr`'s children: + children_statistics.reverse(); + self.graph[idx].dist = self.graph[idx] + .expr + .evaluate_statistics(&children_statistics)?; + } + } + Ok(self.graph[self.root].distribution()) + } + + /// Runs a propagation mechanism in a top-down manner to update statistics + /// of leaf nodes. + pub fn propagate_statistics( + &mut self, + given_stats: Distribution, + ) -> Result { + // Adjust the root node with the given statistics: + let root_range = self.graph[self.root].dist.range()?; + let given_range = given_stats.range()?; + if let Some(interval) = root_range.intersect(&given_range)? { + if interval != root_range { + // If the given statistics enable us to obtain a more precise + // range for the root, update it: + let subset = root_range.contains(given_range)?; + self.graph[self.root].dist = if subset == Interval::CERTAINLY_TRUE { + // Given statistics is strictly more informative, use it as is: + given_stats + } else { + // Intersecting ranges gives us a more precise range: + Distribution::new_from_interval(interval)? + }; + } + } else { + return Ok(PropagationResult::Infeasible); + } + + let mut bfs = Bfs::new(&self.graph, self.root); + + while let Some(node) = bfs.next(&self.graph) { + let neighbors = self.graph.neighbors_directed(node, Outgoing); + let mut children = neighbors.collect::>(); + // If the current expression is a leaf, its statistics is now final. 
+ // So, just continue with the propagation procedure: + if children.is_empty() { + continue; + } + // Reverse to align with `PhysicalExpr`'s children: + children.reverse(); + let children_stats = children + .iter() + .map(|child| self.graph[*child].distribution()) + .collect::>(); + let node_statistics = self.graph[node].distribution(); + let propagated_statistics = self.graph[node] + .expr + .propagate_statistics(node_statistics, &children_stats)?; + if let Some(propagated_stats) = propagated_statistics { + for (child_idx, stats) in children.into_iter().zip(propagated_stats) { + self.graph[child_idx].dist = stats; + } + } else { + // The constraint is infeasible, report: + return Ok(PropagationResult::Infeasible); + } + } + Ok(PropagationResult::Success) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::expressions::{binary, try_cast, Column}; + use crate::intervals::cp_solver::PropagationResult; + use crate::statistics::stats_solver::ExprStatisticsGraph; + + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr_common::interval_arithmetic::Interval; + use datafusion_expr_common::operator::Operator; + use datafusion_expr_common::statistics::Distribution; + use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; + use datafusion_physical_expr_common::physical_expr::PhysicalExpr; + + pub fn binary_expr( + left: Arc, + op: Operator, + right: Arc, + schema: &Schema, + ) -> Result> { + let left_type = left.data_type(schema)?; + let right_type = right.data_type(schema)?; + let binary_type_coercer = BinaryTypeCoercer::new(&left_type, &op, &right_type); + let (lhs, rhs) = binary_type_coercer.get_input_types()?; + + let left_expr = try_cast(left, schema, lhs)?; + let right_expr = try_cast(right, schema, rhs)?; + binary(left_expr, op, right_expr, schema) + } + + #[test] + fn test_stats_integration() -> Result<()> { + let schema = &Schema::new(vec![ + Field::new("a", DataType::Float64, false), + Field::new("b", DataType::Float64, false), + Field::new("c", DataType::Float64, false), + Field::new("d", DataType::Float64, false), + ]); + + let a = Arc::new(Column::new("a", 0)) as _; + let b = Arc::new(Column::new("b", 1)) as _; + let c = Arc::new(Column::new("c", 2)) as _; + let d = Arc::new(Column::new("d", 3)) as _; + + let left = binary_expr(a, Operator::Plus, b, schema)?; + let right = binary_expr(c, Operator::Minus, d, schema)?; + let expr = binary_expr(left, Operator::Eq, right, schema)?; + + let mut graph = ExprStatisticsGraph::try_new(expr, schema)?; + // 2, 5 and 6 are BinaryExpr + graph.assign_statistics(&[ + ( + 0usize, + Distribution::new_uniform(Interval::make(Some(0.), Some(1.))?)?, + ), + ( + 1usize, + Distribution::new_uniform(Interval::make(Some(0.), Some(2.))?)?, + ), + ( + 3usize, + Distribution::new_uniform(Interval::make(Some(1.), Some(3.))?)?, + ), + ( + 4usize, + Distribution::new_uniform(Interval::make(Some(1.), Some(5.))?)?, + ), + ]); + let ev_stats = graph.evaluate_statistics()?; + assert_eq!( + ev_stats, + &Distribution::new_bernoulli(ScalarValue::Float64(None))? 
+ ); + + let one = ScalarValue::new_one(&DataType::Float64)?; + assert_eq!( + graph.propagate_statistics(Distribution::new_bernoulli(one)?)?, + PropagationResult::Success + ); + Ok(()) + } +} diff --git a/test-utils/src/array_gen/string.rs b/test-utils/src/array_gen/string.rs index e2a983612b8b..ac659ae67bc0 100644 --- a/test-utils/src/array_gen/string.rs +++ b/test-utils/src/array_gen/string.rs @@ -97,7 +97,7 @@ fn random_string(rng: &mut StdRng, max_len: usize) -> String { let len = rng.gen_range(1..=max_len); rng.sample_iter::(rand::distributions::Standard) .take(len) - .collect::() + .collect() } } } From cec457a240d56f89ab90057ad56ce918a90410e2 Mon Sep 17 00:00:00 2001 From: Peter L Date: Tue, 25 Feb 2025 01:58:18 +1030 Subject: [PATCH 59/71] Add support for `Dictionary` & `Interval` to AST datatype (#14783) --- datafusion/sql/src/unparser/expr.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 7c56969d47cd..d051cb78a8d5 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1585,9 +1585,7 @@ impl Unparser<'_> { DataType::Duration(_) => { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } - DataType::Interval(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } + DataType::Interval(_) => Ok(ast::DataType::Interval), DataType::Binary => { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } @@ -1624,9 +1622,7 @@ impl Unparser<'_> { DataType::Union(_, _) => { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } - DataType::Dictionary(_, _) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } + DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; @@ -2804,6 +2800,22 @@ mod tests { Ok(()) } + #[test] + fn test_dictionary_to_sql() -> Result<()> { + let dialect = CustomDialectBuilder::new().build(); + + let unparser = Unparser::new(&dialect); + + let ast_dtype = unparser.arrow_dtype_to_ast_dtype(&DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ))?; + + assert_eq!(ast_dtype, ast::DataType::Varchar(None)); + + Ok(()) + } + #[test] fn test_utf8_view_to_sql() -> Result<()> { let dialect = CustomDialectBuilder::new() From e799097cbd2079d658ddc6243817ac529e2f9807 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Mon, 24 Feb 2025 11:18:45 -0500 Subject: [PATCH 60/71] Improve benchmark docs (#14820) * Correct docs on subcommand help The command as written gives you `cargo` help, not `tpch` help as the text above says. And the output shown was for the benchmark bin, not the subcommand. Also correct some inconsistencies and punctuation. 
* Docs on how to add a new benchmark * Improve wording and punctuation in benchmarks README * Remove help text about /benchmark PR command that's disabled --- benchmarks/README.md | 104 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 21 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 332cac8459d7..2954f42c25db 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -85,7 +85,7 @@ git checkout main # Gather baseline data for tpch benchmark ./benchmarks/bench.sh run tpch -# Switch to the branch the branch name is mybranch and gather data +# Switch to the branch named mybranch and gather data git checkout mybranch ./benchmarks/bench.sh run tpch @@ -157,22 +157,19 @@ Benchmark tpch_mem.json └──────────────┴──────────────┴──────────────┴───────────────┘ ``` -Note that you can also execute an automatic comparison of the changes in a given PR against the base -just by including the trigger `/benchmark` in any comment. - ### Running Benchmarks Manually -Assuming data in the `data` directory, the `tpch` benchmark can be run with a command like this +Assuming data is in the `data` directory, the `tpch` benchmark can be run with a command like this: ```bash cargo run --release --bin dfbench -- tpch --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 ``` -See the help for more details +See the help for more details. ### Different features -You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`. For example +You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`. For example: ```shell cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 @@ -184,6 +181,7 @@ The benchmark program also supports CSV and Parquet input file formats and a uti ```bash cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet ``` + Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`. ### Comparing results between runs @@ -206,7 +204,7 @@ $ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path . ./compare.py /tmp/output_main/tpch-summary--1679330119.json /tmp/output_branch/tpch-summary--1679328405.json ``` -This will produce output like +This will produce output like: ``` ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ @@ -243,28 +241,92 @@ The `dfbench` program contains subcommands to run the various benchmarks. When benchmarking, it should always be built in release mode using `--release`. -Full help for each benchmark can be found in the relevant sub -command. For example to get help for tpch, run +Full help for each benchmark can be found in the relevant +subcommand. For example, to get help for tpch, run: ```shell -cargo run --release --bin dfbench --help +cargo run --release --bin dfbench -- tpch --help ... -datafusion-benchmarks 27.0.0 -benchmark command +dfbench-tpch 45.0.0 +Run the tpch benchmark. + +This benchmarks is derived from the [TPC-H][1] version +[2.17.1]. The data and answers are generated using `tpch-gen` from +[2]. 
+
+[1]: http://www.tpc.org/tpch/
+[2]: https://github.com/databricks/tpch-dbgen.git,
+[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
 
 USAGE:
-    dfbench <SUBCOMMAND>
+    dfbench tpch [FLAGS] [OPTIONS] --path <PATH>
+
+FLAGS:
+    -d, --debug
+            Activate debug mode to see more details
 
-SUBCOMMANDS:
-    clickbench      Run the clickbench benchmark
-    help            Prints this message or the help of the given subcommand(s)
-    parquet-filter  Test performance of parquet filter pushdown
-    sort            Test performance of parquet filter pushdown
-    tpch            Run the tpch benchmark.
-    tpch-convert    Convert tpch .slt files to .parquet or .csv files
+    -S, --disable-statistics
+            Whether to disable collection of statistics (and cost based optimizations) or not
 
+    -h, --help
+            Prints help information
+...
 ```
 
+# Writing a new benchmark
+
+## Creating or downloading data outside of the benchmark
+
+If you want to create or download the data with Rust as part of running the benchmark, see the next
+section on adding a benchmark subcommand and add code to create or download data as part of its
+`run` function.
+
+If you want to create or download the data with shell commands, in `benchmarks/bench.sh`, define a
+new function named `data_[your benchmark name]` and call that function in the `data` command case
+as a subcommand case named for your benchmark. Also call the new function in the `data all` case.
+
+## Adding the benchmark subcommand
+
+In `benchmarks/bench.sh`, define a new function named `run_[your benchmark name]` following the
+example of existing `run_*` functions. Call that function in the `run` command case as a subcommand
+case named for your benchmark. Also call the new function in the
+`run all` case. Add documentation for your benchmark to the text in the `usage` function.
+
+In `benchmarks/src/bin/dfbench.rs`, add a `dfbench` subcommand for your benchmark by:
+
+- Adding a new variant to the `Options` enum
+- Adding corresponding code to handle the new variant in the `main` function, similar to the other
+  variants
+- Adding a module to the `use datafusion_benchmarks::{}` statement
+
+In `benchmarks/src/lib.rs`, declare the new module you imported in `dfbench.rs` and create the
+corresponding file(s) for the module's code.
+
+In the module, following the pattern of other existing benchmarks, define a `RunOpt` struct with:
+
+- A doc comment that will become the `--help` output for the subcommand
+- A `run` method that the `dfbench` `main` function will call.
+- A `--path` structopt field that the `bench.sh` script should use with `${DATA_DIR}` to define
+  where the input data should be stored.
+- An `--output` structopt field that the `bench.sh` script should use with `"${RESULTS_FILE}"` to
+  define where the benchmark's results should be stored.
+
+### Creating or downloading data as part of the benchmark
+
+Use the `--path` structopt field defined on the `RunOpt` struct to know where to store or look for
+the data. Generate the data using whatever Rust code you'd like, before the code that will be
+measuring an operation.
+
+### Collecting data
+
+Your benchmark should create and use an instance of `BenchmarkRun` defined in `benchmarks/src/util/run.rs` as follows:
+
+- Call its `start_new_case` method with a string that will appear in the "Query" column of the
+  compare output.
+- Use `write_iter` to record elapsed times for the behavior you're benchmarking.
+- When all cases are done, call the `BenchmarkRun`'s `maybe_write_json` method, giving it the value + of the `--output` structopt field on `RunOpt`. + # Benchmarks The output of `dfbench` help includes a description of each benchmark, which is reproduced here for convenience From a235276ec493a78ea8e186e6966956fc9572b0f5 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Mon, 24 Feb 2025 22:58:35 +0100 Subject: [PATCH 61/71] Add `range` table function (#14830) * extract name * extract inclusive * range table function --- .../functions-table/src/generate_series.rs | 97 +++++++++++-- datafusion/functions-table/src/lib.rs | 3 +- .../test_files/table_functions.slt | 128 +++++++++++++++++- 3 files changed, 207 insertions(+), 21 deletions(-) diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs index 887daa71ec55..df8357ee1974 100644 --- a/datafusion/functions-table/src/generate_series.rs +++ b/datafusion/functions-table/src/generate_series.rs @@ -34,9 +34,19 @@ use std::sync::Arc; #[derive(Debug, Clone)] enum GenSeriesArgs { /// ContainsNull signifies that at least one argument(start, end, step) was null, thus no series will be generated. - ContainsNull, + ContainsNull { + include_end: bool, + name: &'static str, + }, /// AllNotNullArgs holds the start, end, and step values for generating the series when all arguments are not null. - AllNotNullArgs { start: i64, end: i64, step: i64 }, + AllNotNullArgs { + start: i64, + end: i64, + step: i64, + /// Indicates whether the end value should be included in the series. + include_end: bool, + name: &'static str, + }, } /// Table that generates a series of integers from `start`(inclusive) to `end`(inclusive), incrementing by step @@ -57,15 +67,26 @@ struct GenerateSeriesState { /// Tracks current position when generating table current: i64, + /// Indicates whether the end value should be included in the series. 
+ include_end: bool, + name: &'static str, } impl GenerateSeriesState { fn reach_end(&self, val: i64) -> bool { if self.step > 0 { - return val > self.end; + if self.include_end { + return val > self.end; + } else { + return val >= self.end; + } } - val < self.end + if self.include_end { + val < self.end + } else { + val <= self.end + } } } @@ -74,8 +95,8 @@ impl fmt::Display for GenerateSeriesState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "generate_series: start={}, end={}, batch_size={}", - self.start, self.end, self.batch_size + "{}: start={}, end={}, batch_size={}", + self.name, self.start, self.end, self.batch_size ) } } @@ -124,21 +145,31 @@ impl TableProvider for GenerateSeriesTable { let state = match self.args { // if args have null, then return 0 row - GenSeriesArgs::ContainsNull => GenerateSeriesState { + GenSeriesArgs::ContainsNull { include_end, name } => GenerateSeriesState { schema: self.schema.clone(), start: 0, end: 0, step: 1, current: 1, batch_size, + include_end, + name, }, - GenSeriesArgs::AllNotNullArgs { start, end, step } => GenerateSeriesState { + GenSeriesArgs::AllNotNullArgs { + start, + end, + step, + include_end, + name, + } => GenerateSeriesState { schema: self.schema.clone(), start, end, step, current: start, batch_size, + include_end, + name, }, }; @@ -150,12 +181,15 @@ impl TableProvider for GenerateSeriesTable { } #[derive(Debug)] -pub struct GenerateSeriesFunc {} +struct GenerateSeriesFuncImpl { + name: &'static str, + include_end: bool, +} -impl TableFunctionImpl for GenerateSeriesFunc { +impl TableFunctionImpl for GenerateSeriesFuncImpl { fn call(&self, exprs: &[Expr]) -> Result> { if exprs.is_empty() || exprs.len() > 3 { - return plan_err!("generate_series function requires 1 to 3 arguments"); + return plan_err!("{} function requires 1 to 3 arguments", self.name); } let mut normalize_args = Vec::new(); @@ -177,7 +211,10 @@ impl TableFunctionImpl for GenerateSeriesFunc { // contain null return Ok(Arc::new(GenerateSeriesTable { schema, - args: GenSeriesArgs::ContainsNull, + args: GenSeriesArgs::ContainsNull { + include_end: self.include_end, + name: self.name, + }, })); } @@ -186,7 +223,7 @@ impl TableFunctionImpl for GenerateSeriesFunc { [start, end] => (*start, *end, 1), [start, end, step] => (*start, *end, *step), _ => { - return plan_err!("generate_series function requires 1 to 3 arguments"); + return plan_err!("{} function requires 1 to 3 arguments", self.name); } }; @@ -204,7 +241,39 @@ impl TableFunctionImpl for GenerateSeriesFunc { Ok(Arc::new(GenerateSeriesTable { schema, - args: GenSeriesArgs::AllNotNullArgs { start, end, step }, + args: GenSeriesArgs::AllNotNullArgs { + start, + end, + step, + include_end: self.include_end, + name: self.name, + }, })) } } + +#[derive(Debug)] +pub struct GenerateSeriesFunc {} + +impl TableFunctionImpl for GenerateSeriesFunc { + fn call(&self, exprs: &[Expr]) -> Result> { + let impl_func = GenerateSeriesFuncImpl { + name: "generate_series", + include_end: true, + }; + impl_func.call(exprs) + } +} + +#[derive(Debug)] +pub struct RangeFunc {} + +impl TableFunctionImpl for RangeFunc { + fn call(&self, exprs: &[Expr]) -> Result> { + let impl_func = GenerateSeriesFuncImpl { + name: "range", + include_end: false, + }; + impl_func.call(exprs) + } +} diff --git a/datafusion/functions-table/src/lib.rs b/datafusion/functions-table/src/lib.rs index 4a31760e7c4d..311b9d310f39 100644 --- a/datafusion/functions-table/src/lib.rs +++ b/datafusion/functions-table/src/lib.rs @@ -28,7 +28,7 @@ use 
std::sync::Arc; /// Returns all default table functions pub fn all_default_table_functions() -> Vec> { - vec![generate_series()] + vec![generate_series(), range()] } /// Creates a singleton instance of a table function @@ -55,3 +55,4 @@ macro_rules! create_udtf_function { } create_udtf_function!(generate_series::GenerateSeriesFunc, "generate_series"); +create_udtf_function!(generate_series::RangeFunc, "range"); diff --git a/datafusion/sqllogictest/test_files/table_functions.slt b/datafusion/sqllogictest/test_files/table_functions.slt index 2769da03b8bb..7d318c50bacf 100644 --- a/datafusion/sqllogictest/test_files/table_functions.slt +++ b/datafusion/sqllogictest/test_files/table_functions.slt @@ -68,15 +68,15 @@ SELECT SUM(v1) FROM generate_series(1, 5) t1(v1) query I SELECT * FROM generate_series(6, -1, -2) ---- -6 -4 -2 -0 +6 +4 +2 +0 query I SELECT * FROM generate_series(6, 66, 666) ---- -6 +6 @@ -120,7 +120,7 @@ SELECT v1 + 10 FROM (SELECT * FROM generate_series(1, 3) t1(v1)) # Test generate_series with JOIN query II rowsort -SELECT a.v1, b.v1 +SELECT a.v1, b.v1 FROM generate_series(1, 3) a(v1) JOIN generate_series(2, 4) b(v1) ON a.v1 = b.v1 - 1 @@ -187,3 +187,119 @@ SELECT generate_series(1, t1.end) FROM generate_series(3, 5) as t1(end) [1, 2, 3, 4, 5] [1, 2, 3, 4] [1, 2, 3] + +# Test range table function +query I +SELECT * FROM range(6) +---- +0 +1 +2 +3 +4 +5 + + + +query I rowsort +SELECT * FROM range(1, 5) +---- +1 +2 +3 +4 + +query I rowsort +SELECT * FROM range(1, 1) +---- + +query I rowsort +SELECT * FROM range(3, 6) +---- +3 +4 +5 + +# #generated_data > batch_size +query I +SELECT count(v1) FROM range(-66666,66666) t1(v1) +---- +133332 + +query I rowsort +SELECT SUM(v1) FROM range(1, 5) t1(v1) +---- +10 + +query I +SELECT * FROM range(6, -1, -2) +---- +6 +4 +2 +0 + +query I +SELECT * FROM range(6, 66, 666) +---- +6 + + + +# +# Test range with null arguments +# + +query I +SELECT * FROM range(NULL, 5) +---- + +query I +SELECT * FROM range(1, NULL) +---- + +query I +SELECT * FROM range(NULL, NULL) +---- + +query I +SELECT * FROM range(1, 5, NULL) +---- + + +query TT +EXPLAIN SELECT * FROM range(1, 5) +---- +logical_plan TableScan: tmp_table projection=[value] +physical_plan LazyMemoryExec: partitions=1, batch_generators=[range: start=1, end=5, batch_size=8192] + +# +# Test range with invalid arguments +# + +query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series +SELECT * FROM range(5, 1) + +query error DataFusion error: Error during planning: start is smaller than end, but increment is negative: cannot generate infinite series +SELECT * FROM range(-6, 6, -1) + +query error DataFusion error: Error during planning: step cannot be zero +SELECT * FROM range(-6, 6, 0) + +query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series +SELECT * FROM range(6, -6, 1) + + +statement error DataFusion error: Error during planning: range function requires 1 to 3 arguments +SELECT * FROM range(1, 2, 3, 4) + + +statement error DataFusion error: Error during planning: First argument must be an integer literal +SELECT * FROM range('foo', 'bar') + +# UDF and UDTF `range` can be used simultaneously +query ? 
rowsort +SELECT range(1, t1.end) FROM range(3, 5) as t1(end) +---- +[1, 2, 3] +[1, 2] From aadb0b64056e51d10c43905c2dc16ab794128cfb Mon Sep 17 00:00:00 2001 From: Andy Yen <38731840+onlyjackfrost@users.noreply.github.com> Date: Tue, 25 Feb 2025 05:58:53 +0800 Subject: [PATCH 62/71] migrate invoke_batch to invoke_with_args for unicode function (#14856) --- datafusion/functions/src/unicode/character_length.rs | 7 +++---- datafusion/functions/src/unicode/initcap.rs | 6 +++--- datafusion/functions/src/unicode/left.rs | 6 +++--- datafusion/functions/src/unicode/lpad.rs | 6 +++--- datafusion/functions/src/unicode/reverse.rs | 6 +++--- datafusion/functions/src/unicode/right.rs | 6 +++--- datafusion/functions/src/unicode/rpad.rs | 6 +++--- datafusion/functions/src/unicode/strpos.rs | 7 +++---- datafusion/functions/src/unicode/substr.rs | 7 +++---- datafusion/functions/src/unicode/substrindex.rs | 7 +++---- datafusion/functions/src/unicode/translate.rs | 7 +++---- 11 files changed, 33 insertions(+), 38 deletions(-) diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index 6ef0871f368b..c2db253dc741 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -88,12 +88,11 @@ impl ScalarUDFImpl for CharacterLengthFunc { utf8_to_int_type(&arg_types[0], "character_length") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(character_length, vec![])(args) + make_scalar_function(character_length, vec![])(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions/src/unicode/initcap.rs b/datafusion/functions/src/unicode/initcap.rs index 03314bb24925..a8a4dd0fa249 100644 --- a/datafusion/functions/src/unicode/initcap.rs +++ b/datafusion/functions/src/unicode/initcap.rs @@ -87,11 +87,11 @@ impl ScalarUDFImpl for InitcapFunc { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { DataType::Utf8 => make_scalar_function(initcap::, vec![])(args), DataType::LargeUtf8 => make_scalar_function(initcap::, vec![])(args), diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index c8fbee4d90d8..f99f0de67ebb 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -97,11 +97,11 @@ impl ScalarUDFImpl for LeftFunc { utf8_to_str_type(&arg_types[0], "left") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { DataType::Utf8 | DataType::Utf8View => { make_scalar_function(left::, vec![])(args) diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 216037e90659..ea57dbd2bed5 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -109,11 +109,11 @@ impl ScalarUDFImpl for LPadFunc { utf8_to_str_type(&arg_types[0], "lpad") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { Utf8 | Utf8View 
=> make_scalar_function(lpad::, vec![])(args), LargeUtf8 => make_scalar_function(lpad::, vec![])(args), diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index c941fe32c13b..311e9e81a8be 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -85,11 +85,11 @@ impl ScalarUDFImpl for ReverseFunc { utf8_to_str_type(&arg_types[0], "reverse") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { Utf8 | Utf8View => make_scalar_function(reverse::, vec![])(args), LargeUtf8 => make_scalar_function(reverse::, vec![])(args), diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 3561176f1dd7..1ceaf6998331 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -97,11 +97,11 @@ impl ScalarUDFImpl for RightFunc { utf8_to_str_type(&arg_types[0], "right") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match args[0].data_type() { DataType::Utf8 | DataType::Utf8View => { make_scalar_function(right::, vec![])(args) diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 72bb112e6f70..c68c4d329c74 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -108,11 +108,11 @@ impl ScalarUDFImpl for RPadFunc { utf8_to_str_type(&arg_types[0], "rpad") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let args = &args.args; match ( args.len(), args[0].data_type(), diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs index 19b82ccc23c2..abc2780e002f 100644 --- a/datafusion/functions/src/unicode/strpos.rs +++ b/datafusion/functions/src/unicode/strpos.rs @@ -83,12 +83,11 @@ impl ScalarUDFImpl for StrposFunc { utf8_to_int_type(&arg_types[0], "strpos/instr/position") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(strpos, vec![])(args) + make_scalar_function(strpos, vec![])(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 20d5f6e3abeb..4dcbea4807f4 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -95,12 +95,11 @@ impl ScalarUDFImpl for SubstrFunc { Ok(DataType::Utf8View) } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(substr, vec![])(args) + make_scalar_function(substr, vec![])(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index 20ad33b3cfe3..9a18b5d23c5e 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -108,12 +108,11 @@ impl ScalarUDFImpl for SubstrIndexFunc { 
utf8_to_str_type(&arg_types[0], "substr_index") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(substr_index, vec![])(args) + make_scalar_function(substr_index, vec![])(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 47766ded3add..8b4894643a7a 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -95,12 +95,11 @@ impl ScalarUDFImpl for TranslateFunc { utf8_to_str_type(&arg_types[0], "translate") } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(invoke_translate, vec![])(args) + make_scalar_function(invoke_translate, vec![])(&args.args) } fn documentation(&self) -> Option<&Documentation> { From d0ab003dbf39bff8eeb568be294b7c4a63676589 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:21:36 +0200 Subject: [PATCH 63/71] test: change test_function macro to use `return_type_from_args` instead of `return_type` (#14852) --- datafusion/functions/src/utils.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 39d8aeeda460..47f3121ba2ce 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -141,12 +141,28 @@ pub mod test { ColumnarValue::Array(a) => Some(a.len()), }) .unwrap_or(1); - let return_type = func.return_type(&type_array); + + let scalar_arguments = $ARGS.iter().map(|arg| match arg { + ColumnarValue::Scalar(scalar) => Some(scalar.clone()), + ColumnarValue::Array(_) => None, + }).collect::>(); + let scalar_arguments_refs = scalar_arguments.iter().map(|arg| arg.as_ref()).collect::>(); + + let nullables = $ARGS.iter().map(|arg| match arg { + ColumnarValue::Scalar(scalar) => scalar.is_null(), + ColumnarValue::Array(a) => a.null_count() > 0, + }).collect::>(); + + let return_info = func.return_type_from_args(datafusion_expr::ReturnTypeArgs { + arg_types: &type_array, + scalar_arguments: &scalar_arguments_refs, + nullables: &nullables + }); match expected { Ok(expected) => { - assert_eq!(return_type.is_ok(), true); - let return_type = return_type.unwrap(); + assert_eq!(return_info.is_ok(), true); + let (return_type, _nullable) = return_info.unwrap().into_parts(); assert_eq!(return_type, $EXPECTED_DATA_TYPE); let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, number_rows: cardinality, return_type: &return_type}); @@ -163,15 +179,17 @@ pub mod test { }; } Err(expected_error) => { - if return_type.is_err() { - match return_type { + if return_info.is_err() { + match return_info { Ok(_) => assert!(false, "expected error"), Err(error) => { datafusion_common::assert_contains!(expected_error.strip_backtrace(), error.strip_backtrace()); } } } else { + let (return_type, _nullable) = return_info.unwrap().into_parts(); + // invoke is expected error - cannot use .expect_err() due to Debug not being implemented - match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, number_rows: cardinality, return_type: &return_type.unwrap()}) { + match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, 
number_rows: cardinality, return_type: &return_type}) { Ok(_) => assert!(false, "expected error"), Err(error) => { assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace())); From 9285b84ce90180eab96241939ef9e97632f35566 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 25 Feb 2025 13:45:22 +0000 Subject: [PATCH 64/71] Move `FileSourceConfig` and `FileStream` to the new `datafusion-datasource` (#14838) * Initial work * Fix some CI issues * remove cyclical dev-dependency on core * Trying to keep some key things accessible in the same way * ignore rustdoc test for example * Restore doc test with mock parquet source --------- Co-authored-by: Andrew Lamb --- .../examples/csv_json_opener.rs | 2 +- .../examples/custom_file_format.rs | 5 +- datafusion/common/src/test_util.rs | 2 +- datafusion/core/Cargo.toml | 2 +- .../core/src/datasource/file_format/arrow.rs | 8 +- .../core/src/datasource/file_format/avro.rs | 5 +- .../core/src/datasource/file_format/csv.rs | 8 +- .../core/src/datasource/file_format/json.rs | 7 +- .../core/src/datasource/file_format/mod.rs | 5 +- .../src/datasource/file_format/parquet.rs | 8 +- .../core/src/datasource/listing/table.rs | 3 +- datafusion/core/src/datasource/mod.rs | 1 - .../datasource/physical_plan/arrow_file.rs | 5 +- .../core/src/datasource/physical_plan/avro.rs | 6 +- .../core/src/datasource/physical_plan/csv.rs | 6 +- .../physical_plan/file_scan_config.rs | 1244 -------------- .../datasource/physical_plan/file_stream.rs | 799 --------- .../core/src/datasource/physical_plan/json.rs | 6 +- .../core/src/datasource/physical_plan/mod.rs | 462 +----- .../datasource/physical_plan/parquet/mod.rs | 5 +- .../physical_plan/parquet/source.rs | 7 +- .../core/src/datasource/schema_adapter.rs | 3 +- datafusion/core/src/test/mod.rs | 5 +- datafusion/core/src/test_util/parquet.rs | 3 +- .../physical_optimizer/enforce_sorting.rs | 3 +- .../tests/physical_optimizer/test_utils.rs | 3 +- datafusion/datasource/Cargo.toml | 2 + datafusion/datasource/src/display.rs | 295 ++++ .../data_source.rs => datasource/src/file.rs} | 8 +- datafusion/datasource/src/file_scan_config.rs | 1447 ++++++++++++++++- datafusion/datasource/src/file_stream.rs | 773 ++++++++- datafusion/datasource/src/memory.rs | 40 +- datafusion/datasource/src/mod.rs | 45 +- .../src}/statistics.rs | 14 +- datafusion/datasource/src/test_util.rs | 84 + .../proto/src/physical_plan/from_proto.rs | 3 +- 36 files changed, 2727 insertions(+), 2597 deletions(-) delete mode 100644 datafusion/core/src/datasource/physical_plan/file_scan_config.rs delete mode 100644 datafusion/core/src/datasource/physical_plan/file_stream.rs create mode 100644 datafusion/datasource/src/display.rs rename datafusion/{core/src/datasource/data_source.rs => datasource/src/file.rs} (95%) rename datafusion/{core/src/datasource/physical_plan => datasource/src}/statistics.rs (97%) create mode 100644 datafusion/datasource/src/test_util.rs diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index ef4ff9d51e7f..574137afe5c9 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -21,8 +21,8 @@ use arrow::datatypes::{DataType, Field, Schema}; use datafusion::datasource::physical_plan::JsonSource; use datafusion::{ assert_batches_eq, + datasource::physical_plan::FileSource, datasource::{ - data_source::FileSource, file_format::file_compression_type::FileCompressionType, listing::PartitionedFile, 
object_store::ObjectStoreUrl, diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index c44210e55318..165d82627061 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -21,14 +21,15 @@ use arrow::{ array::{AsArray, RecordBatch, StringArray, UInt8Array}, datatypes::{DataType, Field, Schema, SchemaRef, UInt64Type}, }; -use datafusion::datasource::data_source::FileSource; -use datafusion::execution::session_state::SessionStateBuilder; use datafusion::physical_expr::LexRequirement; use datafusion::physical_expr::PhysicalExpr; use datafusion::{ catalog::Session, common::{GetExt, Statistics}, }; +use datafusion::{ + datasource::physical_plan::FileSource, execution::session_state::SessionStateBuilder, +}; use datafusion::{ datasource::{ file_format::{ diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index a1f883f20525..298f54389cf8 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -28,7 +28,7 @@ use std::{error::Error, path::PathBuf}; /// /// Expects to be called about like this: /// -/// `assert_batch_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +/// `assert_batches_eq!(expected_lines: &[&str], batches: &[RecordBatch])` /// /// # Example /// ``` diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 1e0f27ccdfc8..69048f6a7cf4 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -40,7 +40,7 @@ nested_expressions = ["datafusion-functions-nested"] # This feature is deprecated. Use the `nested_expressions` feature instead. array_expressions = ["nested_expressions"] # Used to enable the avro format -avro = ["apache-avro", "num-traits", "datafusion-common/avro"] +avro = ["apache-avro", "num-traits", "datafusion-common/avro", "datafusion-datasource/avro"] backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "datafusion-datasource/compression"] crypto_expressions = ["datafusion-functions/crypto_expressions"] diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 09121eba6702..3614b788af90 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -31,9 +31,7 @@ use super::write::{create_writer, SharedBuffer}; use super::FileFormatFactory; use crate::datasource::file_format::write::get_writer_schema; use crate::datasource::file_format::FileFormat; -use crate::datasource::physical_plan::{ - ArrowSource, FileGroupDisplay, FileScanConfig, FileSink, FileSinkConfig, -}; +use crate::datasource::physical_plan::{ArrowSource, FileSink, FileSinkConfig}; use crate::error::Result; use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; @@ -49,13 +47,15 @@ use datafusion_common::{ not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; +use datafusion_datasource::display::FileGroupDisplay; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexRequirement; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; -use 
crate::datasource::data_source::FileSource; use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs index c0c8f25722c2..e7314e839bf2 100644 --- a/datafusion/core/src/datasource/file_format/avro.rs +++ b/datafusion/core/src/datasource/file_format/avro.rs @@ -26,12 +26,11 @@ use super::file_compression_type::FileCompressionType; use super::FileFormat; use super::FileFormatFactory; use crate::datasource::avro_to_arrow::read_avro_schema_from_reader; -use crate::datasource::physical_plan::{AvroSource, FileScanConfig}; +use crate::datasource::physical_plan::AvroSource; use crate::error::Result; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::Statistics; -use crate::datasource::data_source::FileSource; use arrow::datatypes::Schema; use arrow::datatypes::SchemaRef; use async_trait::async_trait; @@ -40,6 +39,8 @@ use datafusion_common::internal_err; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::GetExt; use datafusion_common::DEFAULT_AVRO_EXTENSION; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_physical_expr::PhysicalExpr; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 4991a96dc3d3..45ad3e8c1c30 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -30,9 +30,7 @@ use super::{ use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::write::demux::DemuxedStreamReceiver; use crate::datasource::file_format::write::BatchSerializer; -use crate::datasource::physical_plan::{ - CsvSource, FileGroupDisplay, FileScanConfig, FileSink, FileSinkConfig, -}; +use crate::datasource::physical_plan::{CsvSource, FileSink, FileSinkConfig}; use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::insert::{DataSink, DataSinkExec}; @@ -51,12 +49,14 @@ use datafusion_common::{ exec_err, not_impl_err, DataFusionError, GetExt, DEFAULT_CSV_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; +use datafusion_datasource::display::FileGroupDisplay; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexRequirement; -use crate::datasource::data_source::FileSource; use async_trait::async_trait; use bytes::{Buf, Bytes}; use futures::stream::BoxStream; diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 94e74b144499..7a5aaf7c64e8 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -32,9 +32,7 @@ use super::{ use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::write::demux::DemuxedStreamReceiver; use crate::datasource::file_format::write::BatchSerializer; -use crate::datasource::physical_plan::{ - FileGroupDisplay, FileSink, FileSinkConfig, JsonSource, -}; +use crate::datasource::physical_plan::{FileSink, FileSinkConfig, JsonSource}; use 
crate::error::Result; use crate::execution::SessionState; use crate::physical_plan::insert::{DataSink, DataSinkExec}; @@ -52,12 +50,13 @@ use datafusion_common::config::{ConfigField, ConfigFileType, JsonOptions}; use datafusion_common::file_options::json_writer::JsonWriterOptions; use datafusion_common::{not_impl_err, GetExt, DEFAULT_JSON_EXTENSION}; use datafusion_common_runtime::SpawnedTask; +use datafusion_datasource::display::FileGroupDisplay; +use datafusion_datasource::file::FileSource; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; -use crate::datasource::data_source::FileSource; use async_trait::async_trait; use bytes::{Buf, Bytes}; use datafusion_physical_expr_common::sort_expr::LexRequirement; diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 657fe6ca5511..2b46748d0a52 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -28,7 +28,9 @@ pub mod json; pub mod options; #[cfg(feature = "parquet")] pub mod parquet; +use datafusion_datasource::file::FileSource; pub use datafusion_datasource::file_compression_type; +use datafusion_datasource::file_scan_config::FileScanConfig; pub use datafusion_datasource::write; use std::any::Any; @@ -40,7 +42,7 @@ use std::task::Poll; use crate::arrow::array::RecordBatch; use crate::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; use crate::arrow::error::ArrowError; -use crate::datasource::physical_plan::{FileScanConfig, FileSinkConfig}; +use crate::datasource::physical_plan::FileSinkConfig; use crate::error::Result; use crate::physical_plan::{ExecutionPlan, Statistics}; @@ -50,7 +52,6 @@ use datafusion_common::{internal_err, not_impl_err, GetExt}; use datafusion_expr::Expr; use datafusion_physical_expr::PhysicalExpr; -use crate::datasource::data_source::FileSource; use async_trait::async_trait; use bytes::{Buf, Bytes}; use datafusion_physical_expr_common::sort_expr::LexRequirement; diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 7dbc510eca09..e9ecff7baff5 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -28,7 +28,7 @@ use super::write::{create_writer, SharedBuffer}; use super::{ coerce_file_schema_to_string_type, coerce_file_schema_to_view_type, transform_binary_to_string, transform_schema_to_view, FileFormat, FileFormatFactory, - FilePushdownSupport, FileScanConfig, + FilePushdownSupport, }; use crate::arrow::array::RecordBatch; use crate::arrow::datatypes::{Fields, Schema, SchemaRef}; @@ -36,7 +36,7 @@ use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::write::get_writer_schema; use crate::datasource::physical_plan::parquet::can_expr_be_pushed_down_with_schemas; use crate::datasource::physical_plan::parquet::source::ParquetSource; -use crate::datasource::physical_plan::{FileGroupDisplay, FileSink, FileSinkConfig}; +use crate::datasource::physical_plan::{FileSink, FileSinkConfig}; use crate::datasource::statistics::{create_max_min_accs, get_col_stats}; use crate::error::Result; use crate::execution::SessionState; @@ -57,6 +57,9 @@ use datafusion_common::{ DEFAULT_PARQUET_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; +use 
datafusion_datasource::display::FileGroupDisplay; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; @@ -65,7 +68,6 @@ use datafusion_functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexRequirement; -use crate::datasource::data_source::FileSource; use async_trait::async_trait; use bytes::Bytes; use futures::future::BoxFuture; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 819da155a1a2..adef02c38d73 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -29,11 +29,12 @@ use crate::datasource::{ file_compression_type::FileCompressionType, FileFormat, FilePushdownSupport, }, get_statistics_with_limit, - physical_plan::{FileScanConfig, FileSinkConfig}, + physical_plan::FileSinkConfig, }; use crate::execution::context::SessionState; use datafusion_catalog::TableProvider; use datafusion_common::{config_err, DataFusionError, Result}; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_expr::dml::InsertOp; use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 96687913fb42..2b7bb14b6f6c 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -21,7 +21,6 @@ pub mod avro_to_arrow; pub mod cte_worktable; -pub mod data_source; pub mod default_table_source; pub mod dynamic_file; pub mod empty; diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index e5523063c782..1cae5c5084b1 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -20,10 +20,9 @@ use std::any::Any; use std::sync::Arc; -use crate::datasource::data_source::FileSource; use crate::datasource::listing::PartitionedFile; use crate::datasource::physical_plan::{ - FileMeta, FileOpenFuture, FileOpener, FileScanConfig, JsonSource, + FileMeta, FileOpenFuture, FileOpener, JsonSource, }; use crate::error::Result; @@ -32,6 +31,8 @@ use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 1674814d76a7..08c22183302b 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -21,14 +21,16 @@ use std::any::Any; use std::fmt::Formatter; use std::sync::Arc; -use super::{FileOpener, FileScanConfig}; +use super::FileOpener; #[cfg(feature = "avro")] use crate::datasource::avro_to_arrow::Reader as AvroReader; -use 
crate::datasource::data_source::FileSource; + use crate::error::Result; use arrow::datatypes::SchemaRef; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 629d452064f5..1552060d067d 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -23,8 +23,8 @@ use std::io::{Read, Seek, SeekFrom}; use std::sync::Arc; use std::task::Poll; -use super::{calculate_range, FileScanConfig, RangeCalculation}; -use crate::datasource::data_source::FileSource; +use super::{calculate_range, RangeCalculation}; + use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::{deserialize_stream, DecoderDeserializer}; use crate::datasource::listing::{FileRange, ListingTableUrl, PartitionedFile}; @@ -37,6 +37,8 @@ use arrow::csv; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs deleted file mode 100644 index 6b74f6be79eb..000000000000 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ /dev/null @@ -1,1244 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! [`FileScanConfig`] to configure scanning of possibly partitioned -//! file sources. 
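// Editor's note: the hunk below removes the core-crate copy of `FileScanConfig`,
// which this patch moves into `datafusion_datasource::file_scan_config` (see the
// `+use datafusion_datasource::file_scan_config::FileScanConfig;` lines earlier in
// this diff). As a rough sketch of what a caller looks like after the move, the
// snippet below adapts the doc example deleted further down, changing only the
// `FileScanConfig` import path; the schema and file name are illustrative and are
// not part of the patch.
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::ArrowSource;
use datafusion_datasource::file_scan_config::FileScanConfig;
use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_physical_plan::ExecutionPlan;

fn example_scan_config() {
    // Single-column file schema, purely for illustration.
    let file_schema = Arc::new(Schema::new(vec![Field::new(
        "c1",
        DataType::Int32,
        false,
    )]));
    // Build the scan configuration against the relocated type.
    let config = FileScanConfig::new(
        ObjectStoreUrl::local_filesystem(),
        file_schema,
        Arc::new(ArrowSource::default()),
    )
    .with_limit(Some(1000))
    .with_file(PartitionedFile::new("file1.parquet", 1234));
    // `build` still yields a `DataSourceExec` usable as an `ExecutionPlan`.
    let _plan: Arc<dyn ExecutionPlan> = config.build();
}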
- -use super::{ - get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupsDisplay, - FileStream, -}; -use crate::datasource::file_format::file_compression_type::FileCompressionType; -use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; -use crate::{error::Result, scalar::ScalarValue}; -use std::any::Any; -use std::fmt::Formatter; -use std::{fmt, sync::Arc}; - -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion_common::stats::Precision; -use datafusion_common::{ColumnStatistics, Constraints, Statistics}; -use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, Partitioning}; - -use crate::datasource::data_source::FileSource; -pub use datafusion_datasource::file_scan_config::*; -use datafusion_datasource::source::{DataSource, DataSourceExec}; -use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_physical_plan::display::{display_orderings, ProjectSchemaDisplay}; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; -use datafusion_physical_plan::projection::{ - all_alias_free_columns, new_projections_for_columns, ProjectionExec, -}; -use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; - -/// Convert type to a type suitable for use as a [`ListingTable`] -/// partition column. Returns `Dictionary(UInt16, val_type)`, which is -/// a reasonable trade off between a reasonable number of partition -/// values and space efficiency. -/// -/// This use this to specify types for partition columns. However -/// you MAY also choose not to dictionary-encode the data or to use a -/// different dictionary type. -/// -/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same say. -/// -/// [`ListingTable`]: crate::datasource::listing::ListingTable -pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType { - DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type)) -} - -/// Convert a [`ScalarValue`] of partition columns to a type, as -/// described in the documentation of [`wrap_partition_type_in_dict`], -/// which can wrap the types. -pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue { - ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val)) -} - -/// The base configurations for a [`DataSourceExec`], the a physical plan for -/// any given file format. -/// -/// Use [`Self::build`] to create a [`DataSourceExec`] from a ``FileScanConfig`. 
-/// -/// # Example -/// ``` -/// # use std::sync::Arc; -/// # use arrow::datatypes::{Field, Fields, DataType, Schema}; -/// # use datafusion::datasource::listing::PartitionedFile; -/// # use datafusion::datasource::physical_plan::FileScanConfig; -/// # use datafusion_execution::object_store::ObjectStoreUrl; -/// # use datafusion::datasource::physical_plan::ArrowSource; -/// # use datafusion_physical_plan::ExecutionPlan; -/// # let file_schema = Arc::new(Schema::new(vec![ -/// # Field::new("c1", DataType::Int32, false), -/// # Field::new("c2", DataType::Int32, false), -/// # Field::new("c3", DataType::Int32, false), -/// # Field::new("c4", DataType::Int32, false), -/// # ])); -/// // create FileScan config for reading arrow files from file:// -/// let object_store_url = ObjectStoreUrl::local_filesystem(); -/// let file_source = Arc::new(ArrowSource::default()); -/// let config = FileScanConfig::new(object_store_url, file_schema, file_source) -/// .with_limit(Some(1000)) // read only the first 1000 records -/// .with_projection(Some(vec![2, 3])) // project columns 2 and 3 -/// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group -/// .with_file(PartitionedFile::new("file1.parquet", 1234)) -/// // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes -/// // in a single row group -/// .with_file_group(vec![ -/// PartitionedFile::new("file2.parquet", 56), -/// PartitionedFile::new("file3.parquet", 78), -/// ]); -/// // create an execution plan from the config -/// let plan: Arc = config.build(); -/// ``` -#[derive(Clone)] -pub struct FileScanConfig { - /// Object store URL, used to get an [`ObjectStore`] instance from - /// [`RuntimeEnv::object_store`] - /// - /// This `ObjectStoreUrl` should be the prefix of the absolute url for files - /// as `file://` or `s3://my_bucket`. It should not include the path to the - /// file itself. The relevant URL prefix must be registered via - /// [`RuntimeEnv::register_object_store`] - /// - /// [`ObjectStore`]: object_store::ObjectStore - /// [`RuntimeEnv::register_object_store`]: datafusion_execution::runtime_env::RuntimeEnv::register_object_store - /// [`RuntimeEnv::object_store`]: datafusion_execution::runtime_env::RuntimeEnv::object_store - pub object_store_url: ObjectStoreUrl, - /// Schema before `projection` is applied. It contains the all columns that may - /// appear in the files. It does not include table partition columns - /// that may be added. - pub file_schema: SchemaRef, - /// List of files to be processed, grouped into partitions - /// - /// Each file must have a schema of `file_schema` or a subset. If - /// a particular file has a subset, the missing columns are - /// padded with NULLs. - /// - /// DataFusion may attempt to read each partition of files - /// concurrently, however files *within* a partition will be read - /// sequentially, one after the next. - pub file_groups: Vec>, - /// Table constraints - pub constraints: Constraints, - /// Estimated overall statistics of the files, taking `filters` into account. - /// Defaults to [`Statistics::new_unknown`]. - pub statistics: Statistics, - /// Columns on which to project the data. Indexes that are higher than the - /// number of columns of `file_schema` refer to `table_partition_cols`. - pub projection: Option>, - /// The maximum number of records to read from this plan. If `None`, - /// all records after filtering are returned. 
- pub limit: Option, - /// The partitioning columns - pub table_partition_cols: Vec, - /// All equivalent lexicographical orderings that describe the schema. - pub output_ordering: Vec, - /// File compression type - pub file_compression_type: FileCompressionType, - /// Are new lines in values supported for CSVOptions - pub new_lines_in_values: bool, - /// File source such as `ParquetSource`, `CsvSource`, `JsonSource`, etc. - pub source: Arc, -} - -impl DataSource for FileScanConfig { - fn open( - &self, - partition: usize, - context: Arc, - ) -> Result { - let object_store = context.runtime_env().object_store(&self.object_store_url)?; - - let source = self - .source - .with_batch_size(context.session_config().batch_size()) - .with_schema(Arc::clone(&self.file_schema)) - .with_projection(self); - - let opener = source.create_file_opener(object_store, self, partition); - - let stream = FileStream::new(self, partition, opener, source.metrics())?; - Ok(Box::pin(stream)) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result { - let (schema, _, _, orderings) = self.project(); - - write!(f, "file_groups=")?; - FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?; - - if !schema.fields().is_empty() { - write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; - } - - if let Some(limit) = self.limit { - write!(f, ", limit={limit}")?; - } - - display_orderings(f, &orderings)?; - - if !self.constraints.is_empty() { - write!(f, ", {}", self.constraints)?; - } - - self.fmt_file_source(t, f) - } - - /// If supported by the underlying [`FileSource`], redistribute files across partitions according to their size. - fn repartitioned( - &self, - target_partitions: usize, - repartition_file_min_size: usize, - output_ordering: Option, - ) -> Result>> { - let source = self.source.repartitioned( - target_partitions, - repartition_file_min_size, - output_ordering, - self, - )?; - - Ok(source.map(|s| Arc::new(s) as _)) - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) - } - - fn eq_properties(&self) -> EquivalenceProperties { - let (schema, constraints, _, orderings) = self.project(); - EquivalenceProperties::new_with_orderings(schema, orderings.as_slice()) - .with_constraints(constraints) - } - - fn statistics(&self) -> Result { - self.source.statistics() - } - - fn with_fetch(&self, limit: Option) -> Option> { - let source = self.clone(); - Some(Arc::new(source.with_limit(limit))) - } - - fn fetch(&self) -> Option { - self.limit - } - - fn metrics(&self) -> ExecutionPlanMetricsSet { - self.source.metrics().clone() - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - // If there is any non-column or alias-carrier expression, Projection should not be removed. - // This process can be moved into CsvExec, but it would be an overlap of their responsibility. - Ok(all_alias_free_columns(projection.expr()).then(|| { - let file_scan = self.clone(); - let source = Arc::clone(&file_scan.source); - let new_projections = new_projections_for_columns( - projection, - &file_scan - .projection - .clone() - .unwrap_or((0..self.file_schema.fields().len()).collect()), - ); - file_scan - // Assign projected statistics to source - .with_projection(Some(new_projections)) - .with_source(source) - .build() as _ - })) - } -} - -impl FileScanConfig { - /// Create a new [`FileScanConfig`] with default settings for scanning files. 
- /// - /// See example on [`FileScanConfig`] - /// - /// No file groups are added by default. See [`Self::with_file`], [`Self::with_file_group`] and - /// [`Self::with_file_groups`]. - /// - /// # Parameters: - /// * `object_store_url`: See [`Self::object_store_url`] - /// * `file_schema`: See [`Self::file_schema`] - pub fn new( - object_store_url: ObjectStoreUrl, - file_schema: SchemaRef, - file_source: Arc, - ) -> Self { - let statistics = Statistics::new_unknown(&file_schema); - - let mut config = Self { - object_store_url, - file_schema, - file_groups: vec![], - constraints: Constraints::empty(), - statistics, - projection: None, - limit: None, - table_partition_cols: vec![], - output_ordering: vec![], - file_compression_type: FileCompressionType::UNCOMPRESSED, - new_lines_in_values: false, - source: Arc::clone(&file_source), - }; - - config = config.with_source(Arc::clone(&file_source)); - config - } - - /// Set the file source - pub fn with_source(mut self, source: Arc) -> Self { - let ( - _projected_schema, - _constraints, - projected_statistics, - _projected_output_ordering, - ) = self.project(); - self.source = source.with_statistics(projected_statistics); - self - } - - /// Set the table constraints of the files - pub fn with_constraints(mut self, constraints: Constraints) -> Self { - self.constraints = constraints; - self - } - - /// Set the statistics of the files - pub fn with_statistics(mut self, statistics: Statistics) -> Self { - self.statistics = statistics; - self - } - - /// Set the projection of the files - pub fn with_projection(mut self, projection: Option>) -> Self { - self.projection = projection; - self - } - - /// Set the limit of the files - pub fn with_limit(mut self, limit: Option) -> Self { - self.limit = limit; - self - } - - /// Add a file as a single group - /// - /// See [Self::file_groups] for more information. - pub fn with_file(self, file: PartitionedFile) -> Self { - self.with_file_group(vec![file]) - } - - /// Add the file groups - /// - /// See [Self::file_groups] for more information. - pub fn with_file_groups( - mut self, - mut file_groups: Vec>, - ) -> Self { - self.file_groups.append(&mut file_groups); - self - } - - /// Add a new file group - /// - /// See [Self::file_groups] for more information - pub fn with_file_group(mut self, file_group: Vec) -> Self { - self.file_groups.push(file_group); - self - } - - /// Set the partitioning columns of the files - pub fn with_table_partition_cols(mut self, table_partition_cols: Vec) -> Self { - self.table_partition_cols = table_partition_cols; - self - } - - /// Set the output ordering of the files - pub fn with_output_ordering(mut self, output_ordering: Vec) -> Self { - self.output_ordering = output_ordering; - self - } - - /// Set the file compression type - pub fn with_file_compression_type( - mut self, - file_compression_type: FileCompressionType, - ) -> Self { - self.file_compression_type = file_compression_type; - self - } - - /// Set the new_lines_in_values property - pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self { - self.new_lines_in_values = new_lines_in_values; - self - } - - /// Specifies whether newlines in (quoted) values are supported. - /// - /// Parsing newlines in quoted values may be affected by execution behaviour such as - /// parallel file scanning. Setting this to `true` ensures that newlines in values are - /// parsed successfully, which may reduce performance. 
- /// - /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting. - pub fn newlines_in_values(&self) -> bool { - self.new_lines_in_values - } - - /// Project the schema, constraints, and the statistics on the given column indices - pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec) { - if self.projection.is_none() && self.table_partition_cols.is_empty() { - return ( - Arc::clone(&self.file_schema), - self.constraints.clone(), - self.statistics.clone(), - self.output_ordering.clone(), - ); - } - - let proj_indices = if let Some(proj) = &self.projection { - proj - } else { - let len = self.file_schema.fields().len() + self.table_partition_cols.len(); - &(0..len).collect::>() - }; - - let mut table_fields = vec![]; - let mut table_cols_stats = vec![]; - for idx in proj_indices { - if *idx < self.file_schema.fields().len() { - let field = self.file_schema.field(*idx); - table_fields.push(field.clone()); - table_cols_stats.push(self.statistics.column_statistics[*idx].clone()) - } else { - let partition_idx = idx - self.file_schema.fields().len(); - table_fields.push(self.table_partition_cols[partition_idx].to_owned()); - // TODO provide accurate stat for partition column (#1186) - table_cols_stats.push(ColumnStatistics::new_unknown()) - } - } - - let table_stats = Statistics { - num_rows: self.statistics.num_rows, - // TODO correct byte size? - total_byte_size: Precision::Absent, - column_statistics: table_cols_stats, - }; - - let projected_schema = Arc::new(Schema::new_with_metadata( - table_fields, - self.file_schema.metadata().clone(), - )); - - let projected_constraints = self - .constraints - .project(proj_indices) - .unwrap_or_else(Constraints::empty); - - let projected_output_ordering = - get_projected_output_ordering(self, &projected_schema); - - ( - projected_schema, - projected_constraints, - table_stats, - projected_output_ordering, - ) - } - - #[cfg_attr(not(feature = "avro"), allow(unused))] // Only used by avro - pub(crate) fn projected_file_column_names(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema.fields().len()) - .map(|col_idx| self.file_schema.field(*col_idx).name()) - .cloned() - .collect() - }) - } - - /// Projects only file schema, ignoring partition columns - pub(crate) fn projected_file_schema(&self) -> SchemaRef { - let fields = self.file_column_projection_indices().map(|indices| { - indices - .iter() - .map(|col_idx| self.file_schema.field(*col_idx)) - .cloned() - .collect::>() - }); - - fields.map_or_else( - || Arc::clone(&self.file_schema), - |f| { - Arc::new(Schema::new_with_metadata( - f, - self.file_schema.metadata.clone(), - )) - }, - ) - } - - pub(crate) fn file_column_projection_indices(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema.fields().len()) - .copied() - .collect() - }) - } - - /// Attempts to do a bin-packing on files into file groups, such that any two files - /// in a file group are ordered and non-overlapping with respect to their statistics. - /// It will produce the smallest number of file groups possible. - pub fn split_groups_by_statistics( - table_schema: &SchemaRef, - file_groups: &[Vec], - sort_order: &LexOrdering, - ) -> Result>> { - let flattened_files = file_groups.iter().flatten().collect::>(); - // First Fit: - // * Choose the first file group that a file can be placed into. - // * If it fits into no existing file groups, create a new one. 
- // - // By sorting files by min values and then applying first-fit bin packing, - // we can produce the smallest number of file groups such that - // files within a group are in order and non-overlapping. - // - // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 - // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html - - if flattened_files.is_empty() { - return Ok(vec![]); - } - - let statistics = MinMaxStatistics::new_from_files( - sort_order, - table_schema, - None, - flattened_files.iter().copied(), - ) - .map_err(|e| { - e.context("construct min/max statistics for split_groups_by_statistics") - })?; - - let indices_sorted_by_min = statistics.min_values_sorted(); - let mut file_groups_indices: Vec> = vec![]; - - for (idx, min) in indices_sorted_by_min { - let file_group_to_insert = file_groups_indices.iter_mut().find(|group| { - // If our file is non-overlapping and comes _after_ the last file, - // it fits in this file group. - min > statistics.max( - *group - .last() - .expect("groups should be nonempty at construction"), - ) - }); - match file_group_to_insert { - Some(group) => group.push(idx), - None => file_groups_indices.push(vec![idx]), - } - } - - // Assemble indices back into groups of PartitionedFiles - Ok(file_groups_indices - .into_iter() - .map(|file_group_indices| { - file_group_indices - .into_iter() - .map(|idx| flattened_files[idx].clone()) - .collect() - }) - .collect()) - } - - // TODO: This function should be moved into DataSourceExec once FileScanConfig moved out of datafusion/core - /// Returns a new [`DataSourceExec`] to scan the files specified by this config - pub fn build(self) -> Arc { - Arc::new(DataSourceExec::new(Arc::new(self))) - } - - /// Write the data_type based on file_source - fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result { - write!(f, ", file_type={}", self.source.file_type())?; - self.source.fmt_extra(t, f) - } - - /// Returns the file_source - pub fn file_source(&self) -> &Arc { - &self.source - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::datasource::physical_plan::ArrowSource; - use crate::{test::columns, test_util::aggr_test_schema}; - use arrow::array::{Int32Array, RecordBatch}; - use std::collections::HashMap; - - #[test] - fn physical_plan_config_no_projection() { - let file_schema = aggr_test_schema(); - let conf = config_for_projection( - Arc::clone(&file_schema), - None, - Statistics::new_unknown(&file_schema), - to_partition_cols(vec![( - "date".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - )]), - ); - - let (proj_schema, _, proj_statistics, _) = conf.project(); - assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); - assert_eq!( - proj_schema.field(file_schema.fields().len()).name(), - "date", - "partition columns are the last columns" - ); - assert_eq!( - proj_statistics.column_statistics.len(), - file_schema.fields().len() + 1 - ); - // TODO implement tests for partition column statistics once implemented - - let col_names = conf.projected_file_column_names(); - assert_eq!(col_names, None); - - let col_indices = conf.file_column_projection_indices(); - assert_eq!(col_indices, None); - } - - #[test] - fn physical_plan_config_no_projection_tab_cols_as_field() { - let file_schema = aggr_test_schema(); - - // make a table_partition_col as a field - let table_partition_col = - Field::new("date", wrap_partition_type_in_dict(DataType::Utf8), true) - .with_metadata(HashMap::from_iter(vec![( - 
"key_whatever".to_owned(), - "value_whatever".to_owned(), - )])); - - let conf = config_for_projection( - Arc::clone(&file_schema), - None, - Statistics::new_unknown(&file_schema), - vec![table_partition_col.clone()], - ); - - // verify the proj_schema includes the last column and exactly the same the field it is defined - let (proj_schema, _, _, _) = conf.project(); - assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); - assert_eq!( - *proj_schema.field(file_schema.fields().len()), - table_partition_col, - "partition columns are the last columns and ust have all values defined in created field" - ); - } - - #[test] - fn physical_plan_config_with_projection() { - let file_schema = aggr_test_schema(); - let conf = config_for_projection( - Arc::clone(&file_schema), - Some(vec![file_schema.fields().len(), 0]), - Statistics { - num_rows: Precision::Inexact(10), - // assign the column index to distinct_count to help assert - // the source statistic after the projection - column_statistics: (0..file_schema.fields().len()) - .map(|i| ColumnStatistics { - distinct_count: Precision::Inexact(i), - ..Default::default() - }) - .collect(), - total_byte_size: Precision::Absent, - }, - to_partition_cols(vec![( - "date".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - )]), - ); - - let (proj_schema, _, proj_statistics, _) = conf.project(); - assert_eq!( - columns(&proj_schema), - vec!["date".to_owned(), "c1".to_owned()] - ); - let proj_stat_cols = proj_statistics.column_statistics; - assert_eq!(proj_stat_cols.len(), 2); - // TODO implement tests for proj_stat_cols[0] once partition column - // statistics are implemented - assert_eq!(proj_stat_cols[1].distinct_count, Precision::Inexact(0)); - - let col_names = conf.projected_file_column_names(); - assert_eq!(col_names, Some(vec!["c1".to_owned()])); - - let col_indices = conf.file_column_projection_indices(); - assert_eq!(col_indices, Some(vec![0])); - } - - #[test] - fn partition_column_projector() { - let file_batch = build_table_i32( - ("a", &vec![0, 1, 2]), - ("b", &vec![-2, -1, 0]), - ("c", &vec![10, 11, 12]), - ); - let partition_cols = vec![ - ( - "year".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "month".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "day".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ]; - // create a projected schema - let conf = config_for_projection( - file_batch.schema(), - // keep all cols from file and 2 from partitioning - Some(vec![ - 0, - 1, - 2, - file_batch.schema().fields().len(), - file_batch.schema().fields().len() + 2, - ]), - Statistics::new_unknown(&file_batch.schema()), - to_partition_cols(partition_cols.clone()), - ); - let (proj_schema, ..) 
= conf.project(); - // created a projector for that projected schema - let mut proj = PartitionColumnProjector::new( - proj_schema, - &partition_cols - .iter() - .map(|x| x.0.clone()) - .collect::>(), - ); - - // project first batch - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("26")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+----+----+------+-----+", - "| a | b | c | year | day |", - "+---+----+----+------+-----+", - "| 0 | -2 | 10 | 2021 | 26 |", - "| 1 | -1 | 11 | 2021 | 26 |", - "| 2 | 0 | 12 | 2021 | 26 |", - "+---+----+----+------+-----+", - ]; - crate::assert_batches_eq!(expected, &[projected_batch]); - - // project another batch that is larger than the previous one - let file_batch = build_table_i32( - ("a", &vec![5, 6, 7, 8, 9]), - ("b", &vec![-10, -9, -8, -7, -6]), - ("c", &vec![12, 13, 14, 15, 16]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("27")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+-----+----+------+-----+", - "| a | b | c | year | day |", - "+---+-----+----+------+-----+", - "| 5 | -10 | 12 | 2021 | 27 |", - "| 6 | -9 | 13 | 2021 | 27 |", - "| 7 | -8 | 14 | 2021 | 27 |", - "| 8 | -7 | 15 | 2021 | 27 |", - "| 9 | -6 | 16 | 2021 | 27 |", - "+---+-----+----+------+-----+", - ]; - crate::assert_batches_eq!(expected, &[projected_batch]); - - // project another batch that is smaller than the previous one - let file_batch = build_table_i32( - ("a", &vec![0, 1, 3]), - ("b", &vec![2, 3, 4]), - ("c", &vec![4, 5, 6]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("28")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+---+---+------+-----+", - "| a | b | c | year | day |", - "+---+---+---+------+-----+", - "| 0 | 2 | 4 | 2021 | 28 |", - "| 1 | 3 | 5 | 2021 | 28 |", - "| 3 | 4 | 6 | 2021 | 28 |", - "+---+---+---+------+-----+", - ]; - crate::assert_batches_eq!(expected, &[projected_batch]); - - // forgot to dictionary-wrap the scalar value - let file_batch = build_table_i32( - ("a", &vec![0, 1, 2]), - ("b", &vec![-2, -1, 0]), - ("c", &vec![10, 11, 12]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - ScalarValue::from("2021"), - ScalarValue::from("10"), - ScalarValue::from("26"), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+----+----+------+-----+", - "| a | b | c | year | day |", - "+---+----+----+------+-----+", - "| 0 | -2 | 10 | 2021 | 26 |", - "| 1 | -1 | 11 | 2021 | 26 |", - "| 2 | 0 | 12 | 2021 | 26 |", - "+---+----+----+------+-----+", - ]; - 
crate::assert_batches_eq!(expected, &[projected_batch]); - } - - #[test] - fn test_projected_file_schema_with_partition_col() { - let schema = aggr_test_schema(); - let partition_cols = vec![ - ( - "part1".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "part2".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ]; - - // Projected file schema for config with projection including partition column - let projection = config_for_projection( - schema.clone(), - Some(vec![0, 3, 5, schema.fields().len()]), - Statistics::new_unknown(&schema), - to_partition_cols(partition_cols), - ) - .projected_file_schema(); - - // Assert partition column filtered out in projected file schema - let expected_columns = vec!["c1", "c4", "c6"]; - let actual_columns = projection - .fields() - .iter() - .map(|f| f.name().clone()) - .collect::>(); - assert_eq!(expected_columns, actual_columns); - } - - #[test] - fn test_projected_file_schema_without_projection() { - let schema = aggr_test_schema(); - let partition_cols = vec![ - ( - "part1".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "part2".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ]; - - // Projected file schema for config without projection - let projection = config_for_projection( - schema.clone(), - None, - Statistics::new_unknown(&schema), - to_partition_cols(partition_cols), - ) - .projected_file_schema(); - - // Assert projected file schema is equal to file schema - assert_eq!(projection.fields(), schema.fields()); - } - - #[test] - fn test_split_groups_by_statistics() -> Result<()> { - use chrono::TimeZone; - use datafusion_common::DFSchema; - use datafusion_expr::execution_props::ExecutionProps; - use object_store::{path::Path, ObjectMeta}; - - struct File { - name: &'static str, - date: &'static str, - statistics: Vec>, - } - impl File { - fn new( - name: &'static str, - date: &'static str, - statistics: Vec>, - ) -> Self { - Self { - name, - date, - statistics, - } - } - } - - struct TestCase { - name: &'static str, - file_schema: Schema, - files: Vec, - sort: Vec, - expected_result: Result>, &'static str>, - } - - use datafusion_expr::col; - let cases = vec![ - TestCase { - name: "test sort", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), - File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), - }, - // same input but file '2' is in the middle - // test that we still order correctly - TestCase { - name: "test sort with files ordered differently", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), - File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), - }, - TestCase { - name: "reverse sort", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), - File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), - ], - sort: 
vec![col("value").sort(false, true)], - expected_result: Ok(vec![vec!["1", "0"], vec!["2"]]), - }, - // reject nullable sort columns - TestCase { - name: "no nullable sort columns", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - true, // should fail because nullable - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), - File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column") - }, - TestCase { - name: "all three non-overlapping", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.50, 0.99))]), - File::new("2", "2023-01-02", vec![Some((1.00, 1.49))]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![vec!["0", "1", "2"]]), - }, - TestCase { - name: "all three overlapping", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("2", "2023-01-02", vec![Some((0.00, 0.49))]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![vec!["0"], vec!["1"], vec!["2"]]), - }, - TestCase { - name: "empty input", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![], - sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![]), - }, - TestCase { - name: "one file missing statistics", - file_schema: Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )]), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("2", "2023-01-02", vec![None]), - ], - sort: vec![col("value").sort(true, false)], - expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found"), - }, - ]; - - for case in cases { - let table_schema = Arc::new(Schema::new( - case.file_schema - .fields() - .clone() - .into_iter() - .cloned() - .chain(Some(Arc::new(Field::new( - "date".to_string(), - DataType::Utf8, - false, - )))) - .collect::>(), - )); - let sort_order = LexOrdering::from( - case.sort - .into_iter() - .map(|expr| { - crate::physical_planner::create_physical_sort_expr( - &expr, - &DFSchema::try_from(table_schema.as_ref().clone())?, - &ExecutionProps::default(), - ) - }) - .collect::>>()?, - ); - - let partitioned_files = - case.files.into_iter().map(From::from).collect::>(); - let result = FileScanConfig::split_groups_by_statistics( - &table_schema, - &[partitioned_files.clone()], - &sort_order, - ); - let results_by_name = result - .as_ref() - .map(|file_groups| { - file_groups - .iter() - .map(|file_group| { - file_group - .iter() - .map(|file| { - partitioned_files - .iter() - .find_map(|f| { - if f.object_meta == file.object_meta { - Some( - f.object_meta - .location - .as_ref() - 
.rsplit('/') - .next() - .unwrap() - .trim_end_matches(".parquet"), - ) - } else { - None - } - }) - .unwrap() - }) - .collect::>() - }) - .collect::>() - }) - .map_err(|e| e.strip_backtrace().leak() as &'static str); - - assert_eq!(results_by_name, case.expected_result, "{}", case.name); - } - - return Ok(()); - - impl From for PartitionedFile { - fn from(file: File) -> Self { - PartitionedFile { - object_meta: ObjectMeta { - location: Path::from(format!( - "data/date={}/{}.parquet", - file.date, file.name - )), - last_modified: chrono::Utc.timestamp_nanos(0), - size: 0, - e_tag: None, - version: None, - }, - partition_values: vec![ScalarValue::from(file.date)], - range: None, - statistics: Some(Statistics { - num_rows: Precision::Absent, - total_byte_size: Precision::Absent, - column_statistics: file - .statistics - .into_iter() - .map(|stats| { - stats - .map(|(min, max)| ColumnStatistics { - min_value: Precision::Exact(ScalarValue::from( - min, - )), - max_value: Precision::Exact(ScalarValue::from( - max, - )), - ..Default::default() - }) - .unwrap_or_default() - }) - .collect::>(), - }), - extensions: None, - metadata_size_hint: None, - } - } - } - } - - // sets default for configs that play no role in projections - fn config_for_projection( - file_schema: SchemaRef, - projection: Option>, - statistics: Statistics, - table_partition_cols: Vec, - ) -> FileScanConfig { - FileScanConfig::new( - ObjectStoreUrl::parse("test:///").unwrap(), - file_schema, - Arc::new(ArrowSource::default()), - ) - .with_projection(projection) - .with_statistics(statistics) - .with_table_partition_cols(table_partition_cols) - } - - /// Convert partition columns from Vec to Vec - fn to_partition_cols(table_partition_cols: Vec<(String, DataType)>) -> Vec { - table_partition_cols - .iter() - .map(|(name, dtype)| Field::new(name, dtype.clone(), false)) - .collect::>() - } - - /// returns record batch with 3 columns of i32 in memory - pub fn build_table_i32( - a: (&str, &Vec), - b: (&str, &Vec), - c: (&str, &Vec), - ) -> RecordBatch { - let schema = Schema::new(vec![ - Field::new(a.0, DataType::Int32, false), - Field::new(b.0, DataType::Int32, false), - Field::new(c.0, DataType::Int32, false), - ]); - - RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(Int32Array::from(a.1.clone())), - Arc::new(Int32Array::from(b.1.clone())), - Arc::new(Int32Array::from(c.1.clone())), - ], - ) - .unwrap() - } -} diff --git a/datafusion/core/src/datasource/physical_plan/file_stream.rs b/datafusion/core/src/datasource/physical_plan/file_stream.rs deleted file mode 100644 index 7944d6fa9020..000000000000 --- a/datafusion/core/src/datasource/physical_plan/file_stream.rs +++ /dev/null @@ -1,799 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
A generic stream over file format readers that can be used by -//! any file format that read its files from start to end. -//! -//! Note: Most traits here need to be marked `Sync + Send` to be -//! compliant with the `SendableRecordBatchStream` trait. - -use std::collections::VecDeque; -use std::mem; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use crate::datasource::listing::PartitionedFile; -use crate::datasource::physical_plan::file_scan_config::PartitionColumnProjector; -use crate::datasource::physical_plan::{FileMeta, FileScanConfig}; -use crate::error::Result; -use crate::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet}; -use crate::physical_plan::RecordBatchStream; - -use arrow::datatypes::SchemaRef; -use arrow::error::ArrowError; -use arrow::record_batch::RecordBatch; -use datafusion_common::ScalarValue; -pub use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener, OnError}; -use datafusion_datasource::file_stream::{FileStreamMetrics, FileStreamState, NextOpen}; - -use futures::{ready, FutureExt, Stream, StreamExt}; - -/// A stream that iterates record batch by record batch, file over file. -pub struct FileStream { - /// An iterator over input files. - file_iter: VecDeque, - /// The stream schema (file schema including partition columns and after - /// projection). - projected_schema: SchemaRef, - /// The remaining number of records to parse, None if no limit - remain: Option, - /// A dynamic [`FileOpener`]. Calling `open()` returns a [`FileOpenFuture`], - /// which can be resolved to a stream of `RecordBatch`. - file_opener: Arc, - /// The partition column projector - pc_projector: PartitionColumnProjector, - /// The stream state - state: FileStreamState, - /// File stream specific metrics - file_stream_metrics: FileStreamMetrics, - /// runtime baseline metrics - baseline_metrics: BaselineMetrics, - /// Describes the behavior of the `FileStream` if file opening or scanning fails - on_error: OnError, -} - -impl FileStream { - /// Create a new `FileStream` using the give `FileOpener` to scan underlying files - pub fn new( - config: &FileScanConfig, - partition: usize, - file_opener: Arc, - metrics: &ExecutionPlanMetricsSet, - ) -> Result { - let (projected_schema, ..) = config.project(); - let pc_projector = PartitionColumnProjector::new( - Arc::clone(&projected_schema), - &config - .table_partition_cols - .iter() - .map(|x| x.name().clone()) - .collect::>(), - ); - - let files = config.file_groups[partition].clone(); - - Ok(Self { - file_iter: files.into(), - projected_schema, - remain: config.limit, - file_opener, - pc_projector, - state: FileStreamState::Idle, - file_stream_metrics: FileStreamMetrics::new(metrics, partition), - baseline_metrics: BaselineMetrics::new(metrics, partition), - on_error: OnError::Fail, - }) - } - - /// Specify the behavior when an error occurs opening or scanning a file - /// - /// If `OnError::Skip` the stream will skip files which encounter an error and continue - /// If `OnError:Fail` (default) the stream will fail and stop processing when an error occurs - pub fn with_on_error(mut self, on_error: OnError) -> Self { - self.on_error = on_error; - self - } - - /// Begin opening the next file in parallel while decoding the current file in FileStream. - /// - /// Since file opening is mostly IO (and may involve a - /// bunch of sequential IO), it can be parallelized with decoding. 
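    // Editor's note on the method below (an annotation, not part of the patch): the
    // deleted `FileStream` drives a small state machine. `Idle` starts opening the
    // next file, `Open` waits for that `FileOpenFuture`, and `Scan` decodes batches
    // while also polling the next file's `NextOpen::Pending` future, so opening file
    // N+1 overlaps with decoding file N. That overlap is the parallelism the doc
    // comment above describes.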
- fn start_next_file(&mut self) -> Option)>> { - let part_file = self.file_iter.pop_front()?; - - let file_meta = FileMeta { - object_meta: part_file.object_meta, - range: part_file.range, - extensions: part_file.extensions, - metadata_size_hint: part_file.metadata_size_hint, - }; - - Some( - self.file_opener - .open(file_meta) - .map(|future| (future, part_file.partition_values)), - ) - } - - fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll>> { - loop { - match &mut self.state { - FileStreamState::Idle => { - self.file_stream_metrics.time_opening.start(); - - match self.start_next_file().transpose() { - Ok(Some((future, partition_values))) => { - self.state = FileStreamState::Open { - future, - partition_values, - } - } - Ok(None) => return Poll::Ready(None), - Err(e) => { - self.state = FileStreamState::Error; - return Poll::Ready(Some(Err(e))); - } - } - } - FileStreamState::Open { - future, - partition_values, - } => match ready!(future.poll_unpin(cx)) { - Ok(reader) => { - let partition_values = mem::take(partition_values); - - // include time needed to start opening in `start_next_file` - self.file_stream_metrics.time_opening.stop(); - let next = self.start_next_file().transpose(); - self.file_stream_metrics.time_scanning_until_data.start(); - self.file_stream_metrics.time_scanning_total.start(); - - match next { - Ok(Some((next_future, next_partition_values))) => { - self.state = FileStreamState::Scan { - partition_values, - reader, - next: Some(( - NextOpen::Pending(next_future), - next_partition_values, - )), - }; - } - Ok(None) => { - self.state = FileStreamState::Scan { - reader, - partition_values, - next: None, - }; - } - Err(e) => { - self.state = FileStreamState::Error; - return Poll::Ready(Some(Err(e))); - } - } - } - Err(e) => { - self.file_stream_metrics.file_open_errors.add(1); - match self.on_error { - OnError::Skip => { - self.file_stream_metrics.time_opening.stop(); - self.state = FileStreamState::Idle - } - OnError::Fail => { - self.state = FileStreamState::Error; - return Poll::Ready(Some(Err(e))); - } - } - } - }, - FileStreamState::Scan { - reader, - partition_values, - next, - } => { - // We need to poll the next `FileOpenFuture` here to drive it forward - if let Some((next_open_future, _)) = next { - if let NextOpen::Pending(f) = next_open_future { - if let Poll::Ready(reader) = f.as_mut().poll(cx) { - *next_open_future = NextOpen::Ready(reader); - } - } - } - match ready!(reader.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - self.file_stream_metrics.time_scanning_until_data.stop(); - self.file_stream_metrics.time_scanning_total.stop(); - let result = self - .pc_projector - .project(batch, partition_values) - .map_err(|e| ArrowError::ExternalError(e.into())) - .map(|batch| match &mut self.remain { - Some(remain) => { - if *remain > batch.num_rows() { - *remain -= batch.num_rows(); - batch - } else { - let batch = batch.slice(0, *remain); - self.state = FileStreamState::Limit; - *remain = 0; - batch - } - } - None => batch, - }); - - if result.is_err() { - // If the partition value projection fails, this is not governed by - // the `OnError` behavior - self.state = FileStreamState::Error - } - self.file_stream_metrics.time_scanning_total.start(); - return Poll::Ready(Some(result.map_err(Into::into))); - } - Some(Err(err)) => { - self.file_stream_metrics.file_scan_errors.add(1); - self.file_stream_metrics.time_scanning_until_data.stop(); - self.file_stream_metrics.time_scanning_total.stop(); - - match self.on_error { - // If `OnError::Skip` we skip the file 
as soon as we hit the first error - OnError::Skip => match mem::take(next) { - Some((future, partition_values)) => { - self.file_stream_metrics.time_opening.start(); - - match future { - NextOpen::Pending(future) => { - self.state = FileStreamState::Open { - future, - partition_values, - } - } - NextOpen::Ready(reader) => { - self.state = FileStreamState::Open { - future: Box::pin(std::future::ready( - reader, - )), - partition_values, - } - } - } - } - None => return Poll::Ready(None), - }, - OnError::Fail => { - self.state = FileStreamState::Error; - return Poll::Ready(Some(Err(err.into()))); - } - } - } - None => { - self.file_stream_metrics.time_scanning_until_data.stop(); - self.file_stream_metrics.time_scanning_total.stop(); - - match mem::take(next) { - Some((future, partition_values)) => { - self.file_stream_metrics.time_opening.start(); - - match future { - NextOpen::Pending(future) => { - self.state = FileStreamState::Open { - future, - partition_values, - } - } - NextOpen::Ready(reader) => { - self.state = FileStreamState::Open { - future: Box::pin(std::future::ready( - reader, - )), - partition_values, - } - } - } - } - None => return Poll::Ready(None), - } - } - } - } - FileStreamState::Error | FileStreamState::Limit => { - return Poll::Ready(None) - } - } - } - } -} - -impl Stream for FileStream { - type Item = Result; - - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - self.file_stream_metrics.time_processing.start(); - let result = self.poll_inner(cx); - self.file_stream_metrics.time_processing.stop(); - self.baseline_metrics.record_poll(result) - } -} - -impl RecordBatchStream for FileStream { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.projected_schema) - } -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::{AtomicUsize, Ordering}; - use std::sync::Arc; - - use super::*; - use crate::datasource::object_store::ObjectStoreUrl; - use crate::prelude::SessionContext; - use crate::test::{make_partition, object_store::register_test_store}; - - use crate::datasource::physical_plan::CsvSource; - use arrow::datatypes::Schema; - use datafusion_common::internal_err; - - /// Test `FileOpener` which will simulate errors during file opening or scanning - #[derive(Default)] - struct TestOpener { - /// Index in stream of files which should throw an error while opening - error_opening_idx: Vec, - /// Index in stream of files which should throw an error while scanning - error_scanning_idx: Vec, - /// Index of last file in stream - current_idx: AtomicUsize, - /// `RecordBatch` to return - records: Vec, - } - - impl FileOpener for TestOpener { - fn open(&self, _file_meta: FileMeta) -> Result { - let idx = self.current_idx.fetch_add(1, Ordering::SeqCst); - - if self.error_opening_idx.contains(&idx) { - Ok(futures::future::ready(internal_err!("error opening")).boxed()) - } else if self.error_scanning_idx.contains(&idx) { - let error = futures::future::ready(Err(ArrowError::IpcError( - "error scanning".to_owned(), - ))); - let stream = futures::stream::once(error).boxed(); - Ok(futures::future::ready(Ok(stream)).boxed()) - } else { - let iterator = self.records.clone().into_iter().map(Ok); - let stream = futures::stream::iter(iterator).boxed(); - Ok(futures::future::ready(Ok(stream)).boxed()) - } - } - } - - #[derive(Default)] - struct FileStreamTest { - /// Number of files in the stream - num_files: usize, - /// Global limit of records emitted by the stream - limit: Option, - /// Error-handling behavior of the stream - on_error: OnError, - /// 
Mock `FileOpener` - opener: TestOpener, - } - - impl FileStreamTest { - pub fn new() -> Self { - Self::default() - } - - /// Specify the number of files in the stream - pub fn with_num_files(mut self, num_files: usize) -> Self { - self.num_files = num_files; - self - } - - /// Specify the limit - pub fn with_limit(mut self, limit: Option) -> Self { - self.limit = limit; - self - } - - /// Specify the index of files in the stream which should - /// throw an error when opening - pub fn with_open_errors(mut self, idx: Vec) -> Self { - self.opener.error_opening_idx = idx; - self - } - - /// Specify the index of files in the stream which should - /// throw an error when scanning - pub fn with_scan_errors(mut self, idx: Vec) -> Self { - self.opener.error_scanning_idx = idx; - self - } - - /// Specify the behavior of the stream when an error occurs - pub fn with_on_error(mut self, on_error: OnError) -> Self { - self.on_error = on_error; - self - } - - /// Specify the record batches that should be returned from each - /// file that is successfully scanned - pub fn with_records(mut self, records: Vec) -> Self { - self.opener.records = records; - self - } - - /// Collect the results of the `FileStream` - pub async fn result(self) -> Result> { - let file_schema = self - .opener - .records - .first() - .map(|batch| batch.schema()) - .unwrap_or_else(|| Arc::new(Schema::empty())); - - let ctx = SessionContext::new(); - let mock_files: Vec<(String, u64)> = (0..self.num_files) - .map(|idx| (format!("mock_file{idx}"), 10_u64)) - .collect(); - - let mock_files_ref: Vec<(&str, u64)> = mock_files - .iter() - .map(|(name, size)| (name.as_str(), *size)) - .collect(); - - register_test_store(&ctx, &mock_files_ref); - - let file_group = mock_files - .into_iter() - .map(|(name, size)| PartitionedFile::new(name, size)) - .collect(); - - let on_error = self.on_error; - - let config = FileScanConfig::new( - ObjectStoreUrl::parse("test:///").unwrap(), - file_schema, - Arc::new(CsvSource::default()), - ) - .with_file_group(file_group) - .with_limit(self.limit); - let metrics_set = ExecutionPlanMetricsSet::new(); - let file_stream = - FileStream::new(&config, 0, Arc::new(self.opener), &metrics_set) - .unwrap() - .with_on_error(on_error); - - file_stream - .collect::>() - .await - .into_iter() - .collect::>>() - } - } - - /// helper that creates a stream of 2 files with the same pair of batches in each ([0,1,2] and [0,1]) - async fn create_and_collect(limit: Option) -> Vec { - FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_limit(limit) - .result() - .await - .expect("error executing stream") - } - - #[tokio::test] - async fn on_error_opening() -> Result<()> { - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_open_errors(vec![0]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_open_errors(vec![1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - 
.with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_open_errors(vec![0, 1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "++", - "++", - ], &batches); - - Ok(()) - } - - #[tokio::test] - async fn on_error_scanning_fail() -> Result<()> { - let result = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Fail) - .with_scan_errors(vec![1]) - .result() - .await; - - assert!(result.is_err()); - - Ok(()) - } - - #[tokio::test] - async fn on_error_opening_fail() -> Result<()> { - let result = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Fail) - .with_open_errors(vec![1]) - .result() - .await; - - assert!(result.is_err()); - - Ok(()) - } - - #[tokio::test] - async fn on_error_scanning() -> Result<()> { - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_scan_errors(vec![0]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_scan_errors(vec![1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(2) - .with_on_error(OnError::Skip) - .with_scan_errors(vec![0, 1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "++", - "++", - ], &batches); - - Ok(()) - } - - #[tokio::test] - async fn on_error_mixed() -> Result<()> { - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(3) - .with_on_error(OnError::Skip) - .with_open_errors(vec![1]) - .with_scan_errors(vec![0]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(3) - .with_on_error(OnError::Skip) - .with_open_errors(vec![0]) - .with_scan_errors(vec![1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(3) - .with_on_error(OnError::Skip) - .with_open_errors(vec![2]) - .with_scan_errors(vec![0, 1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "++", - "++", - ], &batches); - - let batches = FileStreamTest::new() - .with_records(vec![make_partition(3), make_partition(2)]) - .with_num_files(3) - .with_on_error(OnError::Skip) - .with_open_errors(vec![0, 2]) - .with_scan_errors(vec![1]) - .result() - .await?; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "++", - "++", - ], &batches); - - Ok(()) - } - - #[tokio::test] - 
async fn without_limit() -> Result<()> { - let batches = create_and_collect(None).await; - - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - Ok(()) - } - - #[tokio::test] - async fn with_limit_between_files() -> Result<()> { - let batches = create_and_collect(Some(5)).await; - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "+---+", - ], &batches); - - Ok(()) - } - - #[tokio::test] - async fn with_limit_at_middle_of_batch() -> Result<()> { - let batches = create_and_collect(Some(6)).await; - #[rustfmt::skip] - crate::assert_batches_eq!(&[ - "+---+", - "| i |", - "+---+", - "| 0 |", - "| 1 |", - "| 2 |", - "| 0 |", - "| 1 |", - "| 0 |", - "+---+", - ], &batches); - - Ok(()) - } -} diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index d1ae13b083ab..c92d4dfdf835 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -22,8 +22,8 @@ use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; use std::task::Poll; -use super::{calculate_range, FileScanConfig, RangeCalculation}; -use crate::datasource::data_source::FileSource; +use super::{calculate_range, RangeCalculation}; + use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::{deserialize_stream, DecoderDeserializer}; use crate::datasource::listing::{ListingTableUrl, PartitionedFile}; @@ -35,6 +35,8 @@ use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 953c99322e16..42f6912afec0 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -20,12 +20,10 @@ mod arrow_file; mod avro; mod csv; -mod file_scan_config; -mod file_stream; mod json; #[cfg(feature = "parquet")] pub mod parquet; -mod statistics; + pub(crate) use self::csv::plan_to_csv; pub(crate) use self::json::plan_to_json; #[cfg(feature = "parquet")] @@ -35,12 +33,9 @@ pub use self::parquet::source::ParquetSource; pub use self::parquet::{ ParquetExec, ParquetExecBuilder, ParquetFileMetrics, ParquetFileReaderFactory, }; +use crate::datasource::listing::FileRange; use crate::error::Result; -use crate::physical_plan::{DisplayAs, DisplayFormatType}; -use crate::{ - datasource::listing::{FileRange, PartitionedFile}, - physical_plan::display::{display_orderings, ProjectSchemaDisplay}, -}; +use crate::physical_plan::DisplayAs; #[allow(deprecated)] pub use arrow_file::ArrowExec; pub use arrow_file::ArrowSource; @@ -50,300 +45,24 @@ pub use avro::AvroSource; #[allow(deprecated)] pub use csv::{CsvExec, CsvExecBuilder}; pub use csv::{CsvOpener, CsvSource}; +pub use datafusion_datasource::file::FileSource; pub 
use datafusion_datasource::file_groups::FileGroupPartitioner; pub use datafusion_datasource::file_meta::FileMeta; -pub use datafusion_datasource::file_sink_config::*; -pub use file_scan_config::{ +pub use datafusion_datasource::file_scan_config::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, }; -pub use file_stream::{FileOpenFuture, FileOpener, FileStream, OnError}; +pub use datafusion_datasource::file_sink_config::*; + +pub use datafusion_datasource::file_stream::{ + FileOpenFuture, FileOpener, FileStream, OnError, +}; use futures::StreamExt; #[allow(deprecated)] pub use json::NdJsonExec; pub use json::{JsonOpener, JsonSource}; -use log::debug; -use object_store::{path::Path, GetOptions, GetRange, ObjectStore}; -use std::{ - fmt::{Debug, Formatter, Result as FmtResult}, - ops::Range, - sync::Arc, - vec, -}; - -use arrow::datatypes::SchemaRef; -use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::PhysicalSortExpr; -use datafusion_physical_expr_common::sort_expr::LexOrdering; - -impl Debug for FileScanConfig { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "object_store_url={:?}, ", self.object_store_url)?; - - write!(f, "statistics={:?}, ", self.statistics)?; - - DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f) - } -} - -impl DisplayAs for FileScanConfig { - fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - let (schema, _, _, orderings) = self.project(); - - write!(f, "file_groups=")?; - FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?; - - if !schema.fields().is_empty() { - write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; - } - - if let Some(limit) = self.limit { - write!(f, ", limit={limit}")?; - } - - display_orderings(f, &orderings)?; - - if !self.constraints.is_empty() { - write!(f, ", {}", self.constraints)?; - } - - Ok(()) - } -} - -/// A wrapper to customize partitioned file display -/// -/// Prints in the format: -/// ```text -/// {NUM_GROUPS groups: [[file1, file2,...], [fileN, fileM, ...], ...]} -/// ``` -#[derive(Debug)] -struct FileGroupsDisplay<'a>(&'a [Vec]); - -impl DisplayAs for FileGroupsDisplay<'_> { - fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - let n_groups = self.0.len(); - let groups = if n_groups == 1 { "group" } else { "groups" }; - write!(f, "{{{n_groups} {groups}: [")?; - match t { - DisplayFormatType::Default => { - // To avoid showing too many partitions - let max_groups = 5; - fmt_up_to_n_elements(self.0, max_groups, f, |group, f| { - FileGroupDisplay(group).fmt_as(t, f) - })?; - } - DisplayFormatType::Verbose => { - fmt_elements_split_by_commas(self.0.iter(), f, |group, f| { - FileGroupDisplay(group).fmt_as(t, f) - })? - } - } - write!(f, "]}}") - } -} - -/// A wrapper to customize partitioned group of files display -/// -/// Prints in the format: -/// ```text -/// [file1, file2,...] -/// ``` -#[derive(Debug)] -pub(crate) struct FileGroupDisplay<'a>(pub &'a [PartitionedFile]); - -impl DisplayAs for FileGroupDisplay<'_> { - fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - write!(f, "[")?; - match t { - DisplayFormatType::Default => { - // To avoid showing too many files - let max_files = 5; - fmt_up_to_n_elements(self.0, max_files, f, |pf, f| { - write!(f, "{}", pf.object_meta.location.as_ref())?; - if let Some(range) = pf.range.as_ref() { - write!(f, ":{}..{}", range.start, range.end)?; - } - Ok(()) - })? 
- } - DisplayFormatType::Verbose => { - fmt_elements_split_by_commas(self.0.iter(), f, |pf, f| { - write!(f, "{}", pf.object_meta.location.as_ref())?; - if let Some(range) = pf.range.as_ref() { - write!(f, ":{}..{}", range.start, range.end)?; - } - Ok(()) - })? - } - } - write!(f, "]") - } -} - -/// helper to format an array of up to N elements -fn fmt_up_to_n_elements( - elements: &[E], - n: usize, - f: &mut Formatter, - format_element: F, -) -> FmtResult -where - F: Fn(&E, &mut Formatter) -> FmtResult, -{ - let len = elements.len(); - fmt_elements_split_by_commas(elements.iter().take(n), f, |element, f| { - format_element(element, f) - })?; - // Remaining elements are showed as `...` (to indicate there is more) - if len > n { - write!(f, ", ...")?; - } - Ok(()) -} - -/// helper formatting array elements with a comma and a space between them -fn fmt_elements_split_by_commas( - iter: I, - f: &mut Formatter, - format_element: F, -) -> FmtResult -where - I: Iterator, - F: Fn(E, &mut Formatter) -> FmtResult, -{ - for (idx, element) in iter.enumerate() { - if idx > 0 { - write!(f, ", ")?; - } - format_element(element, f)?; - } - Ok(()) -} - -/// The various listing tables does not attempt to read all files -/// concurrently, instead they will read files in sequence within a -/// partition. This is an important property as it allows plans to -/// run against 1000s of files and not try to open them all -/// concurrently. -/// -/// However, it means if we assign more than one file to a partition -/// the output sort order will not be preserved as illustrated in the -/// following diagrams: -/// -/// When only 1 file is assigned to each partition, each partition is -/// correctly sorted on `(A, B, C)` -/// -/// ```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ -///┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ -/// │ │ 1.parquet │ │ │ │ 2.parquet │ │ │ 3.parquet │ │ │ 4.parquet │ │ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ -/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -/// DataFusion DataFusion DataFusion DataFusion -///┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ -/// -/// DataSourceExec -///``` -/// -/// However, when more than 1 file is assigned to each partition, each -/// partition is NOT correctly sorted on `(A, B, C)`. 
Once the second -/// file is scanned, the same values for A, B and C can be repeated in -/// the same sorted stream -/// -///```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ │ -/// │ │ │ ┃ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ -/// DataFusion DataFusion ┃ -///┃ Partition 1 Partition 2 -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ -/// -/// DataSourceExec -///``` -fn get_projected_output_ordering( - base_config: &FileScanConfig, - projected_schema: &SchemaRef, -) -> Vec { - let mut all_orderings = vec![]; - for output_ordering in &base_config.output_ordering { - let mut new_ordering = LexOrdering::default(); - for PhysicalSortExpr { expr, options } in output_ordering.iter() { - if let Some(col) = expr.as_any().downcast_ref::() { - let name = col.name(); - if let Some((idx, _)) = projected_schema.column_with_name(name) { - // Compute the new sort expression (with correct index) after projection: - new_ordering.push(PhysicalSortExpr { - expr: Arc::new(Column::new(name, idx)), - options: *options, - }); - continue; - } - } - // Cannot find expression in the projected_schema, stop iterating - // since rest of the orderings are violated - break; - } - - // do not push empty entries - // otherwise we may have `Some(vec![])` at the output ordering. - if new_ordering.is_empty() { - continue; - } - - // Check if any file groups are not sorted - if base_config.file_groups.iter().any(|group| { - if group.len() <= 1 { - // File groups with <= 1 files are always sorted - return false; - } - - let statistics = match statistics::MinMaxStatistics::new_from_files( - &new_ordering, - projected_schema, - base_config.projection.as_deref(), - group, - ) { - Ok(statistics) => statistics, - Err(e) => { - log::trace!("Error fetching statistics for file group: {e}"); - // we can't prove that it's ordered, so we have to reject it - return true; - } - }; - - !statistics.is_sorted() - }) { - debug!( - "Skipping specified output ordering {:?}. \ - Some file groups couldn't be determined to be sorted: {:?}", - base_config.output_ordering[0], base_config.file_groups - ); - continue; - } - all_orderings.push(new_ordering); - } - all_orderings -} +use object_store::{path::Path, GetOptions, GetRange, ObjectStore}; +use std::{ops::Range, sync::Arc}; /// Represents the possible outcomes of a range calculation. 
/// @@ -452,7 +171,6 @@ async fn find_first_newline( #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::{DefaultDisplay, VerboseDisplay}; use arrow::array::{ cast::AsArray, @@ -461,12 +179,11 @@ mod tests { StringArray, UInt64Array, }; use arrow::datatypes::{DataType, Field, Schema}; - use object_store::ObjectMeta; + use arrow_schema::SchemaRef; use crate::datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; - use chrono::Utc; #[test] fn schema_mapping_map_batch() { @@ -584,157 +301,4 @@ mod tests { assert_eq!(c4.value(1), 2.0_f32); assert_eq!(c4.value(2), 3.0_f32); } - - #[test] - fn file_groups_display_empty() { - let expected = "{0 groups: []}"; - assert_eq!(DefaultDisplay(FileGroupsDisplay(&[])).to_string(), expected); - } - - #[test] - fn file_groups_display_one() { - let files = [vec![partitioned_file("foo"), partitioned_file("bar")]]; - - let expected = "{1 group: [[foo, bar]]}"; - assert_eq!( - DefaultDisplay(FileGroupsDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_groups_display_many_default() { - let files = [ - vec![partitioned_file("foo"), partitioned_file("bar")], - vec![partitioned_file("baz")], - vec![], - ]; - - let expected = "{3 groups: [[foo, bar], [baz], []]}"; - assert_eq!( - DefaultDisplay(FileGroupsDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_groups_display_many_verbose() { - let files = [ - vec![partitioned_file("foo"), partitioned_file("bar")], - vec![partitioned_file("baz")], - vec![], - ]; - - let expected = "{3 groups: [[foo, bar], [baz], []]}"; - assert_eq!( - VerboseDisplay(FileGroupsDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_groups_display_too_many_default() { - let files = [ - vec![partitioned_file("foo"), partitioned_file("bar")], - vec![partitioned_file("baz")], - vec![partitioned_file("qux")], - vec![partitioned_file("quux")], - vec![partitioned_file("quuux")], - vec![partitioned_file("quuuux")], - vec![], - ]; - - let expected = "{7 groups: [[foo, bar], [baz], [qux], [quux], [quuux], ...]}"; - assert_eq!( - DefaultDisplay(FileGroupsDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_groups_display_too_many_verbose() { - let files = [ - vec![partitioned_file("foo"), partitioned_file("bar")], - vec![partitioned_file("baz")], - vec![partitioned_file("qux")], - vec![partitioned_file("quux")], - vec![partitioned_file("quuux")], - vec![partitioned_file("quuuux")], - vec![], - ]; - - let expected = - "{7 groups: [[foo, bar], [baz], [qux], [quux], [quuux], [quuuux], []]}"; - assert_eq!( - VerboseDisplay(FileGroupsDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_group_display_many_default() { - let files = vec![partitioned_file("foo"), partitioned_file("bar")]; - - let expected = "[foo, bar]"; - assert_eq!( - DefaultDisplay(FileGroupDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_group_display_too_many_default() { - let files = vec![ - partitioned_file("foo"), - partitioned_file("bar"), - partitioned_file("baz"), - partitioned_file("qux"), - partitioned_file("quux"), - partitioned_file("quuux"), - ]; - - let expected = "[foo, bar, baz, qux, quux, ...]"; - assert_eq!( - DefaultDisplay(FileGroupDisplay(&files)).to_string(), - expected - ); - } - - #[test] - fn file_group_display_too_many_verbose() { - let files = vec![ - partitioned_file("foo"), - partitioned_file("bar"), - partitioned_file("baz"), - partitioned_file("qux"), - partitioned_file("quux"), - 
partitioned_file("quuux"), - ]; - - let expected = "[foo, bar, baz, qux, quux, quuux]"; - assert_eq!( - VerboseDisplay(FileGroupDisplay(&files)).to_string(), - expected - ); - } - - /// create a PartitionedFile for testing - fn partitioned_file(path: &str) -> PartitionedFile { - let object_meta = ObjectMeta { - location: Path::parse(path).unwrap(), - last_modified: Utc::now(), - size: 42, - e_tag: None, - version: None, - }; - - PartitionedFile { - object_meta, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - } - } } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 2a2d6d7fefdf..89902336ce5e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -32,9 +32,7 @@ use std::fmt::Formatter; use std::sync::Arc; use crate::datasource::listing::PartitionedFile; -use crate::datasource::physical_plan::{ - parquet::source::ParquetSource, DisplayAs, FileScanConfig, -}; +use crate::datasource::physical_plan::{parquet::source::ParquetSource, DisplayAs}; use crate::datasource::schema_adapter::SchemaAdapterFactory; use crate::{ config::TableParquetOptions, @@ -50,6 +48,7 @@ pub use access_plan::{ParquetAccessPlan, RowGroupAccess}; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::Constraints; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; use datafusion_physical_optimizer::pruning::PruningPredicate; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/source.rs b/datafusion/core/src/datasource/physical_plan/parquet/source.rs index 178de8f51ae4..0f0863905a3c 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/source.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/source.rs @@ -20,13 +20,10 @@ use std::any::Any; use std::fmt::Formatter; use std::sync::Arc; -use crate::datasource::data_source::FileSource; use crate::datasource::physical_plan::parquet::opener::ParquetOpener; use crate::datasource::physical_plan::parquet::page_filter::PagePruningAccessPlanFilter; use crate::datasource::physical_plan::parquet::DefaultParquetFileReaderFactory; -use crate::datasource::physical_plan::{ - FileOpener, FileScanConfig, ParquetFileReaderFactory, -}; +use crate::datasource::physical_plan::{FileOpener, ParquetFileReaderFactory}; use crate::datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; @@ -34,6 +31,8 @@ use crate::datasource::schema_adapter::{ use arrow::datatypes::{Schema, SchemaRef}; use datafusion_common::config::TableParquetOptions; use datafusion_common::Statistics; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs index 41e375cf81f8..8076c114ad16 100644 --- a/datafusion/core/src/datasource/schema_adapter.rs +++ b/datafusion/core/src/datasource/schema_adapter.rs @@ -437,12 +437,13 @@ mod tests { use arrow::array::{Int32Array, 
StringArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; + use datafusion_datasource::file_scan_config::FileScanConfig; use object_store::path::Path; use object_store::ObjectMeta; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; - use crate::datasource::physical_plan::{FileScanConfig, ParquetSource}; + use crate::datasource::physical_plan::ParquetSource; use crate::datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index c569113a27bd..489e1ed240f4 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -25,13 +25,12 @@ use std::io::{BufReader, BufWriter}; use std::path::Path; use std::sync::Arc; -use crate::datasource::data_source::FileSource; use crate::datasource::file_format::csv::CsvFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::FileFormat; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; -use crate::datasource::physical_plan::{CsvSource, FileScanConfig}; +use crate::datasource::physical_plan::CsvSource; use crate::datasource::{MemTable, TableProvider}; use crate::error::Result; use crate::logical_expr::LogicalPlan; @@ -42,6 +41,8 @@ use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::DataFusionError; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; #[cfg(feature = "compression")] diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index fc98b43051f8..9c6888bb8b10 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -26,7 +26,7 @@ use crate::common::ToDFSchema; use crate::config::ConfigOptions; use crate::datasource::listing::{ListingTableUrl, PartitionedFile}; use crate::datasource::object_store::ObjectStoreUrl; -use crate::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use crate::datasource::physical_plan::ParquetSource; use crate::error::Result; use crate::logical_expr::execution_props::ExecutionProps; use crate::logical_expr::simplify::SimplifyContext; @@ -37,6 +37,7 @@ use crate::physical_plan::metrics::MetricsSet; use crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; use object_store::ObjectMeta; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 3412b962d859..ae69b0609a5d 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -28,6 +28,7 @@ use crate::physical_optimizer::test_utils::{ spr_repartition_exec, stream_exec_ordered, union_exec, RequirementsTestExec, }; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_physical_plan::displayable; use arrow::compute::SortOptions; use arrow::datatypes::SchemaRef; @@ -46,7 +47,7 @@ use 
datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeE use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{TreeNode, TransformedResult}; -use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::{CsvSource, ParquetSource}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion::datasource::listing::PartitionedFile; use datafusion_physical_optimizer::enforce_distribution::EnforceDistribution; diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index e4d72c112c38..418c46628daa 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -27,12 +27,13 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; use datafusion_common::{JoinType, Result}; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{WindowFrame, WindowFunctionDefinition}; diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index a1d7af106f69..d75b27c2f685 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -31,6 +31,8 @@ version.workspace = true all-features = true [features] +# Temporary feature while I move things around +avro = [] compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"] default = ["compression"] diff --git a/datafusion/datasource/src/display.rs b/datafusion/datasource/src/display.rs new file mode 100644 index 000000000000..58fc27bb8010 --- /dev/null +++ b/datafusion/datasource/src/display.rs @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
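The new `display.rs` module below builds on `datafusion_physical_plan`'s `DisplayAs`/`DisplayFormatType` machinery and is exercised through the `DefaultDisplay` and `VerboseDisplay` wrappers in its tests. As a rough standalone sketch of that pattern (the `NamesDisplay` type and the `main` harness here are illustrative stand-ins, not items introduced by this patch):

use std::fmt::{Formatter, Result as FmtResult};

use datafusion_physical_plan::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay};

/// Toy stand-in for `FileGroupDisplay`: a list of file names.
struct NamesDisplay<'a>(&'a [&'a str]);

impl DisplayAs for NamesDisplay<'_> {
    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult {
        // Abbreviate in the default (EXPLAIN) form, show everything in the
        // verbose form, mirroring what FileGroupsDisplay/FileGroupDisplay do below.
        let max = if matches!(t, DisplayFormatType::Verbose) {
            self.0.len()
        } else {
            2
        };
        write!(f, "[")?;
        for (idx, name) in self.0.iter().take(max).enumerate() {
            if idx > 0 {
                write!(f, ", ")?;
            }
            write!(f, "{name}")?;
        }
        if self.0.len() > max {
            write!(f, ", ...")?;
        }
        write!(f, "]")
    }
}

fn main() {
    let names = ["a.parquet", "b.parquet", "c.parquet"];
    println!("{}", DefaultDisplay(NamesDisplay(&names))); // [a.parquet, b.parquet, ...]
    println!("{}", VerboseDisplay(NamesDisplay(&names))); // [a.parquet, b.parquet, c.parquet]
}

The default form caps output the same way `FileGroupsDisplay` limits itself to five groups; the verbose form prints every element.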
+ +use datafusion_physical_plan::{DisplayAs, DisplayFormatType}; + +use std::fmt::{Debug, Formatter, Result as FmtResult}; + +use crate::PartitionedFile; + +/// A wrapper to customize partitioned file display +/// +/// Prints in the format: +/// ```text +/// {NUM_GROUPS groups: [[file1, file2,...], [fileN, fileM, ...], ...]} +/// ``` +#[derive(Debug)] +pub(crate) struct FileGroupsDisplay<'a>(pub(crate) &'a [Vec]); + +impl DisplayAs for FileGroupsDisplay<'_> { + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { + let n_groups = self.0.len(); + let groups = if n_groups == 1 { "group" } else { "groups" }; + write!(f, "{{{n_groups} {groups}: [")?; + match t { + DisplayFormatType::Default => { + // To avoid showing too many partitions + let max_groups = 5; + fmt_up_to_n_elements(self.0, max_groups, f, |group, f| { + FileGroupDisplay(group).fmt_as(t, f) + })?; + } + DisplayFormatType::Verbose => { + fmt_elements_split_by_commas(self.0.iter(), f, |group, f| { + FileGroupDisplay(group).fmt_as(t, f) + })? + } + } + write!(f, "]}}") + } +} + +/// A wrapper to customize partitioned group of files display +/// +/// Prints in the format: +/// ```text +/// [file1, file2,...] +/// ``` +#[derive(Debug)] +pub struct FileGroupDisplay<'a>(pub &'a [PartitionedFile]); + +impl DisplayAs for FileGroupDisplay<'_> { + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { + write!(f, "[")?; + match t { + DisplayFormatType::Default => { + // To avoid showing too many files + let max_files = 5; + fmt_up_to_n_elements(self.0, max_files, f, |pf, f| { + write!(f, "{}", pf.object_meta.location.as_ref())?; + if let Some(range) = pf.range.as_ref() { + write!(f, ":{}..{}", range.start, range.end)?; + } + Ok(()) + })? + } + DisplayFormatType::Verbose => { + fmt_elements_split_by_commas(self.0.iter(), f, |pf, f| { + write!(f, "{}", pf.object_meta.location.as_ref())?; + if let Some(range) = pf.range.as_ref() { + write!(f, ":{}..{}", range.start, range.end)?; + } + Ok(()) + })? 
+ } + } + write!(f, "]") + } +} + +/// helper to format an array of up to N elements +fn fmt_up_to_n_elements( + elements: &[E], + n: usize, + f: &mut Formatter, + format_element: F, +) -> FmtResult +where + F: Fn(&E, &mut Formatter) -> FmtResult, +{ + let len = elements.len(); + fmt_elements_split_by_commas(elements.iter().take(n), f, |element, f| { + format_element(element, f) + })?; + // Remaining elements are showed as `...` (to indicate there is more) + if len > n { + write!(f, ", ...")?; + } + Ok(()) +} + +/// helper formatting array elements with a comma and a space between them +fn fmt_elements_split_by_commas( + iter: I, + f: &mut Formatter, + format_element: F, +) -> FmtResult +where + I: Iterator, + F: Fn(E, &mut Formatter) -> FmtResult, +{ + for (idx, element) in iter.enumerate() { + if idx > 0 { + write!(f, ", ")?; + } + format_element(element, f)?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + use datafusion_physical_plan::{DefaultDisplay, VerboseDisplay}; + use object_store::{path::Path, ObjectMeta}; + + use chrono::Utc; + + #[test] + fn file_groups_display_empty() { + let expected = "{0 groups: []}"; + assert_eq!(DefaultDisplay(FileGroupsDisplay(&[])).to_string(), expected); + } + + #[test] + fn file_groups_display_one() { + let files = [vec![partitioned_file("foo"), partitioned_file("bar")]]; + + let expected = "{1 group: [[foo, bar]]}"; + assert_eq!( + DefaultDisplay(FileGroupsDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_groups_display_many_default() { + let files = [ + vec![partitioned_file("foo"), partitioned_file("bar")], + vec![partitioned_file("baz")], + vec![], + ]; + + let expected = "{3 groups: [[foo, bar], [baz], []]}"; + assert_eq!( + DefaultDisplay(FileGroupsDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_groups_display_many_verbose() { + let files = [ + vec![partitioned_file("foo"), partitioned_file("bar")], + vec![partitioned_file("baz")], + vec![], + ]; + + let expected = "{3 groups: [[foo, bar], [baz], []]}"; + assert_eq!( + VerboseDisplay(FileGroupsDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_groups_display_too_many_default() { + let files = [ + vec![partitioned_file("foo"), partitioned_file("bar")], + vec![partitioned_file("baz")], + vec![partitioned_file("qux")], + vec![partitioned_file("quux")], + vec![partitioned_file("quuux")], + vec![partitioned_file("quuuux")], + vec![], + ]; + + let expected = "{7 groups: [[foo, bar], [baz], [qux], [quux], [quuux], ...]}"; + assert_eq!( + DefaultDisplay(FileGroupsDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_groups_display_too_many_verbose() { + let files = [ + vec![partitioned_file("foo"), partitioned_file("bar")], + vec![partitioned_file("baz")], + vec![partitioned_file("qux")], + vec![partitioned_file("quux")], + vec![partitioned_file("quuux")], + vec![partitioned_file("quuuux")], + vec![], + ]; + + let expected = + "{7 groups: [[foo, bar], [baz], [qux], [quux], [quuux], [quuuux], []]}"; + assert_eq!( + VerboseDisplay(FileGroupsDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_group_display_many_default() { + let files = vec![partitioned_file("foo"), partitioned_file("bar")]; + + let expected = "[foo, bar]"; + assert_eq!( + DefaultDisplay(FileGroupDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_group_display_too_many_default() { + let files = vec![ + partitioned_file("foo"), + partitioned_file("bar"), + partitioned_file("baz"), + 
partitioned_file("qux"), + partitioned_file("quux"), + partitioned_file("quuux"), + ]; + + let expected = "[foo, bar, baz, qux, quux, ...]"; + assert_eq!( + DefaultDisplay(FileGroupDisplay(&files)).to_string(), + expected + ); + } + + #[test] + fn file_group_display_too_many_verbose() { + let files = vec![ + partitioned_file("foo"), + partitioned_file("bar"), + partitioned_file("baz"), + partitioned_file("qux"), + partitioned_file("quux"), + partitioned_file("quuux"), + ]; + + let expected = "[foo, bar, baz, qux, quux, quuux]"; + assert_eq!( + VerboseDisplay(FileGroupDisplay(&files)).to_string(), + expected + ); + } + + /// create a PartitionedFile for testing + fn partitioned_file(path: &str) -> PartitionedFile { + let object_meta = ObjectMeta { + location: Path::parse(path).unwrap(), + last_modified: Utc::now(), + size: 42, + e_tag: None, + version: None, + }; + + PartitionedFile { + object_meta, + partition_values: vec![], + range: None, + statistics: None, + extensions: None, + metadata_size_hint: None, + } + } +} diff --git a/datafusion/core/src/datasource/data_source.rs b/datafusion/datasource/src/file.rs similarity index 95% rename from datafusion/core/src/datasource/data_source.rs rename to datafusion/datasource/src/file.rs index 2db79c5c839d..8d8cbbc67b9a 100644 --- a/datafusion/core/src/datasource/data_source.rs +++ b/datafusion/datasource/src/file.rs @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. -//! DataSource and FileSource trait implementations +//! Common behaviors that every file format needs to implement use std::any::Any; use std::fmt; use std::fmt::Formatter; use std::sync::Arc; -use crate::datasource::physical_plan::{FileOpener, FileScanConfig}; - +use crate::file_groups::FileGroupPartitioner; +use crate::file_scan_config::FileScanConfig; +use crate::file_stream::FileOpener; use arrow::datatypes::SchemaRef; use datafusion_common::Statistics; -use datafusion_datasource::file_groups::FileGroupPartitioner; use datafusion_physical_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index bfddbc3a1fc4..affea6e34afd 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -15,7 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::{borrow::Cow, collections::HashMap, marker::PhantomData, sync::Arc}; +//! [`FileScanConfig`] to configure scanning of possibly partitioned +//! file sources. 
+
+use std::{
+    any::Any, borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter,
+    fmt::Result as FmtResult, marker::PhantomData, sync::Arc,
+};
 
 use arrow::{
     array::{
@@ -23,11 +29,618 @@ use arrow::{
         RecordBatchOptions,
     },
     buffer::Buffer,
-    datatypes::{ArrowNativeType, DataType, SchemaRef, UInt16Type},
+    datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type},
+};
+use datafusion_common::{
+    exec_err, stats::Precision, ColumnStatistics, Constraints, Result, Statistics,
 };
-use datafusion_common::{exec_err, Result};
 use datafusion_common::{DataFusionError, ScalarValue};
-use log::warn;
+use datafusion_execution::{
+    object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext,
+};
+use datafusion_physical_expr::{
+    expressions::Column, EquivalenceProperties, LexOrdering, Partitioning,
+    PhysicalSortExpr,
+};
+use datafusion_physical_plan::{
+    display::{display_orderings, ProjectSchemaDisplay},
+    metrics::ExecutionPlanMetricsSet,
+    projection::{all_alias_free_columns, new_projections_for_columns, ProjectionExec},
+    DisplayAs, DisplayFormatType, ExecutionPlan,
+};
+use log::{debug, warn};
+
+use crate::{
+    display::FileGroupsDisplay,
+    file::FileSource,
+    file_compression_type::FileCompressionType,
+    file_stream::FileStream,
+    source::{DataSource, DataSourceExec},
+    statistics::MinMaxStatistics,
+    PartitionedFile,
+};
+
+/// The base configurations for a [`DataSourceExec`], the physical plan for
+/// any given file format.
+///
+/// Use [`Self::build`] to create a [`DataSourceExec`] from a [`FileScanConfig`].
+///
+/// # Example
+/// ```
+/// # use std::any::Any;
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{Field, Fields, DataType, Schema, SchemaRef};
+/// # use object_store::ObjectStore;
+/// # use datafusion_common::Statistics;
+/// # use datafusion_datasource::file::FileSource;
+/// # use datafusion_datasource::PartitionedFile;
+/// # use datafusion_datasource::file_scan_config::FileScanConfig;
+/// # use datafusion_datasource::file_stream::FileOpener;
+/// # use datafusion_execution::object_store::ObjectStoreUrl;
+/// # use datafusion_physical_plan::ExecutionPlan;
+/// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+/// # let file_schema = Arc::new(Schema::new(vec![
+/// #   Field::new("c1", DataType::Int32, false),
+/// #   Field::new("c2", DataType::Int32, false),
+/// #   Field::new("c3", DataType::Int32, false),
+/// #   Field::new("c4", DataType::Int32, false),
+/// # ]));
+/// # // Note: create a mock ParquetSource, as ParquetSource is not in the datasource crate
+/// # struct ParquetSource {};
+/// # impl FileSource for ParquetSource {
+/// #   fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Arc<dyn FileOpener> { unimplemented!() }
+/// #   fn as_any(&self) -> &dyn Any { self }
+/// #   fn with_batch_size(&self, _: usize) -> Arc<dyn FileSource> { unimplemented!() }
+/// #   fn with_schema(&self, _: SchemaRef) -> Arc<dyn FileSource> { unimplemented!() }
+/// #   fn with_projection(&self, _: &FileScanConfig) -> Arc<dyn FileSource> { unimplemented!() }
+/// #   fn with_statistics(&self, _: Statistics) -> Arc<dyn FileSource> { Arc::new(Self::new()) }
+/// #   fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() }
+/// #   fn statistics(&self) -> datafusion_common::Result<Statistics> { unimplemented!() }
+/// #   fn file_type(&self) -> &str { "parquet" }
+/// # }
+/// # impl ParquetSource {
+/// #   fn new() -> Self { Self{} }
+/// # }
+/// // create FileScan config for reading parquet files from file://
+/// let object_store_url = ObjectStoreUrl::local_filesystem();
+/// let file_source = Arc::new(ParquetSource::new());
+/// let config = FileScanConfig::new(object_store_url, file_schema, file_source)
+///   .with_limit(Some(1000)) // read only the first 1000 records
+///   .with_projection(Some(vec![2, 3])) // project columns 2 and 3
+///   // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
+///   .with_file(PartitionedFile::new("file1.parquet", 1234))
+///   // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
+///   // in a single row group
+///   .with_file_group(vec![
+///     PartitionedFile::new("file2.parquet", 56),
+///     PartitionedFile::new("file3.parquet", 78),
+///   ]);
+/// // create an execution plan from the config
+/// let plan: Arc<dyn ExecutionPlan> = config.build();
+/// ```
+#[derive(Clone)]
+pub struct FileScanConfig {
+    /// Object store URL, used to get an [`ObjectStore`] instance from
+    /// [`RuntimeEnv::object_store`]
+    ///
+    /// This `ObjectStoreUrl` should be the prefix of the absolute url for files
+    /// as `file://` or `s3://my_bucket`. It should not include the path to the
+    /// file itself. The relevant URL prefix must be registered via
+    /// [`RuntimeEnv::register_object_store`]
+    ///
+    /// [`ObjectStore`]: object_store::ObjectStore
+    /// [`RuntimeEnv::register_object_store`]: datafusion_execution::runtime_env::RuntimeEnv::register_object_store
+    /// [`RuntimeEnv::object_store`]: datafusion_execution::runtime_env::RuntimeEnv::object_store
+    pub object_store_url: ObjectStoreUrl,
+    /// Schema before `projection` is applied. It contains all the columns that may
+    /// appear in the files. It does not include table partition columns
+    /// that may be added.
+    pub file_schema: SchemaRef,
+    /// List of files to be processed, grouped into partitions
+    ///
+    /// Each file must have a schema of `file_schema` or a subset. If
+    /// a particular file has a subset, the missing columns are
+    /// padded with NULLs.
+    ///
+    /// DataFusion may attempt to read each partition of files
+    /// concurrently, however files *within* a partition will be read
+    /// sequentially, one after the next.
+    pub file_groups: Vec<Vec<PartitionedFile>>,
+    /// Table constraints
+    pub constraints: Constraints,
+    /// Estimated overall statistics of the files, taking `filters` into account.
+    /// Defaults to [`Statistics::new_unknown`].
+    pub statistics: Statistics,
+    /// Columns on which to project the data. Indexes that are higher than the
+    /// number of columns of `file_schema` refer to `table_partition_cols`.
+    pub projection: Option<Vec<usize>>,
+    /// The maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
+    pub limit: Option<usize>,
+    /// The partitioning columns
+    pub table_partition_cols: Vec<Field>,
+    /// All equivalent lexicographical orderings that describe the schema.
+    pub output_ordering: Vec<LexOrdering>,
+    /// File compression type
+    pub file_compression_type: FileCompressionType,
+    /// Are new lines in values supported for CSVOptions
+    pub new_lines_in_values: bool,
+    /// File source such as `ParquetSource`, `CsvSource`, `JsonSource`, etc.
+    pub source: Arc<dyn FileSource>,
+}
+
+impl DataSource for FileScanConfig {
+    fn open(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let object_store = context.runtime_env().object_store(&self.object_store_url)?;
+
+        let source = self
+            .source
+            .with_batch_size(context.session_config().batch_size())
+            .with_schema(Arc::clone(&self.file_schema))
+            .with_projection(self);
+
+        let opener = source.create_file_opener(object_store, self, partition);
+
+        let stream = FileStream::new(self, partition, opener, source.metrics())?;
+        Ok(Box::pin(stream))
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult {
+        let (schema, _, _, orderings) = self.project();
+
+        write!(f, "file_groups=")?;
+        FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?;
+
+        if !schema.fields().is_empty() {
+            write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?;
+        }
+
+        if let Some(limit) = self.limit {
+            write!(f, ", limit={limit}")?;
+        }
+
+        display_orderings(f, &orderings)?;
+
+        if !self.constraints.is_empty() {
+            write!(f, ", {}", self.constraints)?;
+        }
+
+        self.fmt_file_source(t, f)
+    }
+
+    /// If supported by the underlying [`FileSource`], redistribute files across partitions according to their size.
+    fn repartitioned(
+        &self,
+        target_partitions: usize,
+        repartition_file_min_size: usize,
+        output_ordering: Option<LexOrdering>,
+    ) -> Result<Option<Arc<dyn DataSource>>> {
+        let source = self.source.repartitioned(
+            target_partitions,
+            repartition_file_min_size,
+            output_ordering,
+            self,
+        )?;
+
+        Ok(source.map(|s| Arc::new(s) as _))
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        Partitioning::UnknownPartitioning(self.file_groups.len())
+    }
+
+    fn eq_properties(&self) -> EquivalenceProperties {
+        let (schema, constraints, _, orderings) = self.project();
+        EquivalenceProperties::new_with_orderings(schema, orderings.as_slice())
+            .with_constraints(constraints)
+    }
+
+    fn statistics(&self) -> Result<Statistics> {
+        self.source.statistics()
+    }
+
+    fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
+        let source = self.clone();
+        Some(Arc::new(source.with_limit(limit)))
+    }
+
+    fn fetch(&self) -> Option<usize> {
+        self.limit
+    }
+
+    fn metrics(&self) -> ExecutionPlanMetricsSet {
+        self.source.metrics().clone()
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        // If there is any non-column or alias-carrier expression, Projection should not be removed.
+        // This process can be moved into CsvExec, but it would be an overlap of their responsibility.
+        Ok(all_alias_free_columns(projection.expr()).then(|| {
+            let file_scan = self.clone();
+            let source = Arc::clone(&file_scan.source);
+            let new_projections = new_projections_for_columns(
+                projection,
+                &file_scan
+                    .projection
+                    .clone()
+                    .unwrap_or((0..self.file_schema.fields().len()).collect()),
+            );
+            file_scan
+                // Assign projected statistics to source
+                .with_projection(Some(new_projections))
+                .with_source(source)
+                .build() as _
+        }))
+    }
+}
+
+impl FileScanConfig {
+    /// Create a new [`FileScanConfig`] with default settings for scanning files.
+    ///
+    /// See example on [`FileScanConfig`]
+    ///
+    /// No file groups are added by default. See [`Self::with_file`], [`Self::with_file_group`] and
+    /// [`Self::with_file_groups`].
+ /// + /// # Parameters: + /// * `object_store_url`: See [`Self::object_store_url`] + /// * `file_schema`: See [`Self::file_schema`] + pub fn new( + object_store_url: ObjectStoreUrl, + file_schema: SchemaRef, + file_source: Arc, + ) -> Self { + let statistics = Statistics::new_unknown(&file_schema); + + let mut config = Self { + object_store_url, + file_schema, + file_groups: vec![], + constraints: Constraints::empty(), + statistics, + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![], + file_compression_type: FileCompressionType::UNCOMPRESSED, + new_lines_in_values: false, + source: Arc::clone(&file_source), + }; + + config = config.with_source(Arc::clone(&file_source)); + config + } + + /// Set the file source + pub fn with_source(mut self, source: Arc) -> Self { + let ( + _projected_schema, + _constraints, + projected_statistics, + _projected_output_ordering, + ) = self.project(); + self.source = source.with_statistics(projected_statistics); + self + } + + /// Set the table constraints of the files + pub fn with_constraints(mut self, constraints: Constraints) -> Self { + self.constraints = constraints; + self + } + + /// Set the statistics of the files + pub fn with_statistics(mut self, statistics: Statistics) -> Self { + self.statistics = statistics; + self + } + + /// Set the projection of the files + pub fn with_projection(mut self, projection: Option>) -> Self { + self.projection = projection; + self + } + + /// Set the limit of the files + pub fn with_limit(mut self, limit: Option) -> Self { + self.limit = limit; + self + } + + /// Add a file as a single group + /// + /// See [Self::file_groups] for more information. + pub fn with_file(self, file: PartitionedFile) -> Self { + self.with_file_group(vec![file]) + } + + /// Add the file groups + /// + /// See [Self::file_groups] for more information. + pub fn with_file_groups( + mut self, + mut file_groups: Vec>, + ) -> Self { + self.file_groups.append(&mut file_groups); + self + } + + /// Add a new file group + /// + /// See [Self::file_groups] for more information + pub fn with_file_group(mut self, file_group: Vec) -> Self { + self.file_groups.push(file_group); + self + } + + /// Set the partitioning columns of the files + pub fn with_table_partition_cols(mut self, table_partition_cols: Vec) -> Self { + self.table_partition_cols = table_partition_cols; + self + } + + /// Set the output ordering of the files + pub fn with_output_ordering(mut self, output_ordering: Vec) -> Self { + self.output_ordering = output_ordering; + self + } + + /// Set the file compression type + pub fn with_file_compression_type( + mut self, + file_compression_type: FileCompressionType, + ) -> Self { + self.file_compression_type = file_compression_type; + self + } + + /// Set the new_lines_in_values property + pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self { + self.new_lines_in_values = new_lines_in_values; + self + } + + /// Specifies whether newlines in (quoted) values are supported. + /// + /// Parsing newlines in quoted values may be affected by execution behaviour such as + /// parallel file scanning. Setting this to `true` ensures that newlines in values are + /// parsed successfully, which may reduce performance. + /// + /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting. 
+ pub fn newlines_in_values(&self) -> bool { + self.new_lines_in_values + } + + /// Project the schema, constraints, and the statistics on the given column indices + pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec) { + if self.projection.is_none() && self.table_partition_cols.is_empty() { + return ( + Arc::clone(&self.file_schema), + self.constraints.clone(), + self.statistics.clone(), + self.output_ordering.clone(), + ); + } + + let proj_indices = if let Some(proj) = &self.projection { + proj + } else { + let len = self.file_schema.fields().len() + self.table_partition_cols.len(); + &(0..len).collect::>() + }; + + let mut table_fields = vec![]; + let mut table_cols_stats = vec![]; + for idx in proj_indices { + if *idx < self.file_schema.fields().len() { + let field = self.file_schema.field(*idx); + table_fields.push(field.clone()); + table_cols_stats.push(self.statistics.column_statistics[*idx].clone()) + } else { + let partition_idx = idx - self.file_schema.fields().len(); + table_fields.push(self.table_partition_cols[partition_idx].to_owned()); + // TODO provide accurate stat for partition column (#1186) + table_cols_stats.push(ColumnStatistics::new_unknown()) + } + } + + let table_stats = Statistics { + num_rows: self.statistics.num_rows, + // TODO correct byte size? + total_byte_size: Precision::Absent, + column_statistics: table_cols_stats, + }; + + let projected_schema = Arc::new(Schema::new_with_metadata( + table_fields, + self.file_schema.metadata().clone(), + )); + + let projected_constraints = self + .constraints + .project(proj_indices) + .unwrap_or_else(Constraints::empty); + + let projected_output_ordering = + get_projected_output_ordering(self, &projected_schema); + + ( + projected_schema, + projected_constraints, + table_stats, + projected_output_ordering, + ) + } + + #[cfg_attr(not(feature = "avro"), allow(unused))] // Only used by avro + pub fn projected_file_column_names(&self) -> Option> { + self.projection.as_ref().map(|p| { + p.iter() + .filter(|col_idx| **col_idx < self.file_schema.fields().len()) + .map(|col_idx| self.file_schema.field(*col_idx).name()) + .cloned() + .collect() + }) + } + + /// Projects only file schema, ignoring partition columns + pub fn projected_file_schema(&self) -> SchemaRef { + let fields = self.file_column_projection_indices().map(|indices| { + indices + .iter() + .map(|col_idx| self.file_schema.field(*col_idx)) + .cloned() + .collect::>() + }); + + fields.map_or_else( + || Arc::clone(&self.file_schema), + |f| { + Arc::new(Schema::new_with_metadata( + f, + self.file_schema.metadata.clone(), + )) + }, + ) + } + + pub fn file_column_projection_indices(&self) -> Option> { + self.projection.as_ref().map(|p| { + p.iter() + .filter(|col_idx| **col_idx < self.file_schema.fields().len()) + .copied() + .collect() + }) + } + + /// Attempts to do a bin-packing on files into file groups, such that any two files + /// in a file group are ordered and non-overlapping with respect to their statistics. + /// It will produce the smallest number of file groups possible. + pub fn split_groups_by_statistics( + table_schema: &SchemaRef, + file_groups: &[Vec], + sort_order: &LexOrdering, + ) -> Result>> { + let flattened_files = file_groups.iter().flatten().collect::>(); + // First Fit: + // * Choose the first file group that a file can be placed into. + // * If it fits into no existing file groups, create a new one. 
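+ //
+ // Illustrative example (file names and [min, max] ranges are made up):
+ //   A: [0, 10], B: [5, 15], C: [11, 20]
+ // Visiting files in order of min value (A, B, C): A opens group 1; B
+ // overlaps A (5 <= 10), so it opens group 2; C starts after A ends
+ // (11 > 10), so it joins group 1. Result: groups {A, C} and {B}.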
+ // + // By sorting files by min values and then applying first-fit bin packing, + // we can produce the smallest number of file groups such that + // files within a group are in order and non-overlapping. + // + // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 + // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html + + if flattened_files.is_empty() { + return Ok(vec![]); + } + + let statistics = MinMaxStatistics::new_from_files( + sort_order, + table_schema, + None, + flattened_files.iter().copied(), + ) + .map_err(|e| { + e.context("construct min/max statistics for split_groups_by_statistics") + })?; + + let indices_sorted_by_min = statistics.min_values_sorted(); + let mut file_groups_indices: Vec> = vec![]; + + for (idx, min) in indices_sorted_by_min { + let file_group_to_insert = file_groups_indices.iter_mut().find(|group| { + // If our file is non-overlapping and comes _after_ the last file, + // it fits in this file group. + min > statistics.max( + *group + .last() + .expect("groups should be nonempty at construction"), + ) + }); + match file_group_to_insert { + Some(group) => group.push(idx), + None => file_groups_indices.push(vec![idx]), + } + } + + // Assemble indices back into groups of PartitionedFiles + Ok(file_groups_indices + .into_iter() + .map(|file_group_indices| { + file_group_indices + .into_iter() + .map(|idx| flattened_files[idx].clone()) + .collect() + }) + .collect()) + } + + // TODO: This function should be moved into DataSourceExec once FileScanConfig moved out of datafusion/core + /// Returns a new [`DataSourceExec`] to scan the files specified by this config + pub fn build(self) -> Arc { + Arc::new(DataSourceExec::new(Arc::new(self))) + } + + /// Write the data_type based on file_source + fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { + write!(f, ", file_type={}", self.source.file_type())?; + self.source.fmt_extra(t, f) + } + + /// Returns the file_source + pub fn file_source(&self) -> &Arc { + &self.source + } +} + +impl Debug for FileScanConfig { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "object_store_url={:?}, ", self.object_store_url)?; + + write!(f, "statistics={:?}, ", self.statistics)?; + + DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f) + } +} + +impl DisplayAs for FileScanConfig { + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { + let (schema, _, _, orderings) = self.project(); + + write!(f, "file_groups=")?; + FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?; + + if !schema.fields().is_empty() { + write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; + } + + if let Some(limit) = self.limit { + write!(f, ", limit={limit}")?; + } + + display_orderings(f, &orderings)?; + + if !self.constraints.is_empty() { + write!(f, ", {}", self.constraints)?; + } + + Ok(()) + } +} /// A helper that projects partition columns into the file record batches. /// @@ -276,3 +889,829 @@ fn create_output_array( val.to_array_of_size(len) } + +/// The various listing tables does not attempt to read all files +/// concurrently, instead they will read files in sequence within a +/// partition. This is an important property as it allows plans to +/// run against 1000s of files and not try to open them all +/// concurrently. 
+/// +/// However, it means if we assign more than one file to a partition +/// the output sort order will not be preserved as illustrated in the +/// following diagrams: +/// +/// When only 1 file is assigned to each partition, each partition is +/// correctly sorted on `(A, B, C)` +/// +/// ```text +///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ +///┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ │ │ 3.parquet │ │ │ 4.parquet │ │ +///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ +/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ +///┃ │ │ ┃ +/// │ │ │ │ │ │ +///┃ │ │ ┃ +/// │ │ │ │ │ │ +///┃ │ │ ┃ +/// │ │ │ │ │ │ +///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// DataFusion DataFusion DataFusion DataFusion +///┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// +/// DataSourceExec +///``` +/// +/// However, when more than 1 file is assigned to each partition, each +/// partition is NOT correctly sorted on `(A, B, C)`. Once the second +/// file is scanned, the same values for A, B and C can be repeated in +/// the same sorted stream +/// +///```text +///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +///┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ +///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +///┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ +///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +///┃ │ +/// │ │ │ ┃ +///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +/// DataFusion DataFusion ┃ +///┃ Partition 1 Partition 2 +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ +/// +/// DataSourceExec +///``` +fn get_projected_output_ordering( + base_config: &FileScanConfig, + projected_schema: &SchemaRef, +) -> Vec { + let mut all_orderings = vec![]; + for output_ordering in &base_config.output_ordering { + let mut new_ordering = LexOrdering::default(); + for PhysicalSortExpr { expr, options } in output_ordering.iter() { + if let Some(col) = expr.as_any().downcast_ref::() { + let name = col.name(); + if let Some((idx, _)) = projected_schema.column_with_name(name) { + // Compute the new sort expression (with correct index) after projection: + new_ordering.push(PhysicalSortExpr { + expr: Arc::new(Column::new(name, idx)), + options: *options, + }); + continue; + } + } + // Cannot find expression in the projected_schema, stop iterating + // since rest of the orderings are violated + break; + } + + // do not push empty entries + // otherwise we may have `Some(vec![])` at the output ordering. 
+ if new_ordering.is_empty() { + continue; + } + + // Check if any file groups are not sorted + if base_config.file_groups.iter().any(|group| { + if group.len() <= 1 { + // File groups with <= 1 files are always sorted + return false; + } + + let statistics = match MinMaxStatistics::new_from_files( + &new_ordering, + projected_schema, + base_config.projection.as_deref(), + group, + ) { + Ok(statistics) => statistics, + Err(e) => { + log::trace!("Error fetching statistics for file group: {e}"); + // we can't prove that it's ordered, so we have to reject it + return true; + } + }; + + !statistics.is_sorted() + }) { + debug!( + "Skipping specified output ordering {:?}. \ + Some file groups couldn't be determined to be sorted: {:?}", + base_config.output_ordering[0], base_config.file_groups + ); + continue; + } + + all_orderings.push(new_ordering); + } + all_orderings +} + +/// Convert type to a type suitable for use as a `ListingTable` +/// partition column. Returns `Dictionary(UInt16, val_type)`, which is +/// a reasonable trade off between a reasonable number of partition +/// values and space efficiency. +/// +/// This use this to specify types for partition columns. However +/// you MAY also choose not to dictionary-encode the data or to use a +/// different dictionary type. +/// +/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same say. +pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType { + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type)) +} + +/// Convert a [`ScalarValue`] of partition columns to a type, as +/// described in the documentation of [`wrap_partition_type_in_dict`], +/// which can wrap the types. +pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue { + ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val)) +} + +#[cfg(test)] +mod tests { + use crate::{test_util::MockSource, tests::aggr_test_schema}; + + use super::*; + use arrow::{ + array::{Int32Array, RecordBatch}, + compute::SortOptions, + }; + + use datafusion_common::{assert_batches_eq, DFSchema}; + use datafusion_expr::{execution_props::ExecutionProps, SortExpr}; + use datafusion_physical_expr::create_physical_expr; + use std::collections::HashMap; + + fn create_physical_sort_expr( + e: &SortExpr, + input_dfschema: &DFSchema, + execution_props: &ExecutionProps, + ) -> Result { + let SortExpr { + expr, + asc, + nulls_first, + } = e; + Ok(PhysicalSortExpr { + expr: create_physical_expr(expr, input_dfschema, execution_props)?, + options: SortOptions { + descending: !asc, + nulls_first: *nulls_first, + }, + }) + } + + /// Returns the column names on the schema + pub fn columns(schema: &Schema) -> Vec { + schema.fields().iter().map(|f| f.name().clone()).collect() + } + + #[test] + fn physical_plan_config_no_projection() { + let file_schema = aggr_test_schema(); + let conf = config_for_projection( + Arc::clone(&file_schema), + None, + Statistics::new_unknown(&file_schema), + to_partition_cols(vec![( + "date".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + )]), + ); + + let (proj_schema, _, proj_statistics, _) = conf.project(); + assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); + assert_eq!( + proj_schema.field(file_schema.fields().len()).name(), + "date", + "partition columns are the last columns" + ); + assert_eq!( + proj_statistics.column_statistics.len(), + file_schema.fields().len() + 1 + ); + // TODO implement tests for partition column statistics once implemented + + let col_names = 
conf.projected_file_column_names(); + assert_eq!(col_names, None); + + let col_indices = conf.file_column_projection_indices(); + assert_eq!(col_indices, None); + } + + #[test] + fn physical_plan_config_no_projection_tab_cols_as_field() { + let file_schema = aggr_test_schema(); + + // make a table_partition_col as a field + let table_partition_col = + Field::new("date", wrap_partition_type_in_dict(DataType::Utf8), true) + .with_metadata(HashMap::from_iter(vec![( + "key_whatever".to_owned(), + "value_whatever".to_owned(), + )])); + + let conf = config_for_projection( + Arc::clone(&file_schema), + None, + Statistics::new_unknown(&file_schema), + vec![table_partition_col.clone()], + ); + + // verify the proj_schema includes the last column and exactly the same the field it is defined + let (proj_schema, _, _, _) = conf.project(); + assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); + assert_eq!( + *proj_schema.field(file_schema.fields().len()), + table_partition_col, + "partition columns are the last columns and ust have all values defined in created field" + ); + } + + #[test] + fn physical_plan_config_with_projection() { + let file_schema = aggr_test_schema(); + let conf = config_for_projection( + Arc::clone(&file_schema), + Some(vec![file_schema.fields().len(), 0]), + Statistics { + num_rows: Precision::Inexact(10), + // assign the column index to distinct_count to help assert + // the source statistic after the projection + column_statistics: (0..file_schema.fields().len()) + .map(|i| ColumnStatistics { + distinct_count: Precision::Inexact(i), + ..Default::default() + }) + .collect(), + total_byte_size: Precision::Absent, + }, + to_partition_cols(vec![( + "date".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + )]), + ); + + let (proj_schema, _, proj_statistics, _) = conf.project(); + assert_eq!( + columns(&proj_schema), + vec!["date".to_owned(), "c1".to_owned()] + ); + let proj_stat_cols = proj_statistics.column_statistics; + assert_eq!(proj_stat_cols.len(), 2); + // TODO implement tests for proj_stat_cols[0] once partition column + // statistics are implemented + assert_eq!(proj_stat_cols[1].distinct_count, Precision::Inexact(0)); + + let col_names = conf.projected_file_column_names(); + assert_eq!(col_names, Some(vec!["c1".to_owned()])); + + let col_indices = conf.file_column_projection_indices(); + assert_eq!(col_indices, Some(vec![0])); + } + + #[test] + fn partition_column_projector() { + let file_batch = build_table_i32( + ("a", &vec![0, 1, 2]), + ("b", &vec![-2, -1, 0]), + ("c", &vec![10, 11, 12]), + ); + let partition_cols = vec![ + ( + "year".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ( + "month".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ( + "day".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ]; + // create a projected schema + let conf = config_for_projection( + file_batch.schema(), + // keep all cols from file and 2 from partitioning + Some(vec![ + 0, + 1, + 2, + file_batch.schema().fields().len(), + file_batch.schema().fields().len() + 2, + ]), + Statistics::new_unknown(&file_batch.schema()), + to_partition_cols(partition_cols.clone()), + ); + let (proj_schema, ..) 
= conf.project(); + // created a projector for that projected schema + let mut proj = PartitionColumnProjector::new( + proj_schema, + &partition_cols + .iter() + .map(|x| x.0.clone()) + .collect::>(), + ); + + // project first batch + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + wrap_partition_value_in_dict(ScalarValue::from("2021")), + wrap_partition_value_in_dict(ScalarValue::from("10")), + wrap_partition_value_in_dict(ScalarValue::from("26")), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = [ + "+---+----+----+------+-----+", + "| a | b | c | year | day |", + "+---+----+----+------+-----+", + "| 0 | -2 | 10 | 2021 | 26 |", + "| 1 | -1 | 11 | 2021 | 26 |", + "| 2 | 0 | 12 | 2021 | 26 |", + "+---+----+----+------+-----+", + ]; + assert_batches_eq!(expected, &[projected_batch]); + + // project another batch that is larger than the previous one + let file_batch = build_table_i32( + ("a", &vec![5, 6, 7, 8, 9]), + ("b", &vec![-10, -9, -8, -7, -6]), + ("c", &vec![12, 13, 14, 15, 16]), + ); + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + wrap_partition_value_in_dict(ScalarValue::from("2021")), + wrap_partition_value_in_dict(ScalarValue::from("10")), + wrap_partition_value_in_dict(ScalarValue::from("27")), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = [ + "+---+-----+----+------+-----+", + "| a | b | c | year | day |", + "+---+-----+----+------+-----+", + "| 5 | -10 | 12 | 2021 | 27 |", + "| 6 | -9 | 13 | 2021 | 27 |", + "| 7 | -8 | 14 | 2021 | 27 |", + "| 8 | -7 | 15 | 2021 | 27 |", + "| 9 | -6 | 16 | 2021 | 27 |", + "+---+-----+----+------+-----+", + ]; + assert_batches_eq!(expected, &[projected_batch]); + + // project another batch that is smaller than the previous one + let file_batch = build_table_i32( + ("a", &vec![0, 1, 3]), + ("b", &vec![2, 3, 4]), + ("c", &vec![4, 5, 6]), + ); + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + wrap_partition_value_in_dict(ScalarValue::from("2021")), + wrap_partition_value_in_dict(ScalarValue::from("10")), + wrap_partition_value_in_dict(ScalarValue::from("28")), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = [ + "+---+---+---+------+-----+", + "| a | b | c | year | day |", + "+---+---+---+------+-----+", + "| 0 | 2 | 4 | 2021 | 28 |", + "| 1 | 3 | 5 | 2021 | 28 |", + "| 3 | 4 | 6 | 2021 | 28 |", + "+---+---+---+------+-----+", + ]; + assert_batches_eq!(expected, &[projected_batch]); + + // forgot to dictionary-wrap the scalar value + let file_batch = build_table_i32( + ("a", &vec![0, 1, 2]), + ("b", &vec![-2, -1, 0]), + ("c", &vec![10, 11, 12]), + ); + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + ScalarValue::from("2021"), + ScalarValue::from("10"), + ScalarValue::from("26"), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = [ + "+---+----+----+------+-----+", + "| a | b | c | year | day |", + "+---+----+----+------+-----+", + "| 0 | -2 | 10 | 2021 | 26 |", + "| 1 | -1 | 11 | 2021 | 26 |", + "| 2 | 0 | 12 | 2021 | 26 |", + "+---+----+----+------+-----+", + ]; + assert_batches_eq!(expected, 
&[projected_batch]); + } + + #[test] + fn test_projected_file_schema_with_partition_col() { + let schema = aggr_test_schema(); + let partition_cols = vec![ + ( + "part1".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ( + "part2".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ]; + + // Projected file schema for config with projection including partition column + let projection = config_for_projection( + schema.clone(), + Some(vec![0, 3, 5, schema.fields().len()]), + Statistics::new_unknown(&schema), + to_partition_cols(partition_cols), + ) + .projected_file_schema(); + + // Assert partition column filtered out in projected file schema + let expected_columns = vec!["c1", "c4", "c6"]; + let actual_columns = projection + .fields() + .iter() + .map(|f| f.name().clone()) + .collect::>(); + assert_eq!(expected_columns, actual_columns); + } + + #[test] + fn test_projected_file_schema_without_projection() { + let schema = aggr_test_schema(); + let partition_cols = vec![ + ( + "part1".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ( + "part2".to_owned(), + wrap_partition_type_in_dict(DataType::Utf8), + ), + ]; + + // Projected file schema for config without projection + let projection = config_for_projection( + schema.clone(), + None, + Statistics::new_unknown(&schema), + to_partition_cols(partition_cols), + ) + .projected_file_schema(); + + // Assert projected file schema is equal to file schema + assert_eq!(projection.fields(), schema.fields()); + } + + #[test] + fn test_split_groups_by_statistics() -> Result<()> { + use chrono::TimeZone; + use datafusion_common::DFSchema; + use datafusion_expr::execution_props::ExecutionProps; + use object_store::{path::Path, ObjectMeta}; + + struct File { + name: &'static str, + date: &'static str, + statistics: Vec>, + } + impl File { + fn new( + name: &'static str, + date: &'static str, + statistics: Vec>, + ) -> Self { + Self { + name, + date, + statistics, + } + } + } + + struct TestCase { + name: &'static str, + file_schema: Schema, + files: Vec, + sort: Vec, + expected_result: Result>, &'static str>, + } + + use datafusion_expr::col; + let cases = vec![ + TestCase { + name: "test sort", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), + }, + // same input but file '2' is in the middle + // test that we still order correctly + TestCase { + name: "test sort with files ordered differently", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), + }, + TestCase { + name: "reverse sort", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(false, 
true)], + expected_result: Ok(vec![vec!["1", "0"], vec!["2"]]), + }, + // reject nullable sort columns + TestCase { + name: "no nullable sort columns", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + true, // should fail because nullable + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column") + }, + TestCase { + name: "all three non-overlapping", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 0.99))]), + File::new("2", "2023-01-02", vec![Some((1.00, 1.49))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0", "1", "2"]]), + }, + TestCase { + name: "all three overlapping", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![Some((0.00, 0.49))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0"], vec!["1"], vec!["2"]]), + }, + TestCase { + name: "empty input", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![]), + }, + TestCase { + name: "one file missing statistics", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![None]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found"), + }, + ]; + + for case in cases { + let table_schema = Arc::new(Schema::new( + case.file_schema + .fields() + .clone() + .into_iter() + .cloned() + .chain(Some(Arc::new(Field::new( + "date".to_string(), + DataType::Utf8, + false, + )))) + .collect::>(), + )); + let sort_order = LexOrdering::from( + case.sort + .into_iter() + .map(|expr| { + create_physical_sort_expr( + &expr, + &DFSchema::try_from(table_schema.as_ref().clone())?, + &ExecutionProps::default(), + ) + }) + .collect::>>()?, + ); + + let partitioned_files = + case.files.into_iter().map(From::from).collect::>(); + let result = FileScanConfig::split_groups_by_statistics( + &table_schema, + &[partitioned_files.clone()], + &sort_order, + ); + let results_by_name = result + .as_ref() + .map(|file_groups| { + file_groups + .iter() + .map(|file_group| { + file_group + .iter() + .map(|file| { + partitioned_files + .iter() + .find_map(|f| { + if f.object_meta == file.object_meta { + Some( + f.object_meta + .location + .as_ref() + .rsplit('/') + .next() + .unwrap() + 
.trim_end_matches(".parquet"), + ) + } else { + None + } + }) + .unwrap() + }) + .collect::>() + }) + .collect::>() + }) + .map_err(|e| e.strip_backtrace().leak() as &'static str); + + assert_eq!(results_by_name, case.expected_result, "{}", case.name); + } + + return Ok(()); + + impl From for PartitionedFile { + fn from(file: File) -> Self { + PartitionedFile { + object_meta: ObjectMeta { + location: Path::from(format!( + "data/date={}/{}.parquet", + file.date, file.name + )), + last_modified: chrono::Utc.timestamp_nanos(0), + size: 0, + e_tag: None, + version: None, + }, + partition_values: vec![ScalarValue::from(file.date)], + range: None, + statistics: Some(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: file + .statistics + .into_iter() + .map(|stats| { + stats + .map(|(min, max)| ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from( + min, + )), + max_value: Precision::Exact(ScalarValue::from( + max, + )), + ..Default::default() + }) + .unwrap_or_default() + }) + .collect::>(), + }), + extensions: None, + metadata_size_hint: None, + } + } + } + } + + // sets default for configs that play no role in projections + fn config_for_projection( + file_schema: SchemaRef, + projection: Option>, + statistics: Statistics, + table_partition_cols: Vec, + ) -> FileScanConfig { + FileScanConfig::new( + ObjectStoreUrl::parse("test:///").unwrap(), + file_schema, + Arc::new(MockSource::default()), + ) + .with_projection(projection) + .with_statistics(statistics) + .with_table_partition_cols(table_partition_cols) + } + + /// Convert partition columns from Vec to Vec + fn to_partition_cols(table_partition_cols: Vec<(String, DataType)>) -> Vec { + table_partition_cols + .iter() + .map(|(name, dtype)| Field::new(name, dtype.clone(), false)) + .collect::>() + } + + /// returns record batch with 3 columns of i32 in memory + pub fn build_table_i32( + a: (&str, &Vec), + b: (&str, &Vec), + c: (&str, &Vec), + ) -> RecordBatch { + let schema = Schema::new(vec![ + Field::new(a.0, DataType::Int32, false), + Field::new(b.0, DataType::Int32, false), + Field::new(c.0, DataType::Int32, false), + ]); + + RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(a.1.clone())), + Arc::new(Int32Array::from(b.1.clone())), + Arc::new(Int32Array::from(c.1.clone())), + ], + ) + .unwrap() + } +} diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs index 570ca6678538..7d17d230fc01 100644 --- a/datafusion/datasource/src/file_stream.rs +++ b/datafusion/datasource/src/file_stream.rs @@ -21,10 +21,20 @@ //! Note: Most traits here need to be marked `Sync + Send` to be //! compliant with the `SendableRecordBatchStream` trait. 
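+//!
+//! [`FileStream`] scans the files of one partition sequentially, but starts
+//! opening the next file while the current one is still being decoded,
+//! overlapping IO with decode.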
+use std::collections::VecDeque; +use std::mem; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + use crate::file_meta::FileMeta; +use crate::file_scan_config::{FileScanConfig, PartitionColumnProjector}; +use crate::PartitionedFile; +use arrow::datatypes::SchemaRef; use datafusion_common::error::Result; +use datafusion_execution::RecordBatchStream; use datafusion_physical_plan::metrics::{ - Count, ExecutionPlanMetricsSet, MetricBuilder, Time, + BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, Time, }; use arrow::error::ArrowError; @@ -34,6 +44,303 @@ use datafusion_common::ScalarValue; use futures::future::BoxFuture; use futures::stream::BoxStream; +use futures::{ready, FutureExt as _, Stream, StreamExt as _}; + +/// A stream that iterates record batch by record batch, file over file. +pub struct FileStream { + /// An iterator over input files. + file_iter: VecDeque, + /// The stream schema (file schema including partition columns and after + /// projection). + projected_schema: SchemaRef, + /// The remaining number of records to parse, None if no limit + remain: Option, + /// A dynamic [`FileOpener`]. Calling `open()` returns a [`FileOpenFuture`], + /// which can be resolved to a stream of `RecordBatch`. + file_opener: Arc, + /// The partition column projector + pc_projector: PartitionColumnProjector, + /// The stream state + state: FileStreamState, + /// File stream specific metrics + file_stream_metrics: FileStreamMetrics, + /// runtime baseline metrics + baseline_metrics: BaselineMetrics, + /// Describes the behavior of the `FileStream` if file opening or scanning fails + on_error: OnError, +} + +impl FileStream { + /// Create a new `FileStream` using the give `FileOpener` to scan underlying files + pub fn new( + config: &FileScanConfig, + partition: usize, + file_opener: Arc, + metrics: &ExecutionPlanMetricsSet, + ) -> Result { + let (projected_schema, ..) = config.project(); + let pc_projector = PartitionColumnProjector::new( + Arc::clone(&projected_schema), + &config + .table_partition_cols + .iter() + .map(|x| x.name().clone()) + .collect::>(), + ); + + let files = config.file_groups[partition].clone(); + + Ok(Self { + file_iter: files.into(), + projected_schema, + remain: config.limit, + file_opener, + pc_projector, + state: FileStreamState::Idle, + file_stream_metrics: FileStreamMetrics::new(metrics, partition), + baseline_metrics: BaselineMetrics::new(metrics, partition), + on_error: OnError::Fail, + }) + } + + /// Specify the behavior when an error occurs opening or scanning a file + /// + /// If `OnError::Skip` the stream will skip files which encounter an error and continue + /// If `OnError:Fail` (default) the stream will fail and stop processing when an error occurs + pub fn with_on_error(mut self, on_error: OnError) -> Self { + self.on_error = on_error; + self + } + + /// Begin opening the next file in parallel while decoding the current file in FileStream. + /// + /// Since file opening is mostly IO (and may involve a + /// bunch of sequential IO), it can be parallelized with decoding. 
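+ ///
+ /// Returns `None` once every file queued for this partition has been
+ /// opened; otherwise returns the pending [`FileOpenFuture`] together with
+ /// the file's partition values.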
+ fn start_next_file(&mut self) -> Option)>> { + let part_file = self.file_iter.pop_front()?; + + let file_meta = FileMeta { + object_meta: part_file.object_meta, + range: part_file.range, + extensions: part_file.extensions, + metadata_size_hint: part_file.metadata_size_hint, + }; + + Some( + self.file_opener + .open(file_meta) + .map(|future| (future, part_file.partition_values)), + ) + } + + fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll>> { + loop { + match &mut self.state { + FileStreamState::Idle => { + self.file_stream_metrics.time_opening.start(); + + match self.start_next_file().transpose() { + Ok(Some((future, partition_values))) => { + self.state = FileStreamState::Open { + future, + partition_values, + } + } + Ok(None) => return Poll::Ready(None), + Err(e) => { + self.state = FileStreamState::Error; + return Poll::Ready(Some(Err(e))); + } + } + } + FileStreamState::Open { + future, + partition_values, + } => match ready!(future.poll_unpin(cx)) { + Ok(reader) => { + let partition_values = mem::take(partition_values); + + // include time needed to start opening in `start_next_file` + self.file_stream_metrics.time_opening.stop(); + let next = self.start_next_file().transpose(); + self.file_stream_metrics.time_scanning_until_data.start(); + self.file_stream_metrics.time_scanning_total.start(); + + match next { + Ok(Some((next_future, next_partition_values))) => { + self.state = FileStreamState::Scan { + partition_values, + reader, + next: Some(( + NextOpen::Pending(next_future), + next_partition_values, + )), + }; + } + Ok(None) => { + self.state = FileStreamState::Scan { + reader, + partition_values, + next: None, + }; + } + Err(e) => { + self.state = FileStreamState::Error; + return Poll::Ready(Some(Err(e))); + } + } + } + Err(e) => { + self.file_stream_metrics.file_open_errors.add(1); + match self.on_error { + OnError::Skip => { + self.file_stream_metrics.time_opening.stop(); + self.state = FileStreamState::Idle + } + OnError::Fail => { + self.state = FileStreamState::Error; + return Poll::Ready(Some(Err(e))); + } + } + } + }, + FileStreamState::Scan { + reader, + partition_values, + next, + } => { + // We need to poll the next `FileOpenFuture` here to drive it forward + if let Some((next_open_future, _)) = next { + if let NextOpen::Pending(f) = next_open_future { + if let Poll::Ready(reader) = f.as_mut().poll(cx) { + *next_open_future = NextOpen::Ready(reader); + } + } + } + match ready!(reader.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + self.file_stream_metrics.time_scanning_until_data.stop(); + self.file_stream_metrics.time_scanning_total.stop(); + let result = self + .pc_projector + .project(batch, partition_values) + .map_err(|e| ArrowError::ExternalError(e.into())) + .map(|batch| match &mut self.remain { + Some(remain) => { + if *remain > batch.num_rows() { + *remain -= batch.num_rows(); + batch + } else { + let batch = batch.slice(0, *remain); + self.state = FileStreamState::Limit; + *remain = 0; + batch + } + } + None => batch, + }); + + if result.is_err() { + // If the partition value projection fails, this is not governed by + // the `OnError` behavior + self.state = FileStreamState::Error + } + self.file_stream_metrics.time_scanning_total.start(); + return Poll::Ready(Some(result.map_err(Into::into))); + } + Some(Err(err)) => { + self.file_stream_metrics.file_scan_errors.add(1); + self.file_stream_metrics.time_scanning_until_data.stop(); + self.file_stream_metrics.time_scanning_total.stop(); + + match self.on_error { + // If `OnError::Skip` we skip the file 
as soon as we hit the first error + OnError::Skip => match mem::take(next) { + Some((future, partition_values)) => { + self.file_stream_metrics.time_opening.start(); + + match future { + NextOpen::Pending(future) => { + self.state = FileStreamState::Open { + future, + partition_values, + } + } + NextOpen::Ready(reader) => { + self.state = FileStreamState::Open { + future: Box::pin(std::future::ready( + reader, + )), + partition_values, + } + } + } + } + None => return Poll::Ready(None), + }, + OnError::Fail => { + self.state = FileStreamState::Error; + return Poll::Ready(Some(Err(err.into()))); + } + } + } + None => { + self.file_stream_metrics.time_scanning_until_data.stop(); + self.file_stream_metrics.time_scanning_total.stop(); + + match mem::take(next) { + Some((future, partition_values)) => { + self.file_stream_metrics.time_opening.start(); + + match future { + NextOpen::Pending(future) => { + self.state = FileStreamState::Open { + future, + partition_values, + } + } + NextOpen::Ready(reader) => { + self.state = FileStreamState::Open { + future: Box::pin(std::future::ready( + reader, + )), + partition_values, + } + } + } + } + None => return Poll::Ready(None), + } + } + } + } + FileStreamState::Error | FileStreamState::Limit => { + return Poll::Ready(None) + } + } + } + } +} + +impl Stream for FileStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + self.file_stream_metrics.time_processing.start(); + let result = self.poll_inner(cx); + self.file_stream_metrics.time_processing.stop(); + self.baseline_metrics.record_poll(result) + } +} + +impl RecordBatchStream for FileStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.projected_schema) + } +} /// A fallible future that resolves to a stream of [`RecordBatch`] pub type FileOpenFuture = @@ -212,3 +519,467 @@ impl FileStreamMetrics { } } } + +#[cfg(test)] +mod tests { + use crate::file_scan_config::FileScanConfig; + use crate::tests::make_partition; + use crate::PartitionedFile; + use arrow::error::ArrowError; + use datafusion_common::error::Result; + use datafusion_execution::object_store::ObjectStoreUrl; + use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; + use futures::{FutureExt as _, StreamExt as _}; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use crate::file_meta::FileMeta; + use crate::file_stream::{FileOpenFuture, FileOpener, FileStream, OnError}; + use crate::test_util::MockSource; + use arrow::array::RecordBatch; + use arrow::datatypes::Schema; + + use datafusion_common::{assert_batches_eq, internal_err}; + + /// Test `FileOpener` which will simulate errors during file opening or scanning + #[derive(Default)] + struct TestOpener { + /// Index in stream of files which should throw an error while opening + error_opening_idx: Vec, + /// Index in stream of files which should throw an error while scanning + error_scanning_idx: Vec, + /// Index of last file in stream + current_idx: AtomicUsize, + /// `RecordBatch` to return + records: Vec, + } + + impl FileOpener for TestOpener { + fn open(&self, _file_meta: FileMeta) -> Result { + let idx = self.current_idx.fetch_add(1, Ordering::SeqCst); + + if self.error_opening_idx.contains(&idx) { + Ok(futures::future::ready(internal_err!("error opening")).boxed()) + } else if self.error_scanning_idx.contains(&idx) { + let error = futures::future::ready(Err(ArrowError::IpcError( + "error scanning".to_owned(), + ))); + let stream = futures::stream::once(error).boxed(); + 
Ok(futures::future::ready(Ok(stream)).boxed()) + } else { + let iterator = self.records.clone().into_iter().map(Ok); + let stream = futures::stream::iter(iterator).boxed(); + Ok(futures::future::ready(Ok(stream)).boxed()) + } + } + } + + #[derive(Default)] + struct FileStreamTest { + /// Number of files in the stream + num_files: usize, + /// Global limit of records emitted by the stream + limit: Option, + /// Error-handling behavior of the stream + on_error: OnError, + /// Mock `FileOpener` + opener: TestOpener, + } + + impl FileStreamTest { + pub fn new() -> Self { + Self::default() + } + + /// Specify the number of files in the stream + pub fn with_num_files(mut self, num_files: usize) -> Self { + self.num_files = num_files; + self + } + + /// Specify the limit + pub fn with_limit(mut self, limit: Option) -> Self { + self.limit = limit; + self + } + + /// Specify the index of files in the stream which should + /// throw an error when opening + pub fn with_open_errors(mut self, idx: Vec) -> Self { + self.opener.error_opening_idx = idx; + self + } + + /// Specify the index of files in the stream which should + /// throw an error when scanning + pub fn with_scan_errors(mut self, idx: Vec) -> Self { + self.opener.error_scanning_idx = idx; + self + } + + /// Specify the behavior of the stream when an error occurs + pub fn with_on_error(mut self, on_error: OnError) -> Self { + self.on_error = on_error; + self + } + + /// Specify the record batches that should be returned from each + /// file that is successfully scanned + pub fn with_records(mut self, records: Vec) -> Self { + self.opener.records = records; + self + } + + /// Collect the results of the `FileStream` + pub async fn result(self) -> Result> { + let file_schema = self + .opener + .records + .first() + .map(|batch| batch.schema()) + .unwrap_or_else(|| Arc::new(Schema::empty())); + + // let ctx = SessionContext::new(); + let mock_files: Vec<(String, u64)> = (0..self.num_files) + .map(|idx| (format!("mock_file{idx}"), 10_u64)) + .collect(); + + // let mock_files_ref: Vec<(&str, u64)> = mock_files + // .iter() + // .map(|(name, size)| (name.as_str(), *size)) + // .collect(); + + let file_group = mock_files + .into_iter() + .map(|(name, size)| PartitionedFile::new(name, size)) + .collect(); + + let on_error = self.on_error; + + let config = FileScanConfig::new( + ObjectStoreUrl::parse("test:///").unwrap(), + file_schema, + Arc::new(MockSource::default()), + ) + .with_file_group(file_group) + .with_limit(self.limit); + let metrics_set = ExecutionPlanMetricsSet::new(); + let file_stream = + FileStream::new(&config, 0, Arc::new(self.opener), &metrics_set) + .unwrap() + .with_on_error(on_error); + + file_stream + .collect::>() + .await + .into_iter() + .collect::>>() + } + } + + /// helper that creates a stream of 2 files with the same pair of batches in each ([0,1,2] and [0,1]) + async fn create_and_collect(limit: Option) -> Vec { + FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_limit(limit) + .result() + .await + .expect("error executing stream") + } + + #[tokio::test] + async fn on_error_opening() -> Result<()> { + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_open_errors(vec![0]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + 
let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_open_errors(vec![1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_open_errors(vec![0, 1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "++", + "++", + ], &batches); + + Ok(()) + } + + #[tokio::test] + async fn on_error_scanning_fail() -> Result<()> { + let result = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Fail) + .with_scan_errors(vec![1]) + .result() + .await; + + assert!(result.is_err()); + + Ok(()) + } + + #[tokio::test] + async fn on_error_opening_fail() -> Result<()> { + let result = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Fail) + .with_open_errors(vec![1]) + .result() + .await; + + assert!(result.is_err()); + + Ok(()) + } + + #[tokio::test] + async fn on_error_scanning() -> Result<()> { + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_scan_errors(vec![0]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_scan_errors(vec![1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(2) + .with_on_error(OnError::Skip) + .with_scan_errors(vec![0, 1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "++", + "++", + ], &batches); + + Ok(()) + } + + #[tokio::test] + async fn on_error_mixed() -> Result<()> { + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(3) + .with_on_error(OnError::Skip) + .with_open_errors(vec![1]) + .with_scan_errors(vec![0]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(3) + .with_on_error(OnError::Skip) + .with_open_errors(vec![0]) + .with_scan_errors(vec![1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(3) + .with_on_error(OnError::Skip) + .with_open_errors(vec![2]) + .with_scan_errors(vec![0, 1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "++", + "++", + ], &batches); + + 
let batches = FileStreamTest::new() + .with_records(vec![make_partition(3), make_partition(2)]) + .with_num_files(3) + .with_on_error(OnError::Skip) + .with_open_errors(vec![0, 2]) + .with_scan_errors(vec![1]) + .result() + .await?; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "++", + "++", + ], &batches); + + Ok(()) + } + + #[tokio::test] + async fn without_limit() -> Result<()> { + let batches = create_and_collect(None).await; + + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + Ok(()) + } + + #[tokio::test] + async fn with_limit_between_files() -> Result<()> { + let batches = create_and_collect(Some(5)).await; + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "+---+", + ], &batches); + + Ok(()) + } + + #[tokio::test] + async fn with_limit_at_middle_of_batch() -> Result<()> { + let batches = create_and_collect(Some(6)).await; + #[rustfmt::skip] + assert_batches_eq!(&[ + "+---+", + "| i |", + "+---+", + "| 0 |", + "| 1 |", + "| 2 |", + "| 0 |", + "| 1 |", + "| 0 |", + "+---+", + ], &batches); + + Ok(()) + } +} diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index efb178ad078e..182ffebdf461 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -765,28 +765,17 @@ mod memory_source_tests { #[cfg(test)] mod tests { + use crate::tests::{aggr_test_schema, make_partition}; + use super::*; - use arrow::array::{ArrayRef, Int32Array}; + use datafusion_physical_plan::expressions::lit; - use std::collections::HashMap; use arrow::datatypes::{DataType, Field}; use datafusion_common::assert_batches_eq; use datafusion_common::stats::{ColumnStatistics, Precision}; use futures::StreamExt; - // Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i" - pub fn make_partition(sz: i32) -> RecordBatch { - let seq_start = 0; - let seq_end = sz; - let values = (seq_start..seq_end).collect::>(); - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let arr = Arc::new(Int32Array::from(values)); - let arr = arr as ArrayRef; - - RecordBatch::try_new(schema, vec![arr]).unwrap() - } - #[tokio::test] async fn exec_with_limit() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); @@ -813,29 +802,6 @@ mod tests { Ok(()) } - /// Get the schema for the aggregate_test_* csv files - pub fn aggr_test_schema() -> SchemaRef { - let mut f1 = Field::new("c1", DataType::Utf8, false); - f1.set_metadata(HashMap::from_iter(vec![("testing".into(), "test".into())])); - let schema = Schema::new(vec![ - f1, - Field::new("c2", DataType::UInt32, false), - Field::new("c3", DataType::Int8, false), - Field::new("c4", DataType::Int16, false), - Field::new("c5", DataType::Int32, false), - Field::new("c6", DataType::Int64, false), - Field::new("c7", DataType::UInt8, false), - Field::new("c8", DataType::UInt16, false), - Field::new("c9", DataType::UInt32, false), - Field::new("c10", DataType::UInt64, false), - Field::new("c11", DataType::Float32, false), - Field::new("c12", DataType::Float64, false), - Field::new("c13", DataType::Utf8, false), - ]); - - Arc::new(schema) - } - #[tokio::test] async fn values_empty_case() -> Result<()> { let schema = aggr_test_schema(); diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs 
index 8183d7b53244..e60b02f9c9e6 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -24,6 +24,8 @@ //! A table that uses the `ObjectStore` listing capability //! to get the list of files to process. +pub mod display; +pub mod file; pub mod file_compression_type; pub mod file_groups; pub mod file_meta; @@ -32,6 +34,9 @@ pub mod file_sink_config; pub mod file_stream; pub mod memory; pub mod source; +mod statistics; +#[cfg(test)] +mod test_util; pub mod url; pub mod write; use chrono::TimeZone; @@ -187,13 +192,51 @@ impl From for PartitionedFile { #[cfg(test)] mod tests { use super::ListingTableUrl; + use arrow::{ + array::{ArrayRef, Int32Array, RecordBatch}, + datatypes::{DataType, Field, Schema, SchemaRef}, + }; use datafusion_execution::object_store::{ DefaultObjectStoreRegistry, ObjectStoreRegistry, }; use object_store::{local::LocalFileSystem, path::Path}; - use std::{ops::Not, sync::Arc}; + use std::{collections::HashMap, ops::Not, sync::Arc}; use url::Url; + /// Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i" + pub fn make_partition(sz: i32) -> RecordBatch { + let seq_start = 0; + let seq_end = sz; + let values = (seq_start..seq_end).collect::>(); + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let arr = Arc::new(Int32Array::from(values)); + + RecordBatch::try_new(schema, vec![arr as ArrayRef]).unwrap() + } + + /// Get the schema for the aggregate_test_* csv files + pub fn aggr_test_schema() -> SchemaRef { + let mut f1 = Field::new("c1", DataType::Utf8, false); + f1.set_metadata(HashMap::from_iter(vec![("testing".into(), "test".into())])); + let schema = Schema::new(vec![ + f1, + Field::new("c2", DataType::UInt32, false), + Field::new("c3", DataType::Int8, false), + Field::new("c4", DataType::Int16, false), + Field::new("c5", DataType::Int32, false), + Field::new("c6", DataType::Int64, false), + Field::new("c7", DataType::UInt8, false), + Field::new("c8", DataType::UInt16, false), + Field::new("c9", DataType::UInt32, false), + Field::new("c10", DataType::UInt64, false), + Field::new("c11", DataType::Float32, false), + Field::new("c12", DataType::Float64, false), + Field::new("c13", DataType::Utf8, false), + ]); + + Arc::new(schema) + } + #[test] fn test_object_store_listing_url() { let listing = ListingTableUrl::parse("file:///").unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/statistics.rs b/datafusion/datasource/src/statistics.rs similarity index 97% rename from datafusion/core/src/datasource/physical_plan/statistics.rs rename to datafusion/datasource/src/statistics.rs index 5811c19be408..9df5aa993d43 100644 --- a/datafusion/core/src/datasource/physical_plan/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -15,18 +15,14 @@ // specific language governing permissions and limitations // under the License. -/*! - * - * Use statistics to optimize physical planning. - * - * Currently, this module houses code to sort file groups if they are non-overlapping with - * respect to the required sort order. See [`MinMaxStatistics`] - * -*/ +//! Use statistics to optimize physical planning. +//! +//! Currently, this module houses code to sort file groups if they are non-overlapping with +//! respect to the required sort order. 
See [`MinMaxStatistics`] use std::sync::Arc; -use crate::datasource::listing::PartitionedFile; +use crate::PartitionedFile; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs new file mode 100644 index 000000000000..ab025069bf76 --- /dev/null +++ b/datafusion/datasource/src/test_util.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion_common::Statistics; +use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use object_store::ObjectStore; + +use crate::{ + file::FileSource, file_scan_config::FileScanConfig, file_stream::FileOpener, +}; +use datafusion_common::Result; + +/// Minimal [`FileSource`] implementation for use in tests. +#[derive(Clone, Default)] +pub struct MockSource { + metrics: ExecutionPlanMetricsSet, + projected_statistics: Option, +} + +impl FileSource for MockSource { + fn create_file_opener( + &self, + _object_store: Arc, + _base_config: &FileScanConfig, + _partition: usize, + ) -> Arc { + unimplemented!() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn with_batch_size(&self, _batch_size: usize) -> Arc { + Arc::new(Self { ..self.clone() }) + } + + fn with_schema(&self, _schema: SchemaRef) -> Arc { + Arc::new(Self { ..self.clone() }) + } + + fn with_projection(&self, _config: &FileScanConfig) -> Arc { + Arc::new(Self { ..self.clone() }) + } + + fn with_statistics(&self, statistics: Statistics) -> Arc { + let mut source = self.clone(); + source.projected_statistics = Some(statistics); + Arc::new(source) + } + + fn metrics(&self) -> &ExecutionPlanMetricsSet { + &self.metrics + } + + fn statistics(&self) -> Result { + Ok(self + .projected_statistics + .as_ref() + .expect("projected_statistics must be set") + .clone()) + } + + fn file_type(&self) -> &str { + "mock" + } +} diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 34fb5bb6ddc1..ce01865b8c73 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -26,14 +26,13 @@ use object_store::path::Path; use object_store::ObjectMeta; use datafusion::arrow::datatypes::Schema; -use datafusion::datasource::data_source::FileSource; use datafusion::datasource::file_format::csv::CsvSink; use datafusion::datasource::file_format::json::JsonSink; #[cfg(feature = "parquet")] use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::listing::{FileRange, ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{FileScanConfig, 
FileSinkConfig}; +use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig, FileSource}; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::WindowFunctionDefinition; use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr}; From 1fedb4e000293e3997b477d87d575f3a5453171e Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 25 Feb 2025 21:49:18 +0800 Subject: [PATCH 65/71] Counting elapsed_compute in BoundedWindowAggExec (#14869) --- .../physical-plan/src/windows/bounded_window_agg_exec.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index c78c870ff383..0d9c58b3bf49 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -998,8 +998,13 @@ impl BoundedWindowAggStream { return Poll::Ready(None); } + let elapsed_compute = self.baseline_metrics.elapsed_compute().clone(); match ready!(self.input.poll_next_unpin(cx)) { Some(Ok(batch)) => { + // Start the timer for compute time within this operator. It will be + // stopped when dropped. + let _timer = elapsed_compute.timer(); + self.search_mode.update_partition_batch( &mut self.input_buffer, batch, @@ -1013,6 +1018,8 @@ impl BoundedWindowAggStream { } Some(Err(e)) => Poll::Ready(Some(Err(e))), None => { + let _timer = elapsed_compute.timer(); + self.finished = true; for (_, partition_batch_state) in self.partition_buffers.iter_mut() { partition_batch_state.is_end = true; From f1f6e5e46320094a8ef8c3e9ac710d7ff98c4d57 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Tue, 25 Feb 2025 22:39:46 +0800 Subject: [PATCH 66/71] Optimize `gcd` for array and scalar case by avoiding `make_scalar_function` where has unnecessary conversion between scalar and array (#14834) * optimize gcd * fmt * add feature * Use try_binary to make gcd even faster * rm length check --------- Co-authored-by: Andrew Lamb --- datafusion/functions/Cargo.toml | 5 + datafusion/functions/benches/gcd.rs | 92 +++++++++++++ datafusion/functions/src/math/gcd.rs | 135 +++++++++----------- datafusion/sqllogictest/test_files/math.slt | 4 +- 4 files changed, 158 insertions(+), 78 deletions(-) create mode 100644 datafusion/functions/benches/gcd.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index b44127d6a1b7..3208f2dd169f 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -113,6 +113,11 @@ harness = false name = "chr" required-features = ["string_expressions"] +[[bench]] +harness = false +name = "gcd" +required-features = ["math_expressions"] + [[bench]] harness = false name = "uuid" diff --git a/datafusion/functions/benches/gcd.rs b/datafusion/functions/benches/gcd.rs new file mode 100644 index 000000000000..f8c855c82ad4 --- /dev/null +++ b/datafusion/functions/benches/gcd.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::{ + array::{ArrayRef, Int64Array}, + datatypes::DataType, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ScalarValue; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::math::gcd; +use rand::Rng; +use std::sync::Arc; + +fn generate_i64_array(n_rows: usize) -> ArrayRef { + let mut rng = rand::thread_rng(); + let values = (0..n_rows) + .map(|_| rng.gen_range(0..1000)) + .collect::>(); + Arc::new(Int64Array::from(values)) as ArrayRef +} + +fn criterion_benchmark(c: &mut Criterion) { + let n_rows = 100000; + let array_a = ColumnarValue::Array(generate_i64_array(n_rows)); + let array_b = ColumnarValue::Array(generate_i64_array(n_rows)); + let udf = gcd(); + + c.bench_function("gcd both array", |b| { + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: vec![array_a.clone(), array_b.clone()], + number_rows: 0, + return_type: &DataType::Int64, + }) + .expect("date_bin should work on valid values"), + ) + }) + }); + + // 10! = 3628800 + let scalar_b = ColumnarValue::Scalar(ScalarValue::Int64(Some(3628800))); + + c.bench_function("gcd array and scalar", |b| { + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: vec![array_a.clone(), scalar_b.clone()], + number_rows: 0, + return_type: &DataType::Int64, + }) + .expect("date_bin should work on valid values"), + ) + }) + }); + + // scalar and scalar + let scalar_a = ColumnarValue::Scalar(ScalarValue::Int64(Some(3628800))); + + c.bench_function("gcd both scalar", |b| { + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: vec![scalar_a.clone(), scalar_b.clone()], + number_rows: 0, + return_type: &DataType::Int64, + }) + .expect("date_bin should work on valid values"), + ) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/math/gcd.rs b/datafusion/functions/src/math/gcd.rs index 911e00308ab7..7fe253b4afbc 100644 --- a/datafusion/functions/src/math/gcd.rs +++ b/datafusion/functions/src/math/gcd.rs @@ -15,19 +15,15 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::{ArrayRef, Int64Array};
+use arrow::array::{new_null_array, ArrayRef, AsArray, Int64Array, PrimitiveArray};
+use arrow::compute::try_binary;
+use arrow::datatypes::{DataType, Int64Type};
 use arrow::error::ArrowError;
 use std::any::Any;
 use std::mem::swap;
 use std::sync::Arc;
 
-use arrow::datatypes::DataType;
-use arrow::datatypes::DataType::Int64;
-
-use crate::utils::make_scalar_function;
-use datafusion_common::{
-    arrow_datafusion_err, exec_err, internal_datafusion_err, DataFusionError, Result,
-};
+use datafusion_common::{exec_err, internal_datafusion_err, Result, ScalarValue};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
@@ -54,9 +50,12 @@ impl Default for GcdFunc {
 
 impl GcdFunc {
     pub fn new() -> Self {
-        use DataType::*;
         Self {
-            signature: Signature::uniform(2, vec![Int64], Volatility::Immutable),
+            signature: Signature::uniform(
+                2,
+                vec![DataType::Int64],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -75,11 +74,34 @@ impl ScalarUDFImpl for GcdFunc {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(Int64)
+        Ok(DataType::Int64)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(gcd, vec![])(&args.args)
+        let args: [ColumnarValue; 2] = args.args.try_into().map_err(|_| {
+            internal_datafusion_err!("Expected 2 arguments for function gcd")
+        })?;
+
+        match args {
+            [ColumnarValue::Array(a), ColumnarValue::Array(b)] => {
+                compute_gcd_for_arrays(&a, &b)
+            }
+            [ColumnarValue::Scalar(ScalarValue::Int64(a)), ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
+                match (a, b) {
+                    (Some(a), Some(b)) => Ok(ColumnarValue::Scalar(ScalarValue::Int64(
+                        Some(compute_gcd(a, b)?),
+                    ))),
+                    _ => Ok(ColumnarValue::Scalar(ScalarValue::Int64(None))),
+                }
+            }
+            [ColumnarValue::Array(a), ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
+                compute_gcd_with_scalar(&a, b)
+            }
+            [ColumnarValue::Scalar(ScalarValue::Int64(a)), ColumnarValue::Array(b)] => {
+                compute_gcd_with_scalar(&b, a)
+            }
+            _ => exec_err!("Unsupported argument types for function gcd"),
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -87,24 +109,34 @@ impl ScalarUDFImpl for GcdFunc {
     }
 }
 
-/// Gcd SQL function
-fn gcd(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        Int64 => {
-            let arg1 = downcast_named_arg!(&args[0], "x", Int64Array);
-            let arg2 = downcast_named_arg!(&args[1], "y", Int64Array);
+fn compute_gcd_for_arrays(a: &ArrayRef, b: &ArrayRef) -> Result<ColumnarValue> {
+    let a = a.as_primitive::<Int64Type>();
+    let b = b.as_primitive::<Int64Type>();
+    try_binary(a, b, compute_gcd)
+        .map(|arr: PrimitiveArray<Int64Type>| {
+            ColumnarValue::Array(Arc::new(arr) as ArrayRef)
+        })
+        .map_err(Into::into) // convert ArrowError to DataFusionError
+}
 
-            Ok(arg1
+fn compute_gcd_with_scalar(arr: &ArrayRef, scalar: Option<i64>) -> Result<ColumnarValue> {
+    match scalar {
+        Some(scalar_value) => {
+            let result: Result<Int64Array> = arr
+                .as_primitive::<Int64Type>()
                 .iter()
-                .zip(arg2.iter())
-                .map(|(a1, a2)| match (a1, a2) {
-                    (Some(a1), Some(a2)) => Ok(Some(compute_gcd(a1, a2)?)),
+                .map(|val| match val {
+                    Some(val) => Ok(Some(compute_gcd(val, scalar_value)?)),
                     _ => Ok(None),
                 })
-                .collect::<Result<Int64Array>>()
-                .map(Arc::new)? 
as ArrayRef)
+                .collect();
+
+            result.map(|arr| ColumnarValue::Array(Arc::new(arr) as ArrayRef))
         }
-        other => exec_err!("Unsupported data type {other:?} for function gcd"),
+        None => Ok(ColumnarValue::Array(new_null_array(
+            &DataType::Int64,
+            arr.len(),
+        ))),
     }
 }
 
@@ -132,61 +164,12 @@ pub(super) fn unsigned_gcd(mut a: u64, mut b: u64) -> u64 {
 }
 
 /// Computes greatest common divisor using Binary GCD algorithm.
-pub fn compute_gcd(x: i64, y: i64) -> Result<i64> {
+pub fn compute_gcd(x: i64, y: i64) -> Result<i64, ArrowError> {
     let a = x.unsigned_abs();
     let b = y.unsigned_abs();
     let r = unsigned_gcd(a, b);
     // gcd(i64::MIN, i64::MIN) = i64::MIN.unsigned_abs() cannot fit into i64
     r.try_into().map_err(|_| {
-        arrow_datafusion_err!(ArrowError::ComputeError(format!(
-            "Signed integer overflow in GCD({x}, {y})"
-        )))
+        ArrowError::ComputeError(format!("Signed integer overflow in GCD({x}, {y})"))
     })
 }
-
-#[cfg(test)]
-mod test {
-    use std::sync::Arc;
-
-    use arrow::{
-        array::{ArrayRef, Int64Array},
-        error::ArrowError,
-    };
-
-    use crate::math::gcd::gcd;
-    use datafusion_common::{cast::as_int64_array, DataFusionError};
-
-    #[test]
-    fn test_gcd_i64() {
-        let args: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![0, 3, 25, -16])), // x
-            Arc::new(Int64Array::from(vec![0, -2, 15, 8])), // y
-        ];
-
-        let result = gcd(&args).expect("failed to initialize function gcd");
-        let ints = as_int64_array(&result).expect("failed to initialize function gcd");
-
-        assert_eq!(ints.len(), 4);
-        assert_eq!(ints.value(0), 0);
-        assert_eq!(ints.value(1), 1);
-        assert_eq!(ints.value(2), 5);
-        assert_eq!(ints.value(3), 8);
-    }
-
-    #[test]
-    fn overflow_on_both_param_i64_min() {
-        let args: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![i64::MIN])), // x
-            Arc::new(Int64Array::from(vec![i64::MIN])), // y
-        ];
-
-        match gcd(&args) {
-            // we expect a overflow
-            Err(DataFusionError::ArrowError(ArrowError::ComputeError(_), _)) => {}
-            Err(_) => {
-                panic!("failed to initialize function gcd")
-            }
-            Ok(_) => panic!("GCD({0}, {0}) should have overflown", i64::MIN),
-        };
-    }
-}
diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt
index a3cf1a4e573f..a49e0a642106 100644
--- a/datafusion/sqllogictest/test_files/math.slt
+++ b/datafusion/sqllogictest/test_files/math.slt
@@ -623,12 +623,12 @@ select 1 1 1
 
 # gcd with columns and expresions
-query II rowsort
+query II
 select gcd(a, b), gcd(c*d + 1, abs(e)) + f from signed_integers;
 ----
 1 11
-1 13
 2 -10
+1 13
 NULL NULL
 
 # gcd(i64::MIN, i64::MIN)

From adad8a49124d97a36ca585b77c10a4bfe0fe7286 Mon Sep 17 00:00:00 2001
From: Amos Aidoo 
Date: Tue, 25 Feb 2025 16:10:22 +0100
Subject: [PATCH 67/71] refactor: replace OnceLock with LazyLock (#14870)

* refactor: replace OnceLock with LazyLock

* fix: apply fixes from cargo fmt and clippy
---
 datafusion/functions/src/math/monotonicity.rs | 474 +++++++++---------
 1 file changed, 240 insertions(+), 234 deletions(-)

diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs
index 7c87d025e929..baa3147f6258 100644
--- a/datafusion/functions/src/math/monotonicity.rs
+++ b/datafusion/functions/src/math/monotonicity.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::sync::OnceLock; +use std::sync::LazyLock; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; @@ -38,18 +38,18 @@ pub fn acos_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ACOS: OnceLock = OnceLock::new(); +static DOCUMENTATION_ACOS: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the arc cosine or inverse cosine of a number.", + "acos(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_acos_doc() -> &'static Documentation { - DOCUMENTATION_ACOS.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the arc cosine or inverse cosine of a number.", - "acos(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ACOS } /// Non-decreasing for x ≥ 1, undefined otherwise. @@ -69,18 +69,18 @@ pub fn acosh_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ACOSH: OnceLock = OnceLock::new(); +static DOCUMENTATION_ACOSH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number.", + "acosh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_acosh_doc() -> &'static Documentation { - DOCUMENTATION_ACOSH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number.", - - "acosh(numeric_expression)") - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ACOSH } /// Non-decreasing on the interval \[−1, 1\], undefined otherwise. @@ -98,18 +98,18 @@ pub fn asin_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ASIN: OnceLock = OnceLock::new(); +static DOCUMENTATION_ASIN: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the arc sine or inverse sine of a number.", + "asin(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_asin_doc() -> &'static Documentation { - DOCUMENTATION_ASIN.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the arc sine or inverse sine of a number.", - "asin(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ASIN } /// Non-decreasing for all real numbers. @@ -117,18 +117,18 @@ pub fn asinh_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_ASINH: OnceLock = OnceLock::new(); +static DOCUMENTATION_ASINH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the area hyperbolic sine or inverse hyperbolic sine of a number.", + "asinh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_asinh_doc() -> &'static Documentation { - DOCUMENTATION_ASINH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the area hyperbolic sine or inverse hyperbolic sine of a number.", - "asinh(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ASINH } /// Non-decreasing for all real numbers. 
@@ -136,18 +136,18 @@ pub fn atan_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_ATAN: OnceLock = OnceLock::new(); +static DOCUMENTATION_ATAN: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the arc tangent or inverse tangent of a number.", + "atan(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_atan_doc() -> &'static Documentation { - DOCUMENTATION_ATAN.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the arc tangent or inverse tangent of a number.", - "atan(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ATAN } /// Non-decreasing on the interval \[−1, 1\], undefined otherwise. @@ -165,18 +165,18 @@ pub fn atanh_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ATANH: OnceLock = OnceLock::new(); +static DOCUMENTATION_ATANH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number.", + "atanh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_atanh_doc() -> &'static Documentation { - DOCUMENTATION_ATANH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number.", - - "atanh(numeric_expression)") - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_ATANH } /// Order depends on the quadrant. @@ -185,21 +185,27 @@ pub fn atan2_order(_input: &[ExprProperties]) -> Result { Ok(SortProperties::Unordered) } -static DOCUMENTATION_ATANH2: OnceLock = OnceLock::new(); +static DOCUMENTATION_ATANH2: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the arc tangent or inverse tangent of `expression_y / expression_x`.", + "atan2(expression_y, expression_x)", + ) + .with_argument( + "expression_y", + r#"First numeric expression to operate on. +Can be a constant, column, or function, and any combination of arithmetic operators."#, + ) + .with_argument( + "expression_x", + r#"Second numeric expression to operate on. +Can be a constant, column, or function, and any combination of arithmetic operators."#, + ) + .build() +}); pub fn get_atan2_doc() -> &'static Documentation { - DOCUMENTATION_ATANH2.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the arc tangent or inverse tangent of `expression_y / expression_x`.", - - "atan2(expression_y, expression_x)") - .with_argument("expression_y", r#"First numeric expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators."#) - .with_argument("expression_x", r#"Second numeric expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators."#) - .build() - }) + &DOCUMENTATION_ATANH2 } /// Non-decreasing for all real numbers. 
@@ -207,18 +213,18 @@ pub fn cbrt_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_CBRT: OnceLock = OnceLock::new(); +static DOCUMENTATION_CBRT: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the cube root of a number.", + "cbrt(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_cbrt_doc() -> &'static Documentation { - DOCUMENTATION_CBRT.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the cube root of a number.", - "cbrt(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_CBRT } /// Non-decreasing for all real numbers. @@ -226,18 +232,18 @@ pub fn ceil_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_CEIL: OnceLock = OnceLock::new(); +static DOCUMENTATION_CEIL: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the nearest integer greater than or equal to a number.", + "ceil(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_ceil_doc() -> &'static Documentation { - DOCUMENTATION_CEIL.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the nearest integer greater than or equal to a number.", - "ceil(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_CEIL } /// Non-increasing on \[0, π\] and then non-decreasing on \[π, 2π\]. @@ -247,18 +253,18 @@ pub fn cos_order(_input: &[ExprProperties]) -> Result { Ok(SortProperties::Unordered) } -static DOCUMENTATION_COS: OnceLock = OnceLock::new(); +static DOCUMENTATION_COS: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the cosine of a number.", + "cos(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_cos_doc() -> &'static Documentation { - DOCUMENTATION_COS.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the cosine of a number.", - "cos(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_COS } /// Non-decreasing for x ≥ 0 and symmetrically non-increasing for x ≤ 0. @@ -277,18 +283,18 @@ pub fn cosh_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_COSH: OnceLock = OnceLock::new(); +static DOCUMENTATION_COSH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the hyperbolic cosine of a number.", + "cosh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_cosh_doc() -> &'static Documentation { - DOCUMENTATION_COSH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the hyperbolic cosine of a number.", - "cosh(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_COSH } /// Non-decreasing function that converts radians to degrees. 
@@ -296,18 +302,18 @@ pub fn degrees_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_DEGREES: OnceLock = OnceLock::new(); +static DOCUMENTATION_DEGREES: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Converts radians to degrees.", + "degrees(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_degrees_doc() -> &'static Documentation { - DOCUMENTATION_DEGREES.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Converts radians to degrees.", - "degrees(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_DEGREES } /// Non-decreasing for all real numbers. @@ -315,18 +321,18 @@ pub fn exp_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_EXP: OnceLock = OnceLock::new(); +static DOCUMENTATION_EXP: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the base-e exponential of a number.", + "exp(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_exp_doc() -> &'static Documentation { - DOCUMENTATION_EXP.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the base-e exponential of a number.", - "exp(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_EXP } /// Non-decreasing for all real numbers. @@ -334,18 +340,18 @@ pub fn floor_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_FLOOR: OnceLock = OnceLock::new(); +static DOCUMENTATION_FLOOR: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the nearest integer less than or equal to a number.", + "floor(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_floor_doc() -> &'static Documentation { - DOCUMENTATION_FLOOR.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the nearest integer less than or equal to a number.", - "floor(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_FLOOR } /// Non-decreasing for x ≥ 0, undefined otherwise. @@ -362,18 +368,18 @@ pub fn ln_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_LN: OnceLock = OnceLock::new(); +static DOCUMENTATION_LN: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the natural logarithm of a number.", + "ln(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_ln_doc() -> &'static Documentation { - DOCUMENTATION_LN.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the natural logarithm of a number.", - "ln(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_LN } /// Non-decreasing for x ≥ 0, undefined otherwise. 
@@ -390,18 +396,18 @@ pub fn log2_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_LOG2: OnceLock = OnceLock::new(); +static DOCUMENTATION_LOG2: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the base-2 logarithm of a number.", + "log2(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_log2_doc() -> &'static Documentation { - DOCUMENTATION_LOG2.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the base-2 logarithm of a number.", - "log2(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_LOG2 } /// Non-decreasing for x ≥ 0, undefined otherwise. @@ -418,18 +424,18 @@ pub fn log10_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_LOG10: OnceLock = OnceLock::new(); +static DOCUMENTATION_LOG10: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the base-10 logarithm of a number.", + "log10(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_log10_doc() -> &'static Documentation { - DOCUMENTATION_LOG10.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the base-10 logarithm of a number.", - "log10(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_LOG10 } /// Non-decreasing for all real numbers x. @@ -437,18 +443,18 @@ pub fn radians_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_RADIONS: OnceLock = OnceLock::new(); +static DOCUMENTATION_RADIONS: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Converts degrees to radians.", + "radians(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_radians_doc() -> &'static Documentation { - DOCUMENTATION_RADIONS.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Converts degrees to radians.", - "radians(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_RADIONS } /// Non-decreasing on \[0, π\] and then non-increasing on \[π, 2π\]. @@ -458,18 +464,18 @@ pub fn sin_order(_input: &[ExprProperties]) -> Result { Ok(SortProperties::Unordered) } -static DOCUMENTATION_SIN: OnceLock = OnceLock::new(); +static DOCUMENTATION_SIN: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the sine of a number.", + "sin(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_sin_doc() -> &'static Documentation { - DOCUMENTATION_SIN.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the sine of a number.", - "sin(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_SIN } /// Non-decreasing for all real numbers. 
@@ -477,18 +483,18 @@ pub fn sinh_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_SINH: OnceLock = OnceLock::new(); +static DOCUMENTATION_SINH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the hyperbolic sine of a number.", + "sinh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_sinh_doc() -> &'static Documentation { - DOCUMENTATION_SINH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the hyperbolic sine of a number.", - "sinh(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_SINH } /// Non-decreasing for x ≥ 0, undefined otherwise. @@ -505,18 +511,18 @@ pub fn sqrt_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_SQRT: OnceLock = OnceLock::new(); +static DOCUMENTATION_SQRT: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the square root of a number.", + "sqrt(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_sqrt_doc() -> &'static Documentation { - DOCUMENTATION_SQRT.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the square root of a number.", - "sqrt(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_SQRT } /// Non-decreasing between vertical asymptotes at x = k * π ± π / 2 for any @@ -526,18 +532,18 @@ pub fn tan_order(_input: &[ExprProperties]) -> Result { Ok(SortProperties::Unordered) } -static DOCUMENTATION_TAN: OnceLock = OnceLock::new(); +static DOCUMENTATION_TAN: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the tangent of a number.", + "tan(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_tan_doc() -> &'static Documentation { - DOCUMENTATION_TAN.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the tangent of a number.", - "tan(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_TAN } /// Non-decreasing for all real numbers. 
@@ -545,18 +551,18 @@ pub fn tanh_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_TANH: OnceLock = OnceLock::new(); +static DOCUMENTATION_TANH: LazyLock = LazyLock::new(|| { + Documentation::builder( + DOC_SECTION_MATH, + "Returns the hyperbolic tangent of a number.", + "tanh(numeric_expression)", + ) + .with_standard_argument("numeric_expression", Some("Numeric")) + .build() +}); pub fn get_tanh_doc() -> &'static Documentation { - DOCUMENTATION_TANH.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the hyperbolic tangent of a number.", - "tanh(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) + &DOCUMENTATION_TANH } #[cfg(test)] From fcaefccedfae308f44e8d72030e8b385487d3fc8 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Tue, 25 Feb 2025 23:53:58 +0800 Subject: [PATCH 68/71] Add polygon.io to user list (#14871) --- docs/source/user-guide/introduction.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index bed9233b9c23..7b60bc838f49 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -114,6 +114,7 @@ Here are some active projects using DataFusion: - [OpenObserve](https://github.com/openobserve/openobserve) Distributed cloud native observability platform - [ParadeDB](https://github.com/paradedb/paradedb) PostgreSQL for Search & Analytics - [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform +- [Polygon.io](https://polygon.io/) Stock Market API - [qv](https://github.com/timvw/qv) Quickly view your data - [Restate](https://github.com/restatedev) Easily build resilient applications using distributed durable async/await - [ROAPI](https://github.com/roapi/roapi) From 3a1d9477b0f1bb2ab143342ed39777741d31a2d7 Mon Sep 17 00:00:00 2001 From: Leonid Ryzhyk Date: Tue, 25 Feb 2025 10:06:06 -0800 Subject: [PATCH 69/71] Workaround for compilation error due to rkyv#434. (#14863) When datafusion is used in a workspace that enables the `rkyv-64` feature in the `chrono` crate, this triggered a Rust compilation error: ``` error[E0277]: can't compare `Option<&std::string::String>` with `Option<&mut std::string::String>`. ``` The root cause of the error is incorrect type unification in the Rust compiler, as explained in https://github.com/rkyv/rkyv/issues/434. The workaround pushes the compiler in the right direction by converting the mutable reference to an immutable one manually. Signed-off-by: Leonid Ryzhyk Co-authored-by: Leonid Ryzhyk --- datafusion/expr/src/logical_plan/plan.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 870b0751c923..c6fd95595233 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2869,7 +2869,11 @@ fn intersect_maps<'a>( let mut inputs = inputs.into_iter(); let mut merged: HashMap = inputs.next().cloned().unwrap_or_default(); for input in inputs { - merged.retain(|k, v| input.get(k) == Some(v)); + // The extra dereference below (`&*v`) is a workaround for https://github.com/rkyv/rkyv/issues/434. 
+ // When this crate is used in a workspace that enables the `rkyv-64` feature in the `chrono` crate, + // this triggers a Rust compilation error: + // error[E0277]: can't compare `Option<&std::string::String>` with `Option<&mut std::string::String>`. + merged.retain(|k, v| input.get(k) == Some(&*v)); } merged } From fc2f9dd8cc279a7a3f53509bae3fa275a0585509 Mon Sep 17 00:00:00 2001 From: Anlin Chen <31682964+anlinc@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:13:32 -0500 Subject: [PATCH 70/71] fix(substrait): Do not add implicit groupBy expressions in `LogicalPlanBuilder` or when building logical plans from Substrait (#14860) * feat: add add_implicit_group_by_exprs option to logical plan builder * fix: do not add implicity group by exprs in substrait path * test: add substrait tests * test: add builder option tests * style: clippy errors --- datafusion/core/src/dataframe/mod.rs | 6 +- datafusion/expr/src/logical_plan/builder.rs | 121 +++++++++- datafusion/expr/src/logical_plan/mod.rs | 2 +- datafusion/sql/src/select.rs | 9 +- .../substrait/tests/cases/logical_plans.rs | 18 ++ .../tests/cases/roundtrip_logical_plan.rs | 11 + .../multilayer_aggregate.substrait.json | 213 ++++++++++++++++++ 7 files changed, 368 insertions(+), 12 deletions(-) create mode 100644 datafusion/substrait/tests/testdata/test_plans/multilayer_aggregate.substrait.json diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 6f540fa02c75..b6949d2eea9c 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -33,7 +33,8 @@ use crate::execution::context::{SessionState, TaskContext}; use crate::execution::FunctionRegistry; use crate::logical_expr::utils::find_window_exprs; use crate::logical_expr::{ - col, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Partitioning, TableType, + col, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions, + Partitioning, TableType, }; use crate::physical_plan::{ collect, collect_partitioned, execute_stream, execute_stream_partitioned, @@ -526,7 +527,10 @@ impl DataFrame { ) -> Result { let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]); let aggr_expr_len = aggr_expr.len(); + let options = + LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true); let plan = LogicalPlanBuilder::from(self.plan) + .with_options(options) .aggregate(group_expr, aggr_expr)? 
.build()?; let plan = if is_grouping_set { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index da30f2d7a712..4d825c6bfe49 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -53,8 +53,8 @@ use datafusion_common::display::ToStringifiedPlan; use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ exec_err, get_target_functional_dependencies, internal_err, not_impl_err, - plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, - Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, + plan_datafusion_err, plan_err, Column, Constraints, DFSchema, DFSchemaRef, + DataFusionError, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; use datafusion_expr_common::type_coercion::binary::type_union_resolution; @@ -63,6 +63,26 @@ use indexmap::IndexSet; /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; +/// Options for [`LogicalPlanBuilder`] +#[derive(Default, Debug, Clone)] +pub struct LogicalPlanBuilderOptions { + /// Flag indicating whether the plan builder should add + /// functionally dependent expressions as additional aggregation groupings. + add_implicit_group_by_exprs: bool, +} + +impl LogicalPlanBuilderOptions { + pub fn new() -> Self { + Default::default() + } + + /// Should the builder add functionally dependent expressions as additional aggregation groupings. + pub fn with_add_implicit_group_by_exprs(mut self, add: bool) -> Self { + self.add_implicit_group_by_exprs = add; + self + } +} + /// Builder for logical plans /// /// # Example building a simple plan @@ -103,6 +123,7 @@ pub const UNNAMED_TABLE: &str = "?table?"; #[derive(Debug, Clone)] pub struct LogicalPlanBuilder { plan: Arc, + options: LogicalPlanBuilderOptions, } impl LogicalPlanBuilder { @@ -110,12 +131,21 @@ impl LogicalPlanBuilder { pub fn new(plan: LogicalPlan) -> Self { Self { plan: Arc::new(plan), + options: LogicalPlanBuilderOptions::default(), } } /// Create a builder from an existing plan pub fn new_from_arc(plan: Arc) -> Self { - Self { plan } + Self { + plan, + options: LogicalPlanBuilderOptions::default(), + } + } + + pub fn with_options(mut self, options: LogicalPlanBuilderOptions) -> Self { + self.options = options; + self } /// Return the output schema of the plan build so far @@ -1138,8 +1168,12 @@ impl LogicalPlanBuilder { let group_expr = normalize_cols(group_expr, &self.plan)?; let aggr_expr = normalize_cols(aggr_expr, &self.plan)?; - let group_expr = - add_group_by_exprs_from_dependencies(group_expr, self.plan.schema())?; + let group_expr = if self.options.add_implicit_group_by_exprs { + add_group_by_exprs_from_dependencies(group_expr, self.plan.schema())? + } else { + group_expr + }; + Aggregate::try_new(self.plan, group_expr, aggr_expr) .map(LogicalPlan::Aggregate) .map(Self::new) @@ -1550,6 +1584,7 @@ pub fn add_group_by_exprs_from_dependencies( } Ok(group_expr) } + /// Errors if one or more expressions have equal names. 
pub fn validate_unique_names<'a>( node_name: &str, @@ -1685,7 +1720,21 @@ pub fn table_scan_with_filter_and_fetch( pub fn table_source(table_schema: &Schema) -> Arc { let table_schema = Arc::new(table_schema.clone()); - Arc::new(LogicalTableSource { table_schema }) + Arc::new(LogicalTableSource { + table_schema, + constraints: Default::default(), + }) +} + +pub fn table_source_with_constraints( + table_schema: &Schema, + constraints: Constraints, +) -> Arc { + let table_schema = Arc::new(table_schema.clone()); + Arc::new(LogicalTableSource { + table_schema, + constraints, + }) } /// Wrap projection for a plan, if the join keys contains normal expression. @@ -1756,12 +1805,21 @@ pub fn wrap_projection_for_join_if_necessary( /// DefaultTableSource. pub struct LogicalTableSource { table_schema: SchemaRef, + constraints: Constraints, } impl LogicalTableSource { /// Create a new LogicalTableSource pub fn new(table_schema: SchemaRef) -> Self { - Self { table_schema } + Self { + table_schema, + constraints: Constraints::default(), + } + } + + pub fn with_constraints(mut self, constraints: Constraints) -> Self { + self.constraints = constraints; + self } } @@ -1774,6 +1832,10 @@ impl TableSource for LogicalTableSource { Arc::clone(&self.table_schema) } + fn constraints(&self) -> Option<&Constraints> { + Some(&self.constraints) + } + fn supports_filters_pushdown( &self, filters: &[&Expr], @@ -2023,12 +2085,12 @@ pub fn unnest_with_options( #[cfg(test)] mod tests { - use super::*; use crate::logical_plan::StringifiedPlan; use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery}; - use datafusion_common::{RecursionUnnestOption, SchemaError}; + use crate::test::function_stub::sum; + use datafusion_common::{Constraint, RecursionUnnestOption, SchemaError}; #[test] fn plan_builder_simple() -> Result<()> { @@ -2575,4 +2637,45 @@ mod tests { Ok(()) } + + #[test] + fn plan_builder_aggregate_without_implicit_group_by_exprs() -> Result<()> { + let constraints = + Constraints::new_unverified(vec![Constraint::PrimaryKey(vec![0])]); + let table_source = table_source_with_constraints(&employee_schema(), constraints); + + let plan = + LogicalPlanBuilder::scan("employee_csv", table_source, Some(vec![0, 3, 4]))? + .aggregate(vec![col("id")], vec![sum(col("salary"))])? + .build()?; + + let expected = + "Aggregate: groupBy=[[employee_csv.id]], aggr=[[sum(employee_csv.salary)]]\ + \n TableScan: employee_csv projection=[id, state, salary]"; + assert_eq!(expected, format!("{plan}")); + + Ok(()) + } + + #[test] + fn plan_builder_aggregate_with_implicit_group_by_exprs() -> Result<()> { + let constraints = + Constraints::new_unverified(vec![Constraint::PrimaryKey(vec![0])]); + let table_source = table_source_with_constraints(&employee_schema(), constraints); + + let options = + LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true); + let plan = + LogicalPlanBuilder::scan("employee_csv", table_source, Some(vec![0, 3, 4]))? + .with_options(options) + .aggregate(vec![col("id")], vec![sum(col("salary"))])? 
+ .build()?; + + let expected = + "Aggregate: groupBy=[[employee_csv.id, employee_csv.state, employee_csv.salary]], aggr=[[sum(employee_csv.salary)]]\ + \n TableScan: employee_csv projection=[id, state, salary]"; + assert_eq!(expected, format!("{plan}")); + + Ok(()) + } } diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs index 404941378663..916b2131be04 100644 --- a/datafusion/expr/src/logical_plan/mod.rs +++ b/datafusion/expr/src/logical_plan/mod.rs @@ -28,7 +28,7 @@ pub mod tree_node; pub use builder::{ build_join_schema, table_scan, union, wrap_projection_for_join_if_necessary, - LogicalPlanBuilder, LogicalTableSource, UNNAMED_TABLE, + LogicalPlanBuilder, LogicalPlanBuilderOptions, LogicalTableSource, UNNAMED_TABLE, }; pub use ddl::{ CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 05782e6ecd75..e21def4c3941 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -38,7 +38,8 @@ use datafusion_expr::utils::{ }; use datafusion_expr::{ qualified_wildcard_with_options, wildcard_with_options, Aggregate, Expr, Filter, - GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, + GroupingSet, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions, + Partitioning, }; use indexmap::IndexMap; @@ -371,7 +372,10 @@ impl SqlToRel<'_, S> { let agg_expr = agg.aggr_expr.clone(); let (new_input, new_group_by_exprs) = self.try_process_group_by_unnest(agg)?; + let options = LogicalPlanBuilderOptions::new() + .with_add_implicit_group_by_exprs(true); LogicalPlanBuilder::from(new_input) + .with_options(options) .aggregate(new_group_by_exprs, agg_expr)? .build() } @@ -744,7 +748,10 @@ impl SqlToRel<'_, S> { aggr_exprs: &[Expr], ) -> Result<(LogicalPlan, Vec, Option)> { // create the aggregate plan + let options = + LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true); let plan = LogicalPlanBuilder::from(input.clone()) + .with_options(options) .aggregate(group_by_exprs.to_vec(), aggr_exprs.to_vec())? 
.build()?; let group_by_exprs = if let LogicalPlan::Aggregate(agg) = &plan { diff --git a/datafusion/substrait/tests/cases/logical_plans.rs b/datafusion/substrait/tests/cases/logical_plans.rs index 65f404bbda55..6f5899595548 100644 --- a/datafusion/substrait/tests/cases/logical_plans.rs +++ b/datafusion/substrait/tests/cases/logical_plans.rs @@ -91,4 +91,22 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn multilayer_aggregate() -> Result<()> { + let proto_plan = + read_json("tests/testdata/test_plans/multilayer_aggregate.substrait.json"); + let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; + let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; + + assert_eq!( + format!("{}", plan), + "Projection: lower(sales.product) AS lower(product), sum(count(sales.product)) AS product_count\ + \n Aggregate: groupBy=[[sales.product]], aggr=[[sum(count(sales.product))]]\ + \n Aggregate: groupBy=[[sales.product]], aggr=[[count(sales.product)]]\ + \n TableScan: sales" + ); + + Ok(()) + } } diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 68856117a38c..57363eb390ef 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -308,6 +308,17 @@ async fn aggregate_grouping_rollup() -> Result<()> { ).await } +#[tokio::test] +async fn multilayer_aggregate() -> Result<()> { + assert_expected_plan( + "SELECT a, sum(partial_count_b) FROM (SELECT a, count(b) as partial_count_b FROM data GROUP BY a) GROUP BY a", + "Aggregate: groupBy=[[data.a]], aggr=[[sum(count(data.b)) AS sum(partial_count_b)]]\ + \n Aggregate: groupBy=[[data.a]], aggr=[[count(data.b)]]\ + \n TableScan: data projection=[a, b]", + true + ).await +} + #[tokio::test] async fn decimal_literal() -> Result<()> { roundtrip("SELECT * FROM data WHERE b > 2.5").await diff --git a/datafusion/substrait/tests/testdata/test_plans/multilayer_aggregate.substrait.json b/datafusion/substrait/tests/testdata/test_plans/multilayer_aggregate.substrait.json new file mode 100644 index 000000000000..1f47b916daf0 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/multilayer_aggregate.substrait.json @@ -0,0 +1,213 @@ +{ + "extensionUris": [{ + "extensionUriAnchor": 1, + "uri": "/functions_aggregate_generic.yaml" + }, { + "extensionUriAnchor": 2, + "uri": "/functions_arithmetic.yaml" + }, { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "count:any" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "sum:i64" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "lower:str" + } + }], + "relations": [{ + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [2, 3] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "product" + ], + "struct": { + "types": [ + { + "string": { + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": [ + "sales" + ] + } + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { 
+ "field": 0 + } + }, + "rootReference": { + } + } + }], + "expressionReferences": [] + }], + "measures": [{ + "measure": { + "functionReference": 0, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }], + "groupingExpressions": [] + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }], + "expressionReferences": [] + }], + "measures": [{ + "measure": { + "functionReference": 1, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }], + "groupingExpressions": [] + } + }, + "expressions": [{ + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "string": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }] + } + }, + "names": ["lower(product)", "product_count"] + } + }], + "expectedTypeUrls": [] +} From b7d9c78a81867dfc0976315cd9b89c3387db440b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:54:55 -0500 Subject: [PATCH 71/71] chore(deps): bump uuid from 1.13.2 to 1.14.0 (#14866) Bumps [uuid](https://github.com/uuid-rs/uuid) from 1.13.2 to 1.14.0. - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/v1.13.2...v1.14.0) --- updated-dependencies: - dependency-name: uuid dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- datafusion-examples/Cargo.toml | 2 +- datafusion/core/Cargo.toml | 2 +- datafusion/functions/Cargo.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 22f06f9932a6..7fb752574695 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6505,9 +6505,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" +checksum = "93d59ca99a559661b96bf898d8fce28ed87935fd2bea9f05983c1464dd6c71b1" dependencies = [ "getrandom 0.3.1", "js-sys", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index feafa48b3954..f908a9c57b03 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -74,7 +74,7 @@ test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tonic = "0.12.1" url = { workspace = true } -uuid = "1.13" +uuid = "1.14" [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 69048f6a7cf4..9fba5c352b62 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -125,7 +125,7 @@ sqlparser = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } -uuid = { version = "1.13", features = ["v4", "js"] } +uuid = { version = "1.14", features = ["v4", "js"] } xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 3208f2dd169f..632db1385709 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -85,7 +85,7 @@ rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.13", features = ["v4"], optional = true } +uuid = { version = "1.14", features = ["v4"], optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] }