From f3bb84f7ce8a08e40afa14db525b98e69fd4bda4 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Wed, 14 Sep 2022 23:04:48 +0800 Subject: [PATCH 01/30] inlist: move type coercion to logical phase (#3472) --- datafusion/core/src/physical_plan/planner.rs | 32 ++++--- datafusion/optimizer/src/type_coercion.rs | 94 ++++++++++++++++++- .../physical-expr/src/expressions/in_list.rs | 60 +++++++++++- datafusion/physical-expr/src/planner.rs | 54 +---------- 4 files changed, 171 insertions(+), 69 deletions(-) diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 40109eda2f11..c87590059625 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1696,9 +1696,11 @@ mod tests { async fn plan(logical_plan: &LogicalPlan) -> Result> { let mut session_state = make_session_state(); session_state.config.target_partitions = 4; + // optimize the logical plan + let logical_plan = session_state.optimize(logical_plan)?; let planner = DefaultPhysicalPlanner::default(); planner - .create_physical_plan(logical_plan, &session_state) + .create_physical_plan(&logical_plan, &session_state) .await } @@ -1714,12 +1716,12 @@ mod tests { .limit(3, Some(10))? .build()?; - let plan = plan(&logical_plan).await?; + let exec_plan = plan(&logical_plan).await?; // verify that the plan correctly casts u8 to i64 // the cast here is implicit so has CastOptions with safe=true - let expected = "BinaryExpr { left: Column { name: \"c7\", index: 6 }, op: Lt, right: TryCastExpr { expr: Literal { value: UInt8(5) }, cast_type: Int64 } }"; - assert!(format!("{:?}", plan).contains(expected)); + let expected = "BinaryExpr { left: Column { name: \"c7\", index: 2 }, op: Lt, right: Literal { value: Int64(5) } }"; + assert!(format!("{:?}", exec_plan).contains(expected)); Ok(()) } @@ -1821,8 +1823,7 @@ mod tests { async fn test_with_zero_offset_plan() -> Result<()> { let logical_plan = test_csv_scan().await?.limit(0, None)?.build()?; let plan = plan(&logical_plan).await?; - assert!(format!("{:?}", plan).contains("GlobalLimitExec")); - assert!(format!("{:?}", plan).contains("skip: 0")); + assert!(format!("{:?}", plan).contains("limit: None")); Ok(()) } @@ -1952,8 +1953,8 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? .build()?; let execution_plan = plan(&logical_plan).await?; - // verify that the plan correctly adds cast from Int64(1) to Utf8 - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }], negated: false, set: None }"; + // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }], negated: false, set: None }"; assert!(format!("{:?}", execution_plan).contains(expected)); // expression: "a in (struct::null, 'a')" @@ -1965,10 +1966,9 @@ mod tests { .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c12").lt_eq(lit(0.025)).in_list(list, false)])? 
.build()?; - let execution_plan = plan(&logical_plan).await; + let e = plan(&logical_plan).await.unwrap_err().to_string(); - let e = execution_plan.unwrap_err().to_string(); - assert_contains!(&e, "Can not find compatible types to compare Boolean with [Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }]), Utf8]"); + assert_contains!(&e, "The data type inlist should be same, the value type is Boolean, one of list expr type is Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }])"); Ok(()) } @@ -1996,7 +1996,10 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? .build()?; let execution_plan = plan(&logical_plan).await?; - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: Some(InSet { set: "; + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") },"; + assert!(format!("{:?}", execution_plan).contains(expected)); + let expected = + "Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet { set: "; assert!(format!("{:?}", execution_plan).contains(expected)); Ok(()) } @@ -2015,7 +2018,10 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? 
.build()?; let execution_plan = plan(&logical_plan).await?; - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [TryCastExpr { expr: Literal { value: Int64(NULL) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: Some(InSet {"; + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(NULL) }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") }"; + assert!(format!("{:?}", execution_plan).contains(expected)); + let expected = + "Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet"; assert!(format!("{:?}", execution_plan).contains(expected)); Ok(()) } diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 72ee5d19adcd..6cb711dfc87c 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -124,7 +124,6 @@ impl ExprRewriter for TypeCoercionRewriter<'_> { right.clone().cast_to(&coerced_type, &self.schema)?, ), }; - expr.rewrite(&mut self.const_evaluator) } } @@ -164,11 +163,61 @@ impl ExprRewriter for TypeCoercionRewriter<'_> { }; expr.rewrite(&mut self.const_evaluator) } + Expr::InList { + expr, + list, + negated, + } => { + let expr_data_type = expr.get_type(&self.schema)?; + let list_data_types = list + .iter() + .map(|list_expr| list_expr.get_type(&self.schema)) + .collect::>>()?; + let result_type = + get_coerce_type_for_list(&expr_data_type, &list_data_types); + match 
result_type { + None => Err(DataFusionError::Plan(format!( + "Can not find compatible types to compare {:?} with {:?}", + expr_data_type, list_data_types + ))), + Some(coerced_type) => { + // find the coerced type + let cast_expr = expr.cast_to(&coerced_type, &self.schema)?; + let cast_list_expr = list + .into_iter() + .map(|list_expr| { + list_expr.cast_to(&coerced_type, &self.schema) + }) + .collect::>>()?; + let expr = Expr::InList { + expr: Box::new(cast_expr), + list: cast_list_expr, + negated, + }; + expr.rewrite(&mut self.const_evaluator) + } + } + } expr => Ok(expr), } } } +/// Attempts to coerce the types of `list_types` to be comparable with the +/// `expr_type`. +/// Returns the common data type for `expr_type` and `list_types` +fn get_coerce_type_for_list( + expr_type: &DataType, + list_types: &[DataType], +) -> Option { + list_types + .iter() + .fold(Some(expr_type.clone()), |left, right_type| match left { + None => None, + Some(left_type) => comparison_coercion(&left_type, right_type), + }) +} + /// Returns `expressions` coerced to types compatible with /// `signature`, if possible. /// @@ -348,6 +397,49 @@ mod test { Ok(()) } + #[test] + fn inlist_case() -> Result<()> { + // a in (1,4,8), a is int64 + let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); + let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new(None, "a", DataType::Int64, true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ), + })); + let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty, None)?); + let rule = TypeCoercion::new(); + let mut config = OptimizerConfig::default(); + let plan = rule.optimize(&plan, &mut config)?; + assert_eq!( + "Projection: #a IN ([Int64(1), Int64(4), Int64(8)])\n EmptyRelation", + &format!("{:?}", plan) + ); + // a in (1,4,8), a is decimal + let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); + let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new(None, "a", DataType::Decimal128(12, 4), true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ), + })); + let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty, None)?); + let plan = rule.optimize(&plan, &mut config)?; + assert_eq!( + "Projection: CAST(#a AS Decimal128(24, 4)) IN ([Decimal128(Some(10000),24,4), Decimal128(Some(40000),24,4), Decimal128(Some(80000),24,4)])\n EmptyRelation", + &format!("{:?}", plan) + ); + Ok(()) + } + fn empty() -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 1848473165ce..daad48193408 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -960,6 +960,17 @@ pub fn in_list( negated: &bool, schema: &Schema, ) -> Result> { + // check the data type + let expr_data_type = expr.data_type(schema)?; + for list_expr in list.iter() { + let list_expr_data_type = list_expr.data_type(schema)?; + if !expr_data_type.eq(&list_expr_data_type) { + return Err(DataFusionError::Internal(format!( + "The data type inlist should be same, the value type is {}, one of list expr type is {}", + expr_data_type, list_expr_data_type + ))); + } + } Ok(Arc::new(InListExpr::new(expr, list, 
*negated, schema))) } @@ -969,9 +980,54 @@ mod tests { use super::*; use crate::expressions; - use crate::expressions::{col, lit}; - use crate::planner::in_list_cast; + use crate::expressions::{col, lit, try_cast}; use datafusion_common::Result; + use datafusion_expr::binary_rule::comparison_coercion; + + type InListCastResult = (Arc, Vec>); + + // Try to do the type coercion for list physical expr. + // It's just used in the test + fn in_list_cast( + expr: Arc, + list: Vec>, + input_schema: &Schema, + ) -> Result { + let expr_type = &expr.data_type(input_schema)?; + let list_types: Vec = list + .iter() + .map(|list_expr| list_expr.data_type(input_schema).unwrap()) + .collect(); + let result_type = get_coerce_type(expr_type, &list_types); + match result_type { + None => Err(DataFusionError::Plan(format!( + "Can not find compatible types to compare {:?} with {:?}", + expr_type, list_types + ))), + Some(data_type) => { + // find the coerced type + let cast_expr = try_cast(expr, input_schema, data_type.clone())?; + let cast_list_expr = list + .into_iter() + .map(|list_expr| { + try_cast(list_expr, input_schema, data_type.clone()).unwrap() + }) + .collect(); + Ok((cast_expr, cast_list_expr)) + } + } + } + + // Attempts to coerce the types of `list_type` to be comparable with the + // `expr_type` + fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { + list_type + .iter() + .fold(Some(expr_type.clone()), |left, right_type| match left { + None => None, + Some(left_type) => comparison_coercion(&left_type, right_type), + }) + } // applies the in_list expr to an input batch and list macro_rules! in_list { diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 7e31dea0acb2..ba9664ef653d 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expressions::try_cast; use crate::var_provider::is_system_variables; use crate::{ execution_props::ExecutionProps, @@ -28,7 +27,6 @@ use crate::{ }; use arrow::datatypes::{DataType, Schema}; use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue}; -use datafusion_expr::binary_rule::comparison_coercion; use datafusion_expr::{binary_expr, Expr, Operator}; use std::sync::Arc; @@ -410,10 +408,7 @@ pub fn create_physical_expr( ) }) .collect::>>()?; - - let (cast_expr, cast_list_exprs) = - in_list_cast(value_expr, list_exprs, input_schema)?; - expressions::in_list(cast_expr, cast_list_exprs, negated, input_schema) + expressions::in_list(value_expr, list_exprs, negated, input_schema) } }, other => Err(DataFusionError::NotImplemented(format!( @@ -422,50 +417,3 @@ pub fn create_physical_expr( ))), } } - -type InListCastResult = (Arc, Vec>); - -pub(crate) fn in_list_cast( - expr: Arc, - list: Vec>, - input_schema: &Schema, -) -> Result { - let expr_type = &expr.data_type(input_schema)?; - let list_types: Vec = list - .iter() - .map(|list_expr| list_expr.data_type(input_schema).unwrap()) - .collect(); - let result_type = get_coerce_type(expr_type, &list_types); - match result_type { - None => Err(DataFusionError::Plan(format!( - "Can not find compatible types to compare {:?} with {:?}", - expr_type, list_types - ))), - Some(data_type) => { - // find the coerced type - let cast_expr = try_cast(expr, input_schema, data_type.clone())?; - let cast_list_expr = list - .into_iter() - .map(|list_expr| { - try_cast(list_expr, input_schema, data_type.clone()).unwrap() - }) - .collect(); - Ok((cast_expr, cast_list_expr)) - } - } -} - -/// Attempts to coerce the types of `list_type` to be comparable with the -/// `expr_type` -fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { - // get the equal coerced data type - list_type - .iter() - .fold(Some(expr_type.clone()), |left, right_type| { - match left { - None => None, - // TODO refactor a framework to do the data type coercion - Some(left_type) => comparison_coercion(&left_type, right_type), - } - }) -} From 916654032c02fb5697ab5622b3ce444e97fc64d2 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Thu, 15 Sep 2022 01:49:50 +0800 Subject: [PATCH 02/30] [minor] Remove unused arg in macro in Inlist (#3474) * remove unused arg in macro in Inlist * fix fmt --- .../physical-expr/src/expressions/in_list.rs | 90 ++++--------------- 1 file changed, 19 insertions(+), 71 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index daad48193408..ae49b4cec4ac 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -129,7 +129,7 @@ macro_rules! make_contains_primitive { } macro_rules! set_contains_for_float { - ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr, $PHY_TYPE:ty) => {{ + ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{ let contains_null = $SET_VALUES.iter().any(|s| s.is_null()); let bool_array = if $NEGATED { // Not in @@ -173,7 +173,7 @@ macro_rules! set_contains_for_float { } macro_rules! 
set_contains_for_primitive { - ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr, $PHY_TYPE:ty) => {{ + ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{ let contains_null = $SET_VALUES.iter().any(|s| s.is_null()); let native_set = $SET_VALUES .iter() @@ -538,59 +538,28 @@ impl PhysicalExpr for InListExpr { array, set, Boolean, - self.negated, - bool + self.negated )) } DataType::Int8 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int8, - self.negated, - i8 - )) + Ok(set_contains_for_primitive!(array, set, Int8, self.negated)) } DataType::Int16 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int16, - self.negated, - i16 - )) + Ok(set_contains_for_primitive!(array, set, Int16, self.negated)) } DataType::Int32 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int32, - self.negated, - i32 - )) + Ok(set_contains_for_primitive!(array, set, Int32, self.negated)) } DataType::Int64 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int64, - self.negated, - i64 - )) + Ok(set_contains_for_primitive!(array, set, Int64, self.negated)) } DataType::UInt8 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - UInt8, - self.negated, - u8 - )) + Ok(set_contains_for_primitive!(array, set, UInt8, self.negated)) } DataType::UInt16 => { let array = array.as_any().downcast_ref::().unwrap(); @@ -598,8 +567,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt16, - self.negated, - u16 + self.negated )) } DataType::UInt32 => { @@ -608,8 +576,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt32, - self.negated, - u32 + self.negated )) } DataType::UInt64 => { @@ -618,8 +585,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt64, - self.negated, - u64 + self.negated )) } DataType::Date32 => { @@ -628,8 +594,7 @@ impl PhysicalExpr for InListExpr { array, set, Date32, - self.negated, - i32 + self.negated )) } DataType::Date64 => { @@ -638,29 +603,16 @@ impl PhysicalExpr for InListExpr { array, set, Date64, - self.negated, - i64 + self.negated )) } DataType::Float32 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_float!( - array, - set, - Float32, - self.negated, - f32 - )) + Ok(set_contains_for_float!(array, set, Float32, self.negated)) } DataType::Float64 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_float!( - array, - set, - Float64, - self.negated, - f64 - )) + Ok(set_contains_for_float!(array, set, Float64, self.negated)) } DataType::Utf8 => { let array = array @@ -704,8 +656,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampSecond, - self.negated, - i64 + self.negated )) } TimeUnit::Millisecond => { @@ -717,8 +668,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampMillisecond, - self.negated, - i64 + self.negated )) } TimeUnit::Microsecond => { @@ -730,8 +680,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampMicrosecond, - self.negated, - i64 + self.negated )) } TimeUnit::Nanosecond => { @@ -743,8 +692,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampNanosecond, - self.negated, - i64 + self.negated )) } }, From 9d028b3ac3e1d9e702af088a7620826c58f3c115 Mon Sep 17 00:00:00 2001 From: Morgan Cassels Date: Wed, 14 Sep 2022 10:52:25 -0700 Subject: [PATCH 
03/30] add FixedSizeBinary support to create_hashes (#3458) * add FixedSizeBinary support to create_hashes * remove equality and inequality assertions about FixedSizeBinary hashes because feature force_hash_collisions changes hash values Co-authored-by: Morgan Cassels --- .../core/src/physical_plan/hash_utils.rs | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 315c3db3bf31..61e72b07c551 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -21,10 +21,10 @@ use crate::error::{DataFusionError, Result}; use ahash::RandomState; use arrow::array::{ Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array, - DictionaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + DictionaryArray, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::datatypes::{ ArrowDictionaryKeyType, ArrowNativeType, DataType, Int16Type, Int32Type, Int64Type, @@ -529,6 +529,16 @@ pub fn create_hashes<'a>( multi_col ); } + DataType::FixedSizeBinary(_) => { + hash_array!( + FixedSizeBinaryArray, + col, + &[u8], + hashes_buffer, + random_state, + multi_col + ); + } DataType::LargeBinary => { hash_array!( LargeBinaryArray, @@ -627,7 +637,7 @@ pub fn create_hashes<'a>( mod tests { use crate::from_slice::FromSlice; use arrow::{ - array::{BinaryArray, DictionaryArray}, + array::{BinaryArray, DictionaryArray, FixedSizeBinaryArray}, datatypes::Int8Type, }; use std::sync::Arc; @@ -682,6 +692,21 @@ mod tests { Ok(()) } + #[test] + fn create_hashes_fixed_size_binary() -> Result<()> { + let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]]; + let fixed_size_binary_array = + Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap()); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; fixed_size_binary_array.len()]; + let hashes = + create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 3,); + + Ok(()) + } + #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] From 5f029cc73755b6800217e370ebe0a7b5e8a6a224 Mon Sep 17 00:00:00 2001 From: Nitish Tiwari Date: Thu, 15 Sep 2022 02:24:13 +0530 Subject: [PATCH 04/30] Add Parseable as Datafusion user (#3471) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b468089e264a..cd92f71eaa8f 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Here are some of the projects known to use DataFusion: - [delta-rs](https://github.com/delta-io/delta-rs) Native Rust implementation of Delta Lake - [Flock](https://github.com/flock-lab/flock) - [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database +- [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform - [qv](https://github.com/timvw/qv) Quickly view your data - 
[ROAPI](https://github.com/roapi/roapi) - [Tensorbase](https://github.com/tensorbase/tensorbase) From 06f01eaf65b78d73ffc7fb0bf95619e1e884e850 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Thu, 15 Sep 2022 08:56:22 +0800 Subject: [PATCH 05/30] change the null type in the row filter (#3470) --- .../physical_plan/file_format/row_filter.rs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/row_filter.rs b/datafusion/core/src/physical_plan/file_format/row_filter.rs index 56bdba5577e9..473856b7b3d0 100644 --- a/datafusion/core/src/physical_plan/file_format/row_filter.rs +++ b/datafusion/core/src/physical_plan/file_format/row_filter.rs @@ -19,7 +19,7 @@ use arrow::array::{Array, BooleanArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::{ArrowError, Result as ArrowResult}; use arrow::record_batch::RecordBatch; -use datafusion_common::{Column, Result, ScalarValue, ToDFSchema}; +use datafusion_common::{Column, DataFusionError, Result, ScalarValue, ToDFSchema}; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}; use datafusion_expr::{uncombine_filter, Expr}; @@ -202,7 +202,18 @@ impl<'a> ExprRewriter for FilterCandidateBuilder<'a> { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::Column(Column { name, .. }) = &expr { if self.file_schema.field_with_name(name).is_err() { - return Ok(Expr::Literal(ScalarValue::Null)); + // the column expr must be in the table schema + return match self.table_schema.field_with_name(name) { + Ok(field) => { + // return the null value corresponding to the data type + let null_value = ScalarValue::try_from(field.data_type())?; + Ok(Expr::Literal(null_value)) + } + Err(e) => { + // If the column is not in the table schema, should throw the error + Err(DataFusionError::ArrowError(e)) + } + }; } } @@ -314,12 +325,13 @@ mod test { use crate::physical_plan::file_format::row_filter::FilterCandidateBuilder; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::ScalarValue; - use datafusion_expr::{col, lit}; + use datafusion_expr::{cast, col, lit}; use parquet::arrow::parquet_to_arrow_schema; use parquet::file::reader::{FileReader, SerializedFileReader}; // Assume a column expression for a column not in the table schema is a projected column and ignore it #[test] + #[should_panic(expected = "building candidate failed")] fn test_filter_candidate_builder_ignore_projected_columns() { let testdata = crate::test_util::parquet_test_data(); let file = std::fs::File::open(&format!("{}/alltypes_plain.parquet", testdata)) @@ -337,7 +349,7 @@ mod test { let candidate = FilterCandidateBuilder::new(expr, &table_schema, &table_schema) .build(metadata) - .expect("building candidate"); + .expect("building candidate failed"); assert!(candidate.is_none()); } @@ -386,8 +398,10 @@ mod test { Field::new("float_col", DataType::Float32, true), ]); - let expr = col("bigint_col").eq(col("int_col")); - let expected_candidate_expr = col("bigint_col").eq(lit(ScalarValue::Null)); + // The parquet file with `file_schema` just has `bigint_col` and `float_col` column, and don't have the `int_col` + let expr = col("bigint_col").eq(cast(col("int_col"), DataType::Int64)); + let expected_candidate_expr = + col("bigint_col").eq(cast(lit(ScalarValue::Int32(None)), DataType::Int64)); let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) .build(metadata) From f52d8aff2e7975403699268225bca0a9efbe7076 Mon Sep 17 00:00:00 2001 From: 
Andrew Lamb Date: Wed, 14 Sep 2022 21:44:35 -0400 Subject: [PATCH 06/30] minor: fix bug in `downcast_value!` macro (#3486) --- datafusion/common/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 2992596c564c..d9fd2503e958 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -43,10 +43,10 @@ macro_rules! downcast_value { })? }}; ($Value: expr, $Type: ident, $T: tt) => {{ - $Value.as_any().downcast_ref::<$Type>().ok_or_else(|| { + $Value.as_any().downcast_ref::<$Type<$T>>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", - type_name::<$Type>() + type_name::<$Type<$T>>() )) })? }}; From 84bee899958aaf70372ef84811c6787f53fa25eb Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 14 Sep 2022 21:44:57 -0400 Subject: [PATCH 07/30] make the function from_proto_binary_op public (#3490) Co-authored-by: askoa --- datafusion/proto/src/from_proto.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index 5402b03ce330..b82237788a50 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -1515,7 +1515,7 @@ fn unwrap_timezone(proto_value: &str) -> Option { } } -fn from_proto_binary_op(op: &str) -> Result { +pub fn from_proto_binary_op(op: &str) -> Result { match op { "And" => Ok(Operator::And), "Or" => Ok(Operator::Or), From 6ffda6885c7acbbddeb5d336116b035f9a9fa30b Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 15 Sep 2022 08:43:22 -0400 Subject: [PATCH 08/30] Add BitwiseXor in function from_proto_binary_op (#3496) --- datafusion/proto/src/from_proto.rs | 1 + datafusion/proto/src/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index b82237788a50..8c0124b9906b 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -1536,6 +1536,7 @@ pub fn from_proto_binary_op(op: &str) -> Result { "IsNotDistinctFrom" => Ok(Operator::IsNotDistinctFrom), "BitwiseAnd" => Ok(Operator::BitwiseAnd), "BitwiseOr" => Ok(Operator::BitwiseOr), + "BitwiseXor" => Ok(Operator::BitwiseXor), "BitwiseShiftLeft" => Ok(Operator::BitwiseShiftLeft), "BitwiseShiftRight" => Ok(Operator::BitwiseShiftRight), "RegexIMatch" => Ok(Operator::RegexIMatch), diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index cce778be2e24..ce2678025a88 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -858,6 +858,7 @@ mod roundtrip_tests { test(Operator::BitwiseShiftLeft); test(Operator::BitwiseAnd); test(Operator::BitwiseOr); + test(Operator::BitwiseXor); test(Operator::IsDistinctFrom); test(Operator::IsNotDistinctFrom); test(Operator::And); From 331922036a23265084ab091fa2ab5c57da42c841 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 16 Sep 2022 01:04:17 +0800 Subject: [PATCH 09/30] add time_zone into ConfigOptions (#3485) * add time_zone into ConfigOptions * fix debug leftover * Update datafusion/core/src/config.rs Co-authored-by: Kun Liu Co-authored-by: Kun Liu --- datafusion/core/src/config.rs | 34 ++++++++++++++++- .../core/tests/sql/information_schema.rs | 38 +++++++++++++++++++ datafusion/sql/src/planner.rs | 7 +++- 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/config.rs 
b/datafusion/core/src/config.rs index 3c828079e241..00ccdd523c1d 100644 --- a/datafusion/core/src/config.rs +++ b/datafusion/core/src/config.rs @@ -47,6 +47,9 @@ pub const OPT_COALESCE_TARGET_BATCH_SIZE: &str = pub const OPT_OPTIMIZER_SKIP_FAILED_RULES: &str = "datafusion.optimizer.skip_failed_rules"; +/// Configuration option "datafusion.execution.time_zone" +pub const OPT_TIME_ZONE: &str = "datafusion.execution.time_zone"; + /// Definition of a configuration option pub struct ConfigDefinition { /// key used to identifier this configuration option @@ -102,6 +105,20 @@ impl ConfigDefinition { ScalarValue::UInt64(Some(default_value)), ) } + + /// Create a configuration option definition with a string value + pub fn new_string( + key: impl Into, + description: impl Into, + default_value: String, + ) -> Self { + Self::new( + key, + description, + DataType::Utf8, + ScalarValue::Utf8(Some(default_value)), + ) + } } /// Contains definitions for all built-in configuration options @@ -167,7 +184,14 @@ impl BuiltInConfigs { messages if any optimization rules produce errors and then proceed to the next \ rule. When set to false, any rules that produce errors will cause the query to fail.", true - )], + ), + ConfigDefinition::new_string( + OPT_TIME_ZONE, + "The session time zone which some function require \ + e.g. EXTRACT(HOUR from SOME_TIME) shift the underline datetime according to the time zone, + then extract the hour", + "UTC".into() + )] } } @@ -279,6 +303,14 @@ impl ConfigOptions { } } + /// get a string configuration option + pub fn get_string(&self, key: &str) -> String { + match self.get(key) { + Some(ScalarValue::Utf8(Some(s))) => s, + _ => "".into(), + } + } + /// Access the underlying hashmap pub fn options(&self) -> &HashMap { &self.options diff --git a/datafusion/core/tests/sql/information_schema.rs b/datafusion/core/tests/sql/information_schema.rs index c9a5c9aa9cd5..854166155161 100644 --- a/datafusion/core/tests/sql/information_schema.rs +++ b/datafusion/core/tests/sql/information_schema.rs @@ -703,3 +703,41 @@ async fn show_all() { .len(); assert_eq!(expected_length, results[0].num_rows()); } + +#[tokio::test] +async fn show_time_zone_default_utc() { + // https://github.com/apache/arrow-datafusion/issues/3255 + let ctx = + SessionContext::with_config(SessionConfig::new().with_information_schema(true)); + let sql = "SHOW TIME ZONE"; + let results = plan_and_collect(&ctx, sql).await.unwrap(); + + let expected = vec![ + "+--------------------------------+---------+", + "| name | setting |", + "+--------------------------------+---------+", + "| datafusion.execution.time_zone | UTC |", + "+--------------------------------+---------+", + ]; + + assert_batches_eq!(expected, &results); +} + +#[tokio::test] +async fn show_timezone_default_utc() { + // https://github.com/apache/arrow-datafusion/issues/3255 + let ctx = + SessionContext::with_config(SessionConfig::new().with_information_schema(true)); + let sql = "SHOW TIMEZONE"; + let results = plan_and_collect(&ctx, sql).await.unwrap(); + + let expected = vec![ + "+--------------------------------+---------+", + "| name | setting |", + "+--------------------------------+---------+", + "| datafusion.execution.time_zone | UTC |", + "+--------------------------------+---------+", + ]; + + assert_batches_eq!(expected, &results); +} diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 0338d713a3c6..04518d81e457 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -2386,8 +2386,13 @@ 
impl<'a, S: ContextProvider> SqlToRel<'a, S> { )); } - let query = if variable.to_lowercase() == "all" { + let variable_lower = variable.to_lowercase(); + + let query = if variable_lower == "all" { String::from("SELECT name, setting FROM information_schema.df_settings") + } else if variable_lower == "timezone" || variable_lower == "time.zone" { + // we could introduce alias in OptionDefinition if this string matching thing grows + String::from("SELECT name, setting FROM information_schema.df_settings WHERE name = 'datafusion.execution.time_zone'") } else { format!( "SELECT name, setting FROM information_schema.df_settings WHERE name = '{}'", From 66dd253d2e0cdd00c8d3611f2ca470b9cde48abc Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 16 Sep 2022 15:55:44 +0800 Subject: [PATCH 10/30] MINOR: remove unused dependencies (#3508) * MINOR: remove unused dependencies Signed-off-by: Ruihang Xia * revert rand * ad newline Signed-off-by: Ruihang Xia Signed-off-by: Ruihang Xia --- datafusion/common/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 72ffbb22b144..028a4874a8bc 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -46,5 +46,4 @@ object_store = { version = "0.5.0", optional = true } ordered-float = "3.0" parquet = { version = "22.0.0", features = ["arrow"], optional = true } pyo3 = { version = "0.17.1", optional = true } -serde_json = "1.0" sqlparser = "0.23" From 25c029c9ec275f53edf29189a0c94a88ad0c9618 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Sep 2022 04:45:16 -0600 Subject: [PATCH 11/30] prettier (#3504) --- dev/release/README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/dev/release/README.md b/dev/release/README.md index 3dadf615b85e..06023ce08c3f 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -307,21 +307,20 @@ dot -Tsvg dev/release/crate-deps.dot > dev/release/crate-deps.svg (cd datafusion/optimizer && cargo publish) (cd datafusion/core && cargo publish) (cd datafusion/proto && cargo publish) -(cd datafusion-cli && cargo publish) ``` -### Publish datafusion-cli on Homebrew and crates.io +The CLI needs a `--no-verify` argument because `build.rs` generates source into the `src` directory. + +```shell +(cd datafusion-cli && cargo publish --no-verify) +``` + +### Publish datafusion-cli on Homebrew For Homebrew, Send a simple PR to update tag and commit hash for the datafusion formula in homebrew-core. Here is an example PR: https://github.com/Homebrew/homebrew-core/pull/89562. -For crates.io, run - -```shell -(cd datafusion-cli && cargo publish) -``` - ### Call the vote Call the vote on the Arrow dev list by replying to the RC voting thread. The @@ -389,7 +388,17 @@ Delete a release: svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist/release/arrow/arrow-datafusion-7.0.0 ``` -### Write a blog post announcing the release +### Publish the User Guide to the Arrow Site + +- Run the `build.sh` in the `docs` directory from the release tarball. +- Clone the [arrow-site](https://github.com/apache/arrow-site) repository +- Checkout the `asf-site` branch +- Copy content from `docs/build/html/*` to the `datafusion` directory in arrow-site +- Create a PR against the `asf-site` branch ([example](https://github.com/apache/arrow-site/pull/237)) +- Once the PR is merged, the content will be published to https://arrow.apache.org/datafusion/ by GitHub Pages (this + can take some time). 
+ +### Optional: Write a blog post announcing the release We typically crowdsource release announcements by collaborating on a Google document, usually starting with a copy of the previous release announcement. From c59a76754e7e45063cb51c273f2a53aa325676a4 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 16 Sep 2022 10:03:08 -0400 Subject: [PATCH 12/30] Add additional DATE_PART units (#3503) * Add dow support to date_part * Add doy support to date_part * Add quarter support to date_part --- datafusion/core/tests/sql/expr.rs | 15 +++++++++++++++ .../physical-expr/src/datetime_expressions.rs | 3 +++ 2 files changed, 18 insertions(+) diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 3ca2c4738f45..a5288d92a464 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -1233,6 +1233,11 @@ async fn test_extract_date_part() -> Result<()> { "EXTRACT(year FROM to_timestamp('2020-09-08T12:00:00+00:00'))", "2020" ); + test_expression!("date_part('QUARTER', CAST('2000-01-01' AS DATE))", "1"); + test_expression!( + "EXTRACT(quarter FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "3" + ); test_expression!("date_part('MONTH', CAST('2000-01-01' AS DATE))", "1"); test_expression!( "EXTRACT(month FROM to_timestamp('2020-09-08T12:00:00+00:00'))", @@ -1248,6 +1253,16 @@ async fn test_extract_date_part() -> Result<()> { "EXTRACT(day FROM to_timestamp('2020-09-08T12:00:00+00:00'))", "8" ); + test_expression!("date_part('DOY', CAST('2000-01-01' AS DATE))", "1"); + test_expression!( + "EXTRACT(doy FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "252" + ); + test_expression!("date_part('DOW', CAST('2000-01-01' AS DATE))", "6"); + test_expression!( + "EXTRACT(dow FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "2" + ); test_expression!("date_part('HOUR', CAST('2000-01-01' AS DATE))", "0"); test_expression!( "EXTRACT(hour FROM to_timestamp('2020-09-08T12:03:03+00:00'))", diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index a702273790f3..0214d1bfb9be 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -449,9 +449,12 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { let arr = match date_part.to_lowercase().as_str() { "year" => extract_date_part!(array, temporal::year), + "quarter" => extract_date_part!(array, temporal::quarter), "month" => extract_date_part!(array, temporal::month), "week" => extract_date_part!(array, temporal::week), "day" => extract_date_part!(array, temporal::day), + "doy" => extract_date_part!(array, temporal::doy), + "dow" => extract_date_part!(array, temporal::num_days_from_sunday), "hour" => extract_date_part!(array, temporal::hour), "minute" => extract_date_part!(array, temporal::minute), "second" => extract_date_part!(array, temporal::second), From 4380f355e4102e937b26d13645891bf69deca995 Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Fri, 16 Sep 2022 14:37:12 -0400 Subject: [PATCH 13/30] Make FileStream and FileOpener public (#3514) --- datafusion/core/src/physical_plan/file_format/file_stream.rs | 5 +++++ datafusion/core/src/physical_plan/file_format/mod.rs | 1 + 2 files changed, 6 insertions(+) diff --git a/datafusion/core/src/physical_plan/file_format/file_stream.rs b/datafusion/core/src/physical_plan/file_format/file_stream.rs index 1a54d685228c..609e3b3a9b20 100644 --- 
a/datafusion/core/src/physical_plan/file_format/file_stream.rs +++ b/datafusion/core/src/physical_plan/file_format/file_stream.rs @@ -51,7 +51,11 @@ use crate::physical_plan::RecordBatchStream; pub type FileOpenFuture = BoxFuture<'static, Result>>>; +/// Generic API for opening a file using an [`ObjectStore`] and resolving to a +/// stream of [`RecordBatch`] pub trait FileOpener: Unpin { + /// Asynchronously open the specified file and return a stream + /// of [`RecordBatch`] fn open( &self, store: Arc, @@ -167,6 +171,7 @@ impl FileStreamMetrics { } impl FileStream { + /// Create a new `FileStream` using the give `FileOpener` to scan underlying files pub fn new( config: &FileScanConfig, partition: usize, diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 707ed039c26c..2d59570078bc 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -39,6 +39,7 @@ use arrow::{ record_batch::RecordBatch, }; pub use avro::AvroExec; +pub use file_stream::{FileOpenFuture, FileOpener, FileStream}; pub(crate) use json::plan_to_json; pub use json::NdJsonExec; From c6ebf7bcc73067fa285af847da3bf0c5c04c270a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Sep 2022 21:49:39 -0600 Subject: [PATCH 14/30] MINOR: Add more execs to list of supported execs (#3519) * prettier * add more execs to docs * Revert * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/core/src/lib.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 790f03411127..74782359ec8e 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -161,11 +161,17 @@ //! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec) //! * Filter: [`FilterExec`](physical_plan::filter::FilterExec) //! * Grouped and non-grouped aggregations: [`AggregateExec`](physical_plan::aggregates::AggregateExec) +//! * Hash Join: [`HashJoinExec`](physical_plan::hash_join::HashJoinExec) +//! * Cross Join: [`CrossJoinExec`](physical_plan::cross_join::CrossJoinExec) +//! * Sort Merge Join: [`SortMergeJoinExec`](physical_plan::sort_merge_join::SortMergeJoinExec) +//! * Union: [`UnionExec`](physical_plan::union::UnionExec) //! * Sort: [`SortExec`](physical_plan::sorts::sort::SortExec) //! * Coalesce partitions: [`CoalescePartitionsExec`](physical_plan::coalesce_partitions::CoalescePartitionsExec) //! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec) -//! * Scan a CSV: [`CsvExec`](physical_plan::file_format::CsvExec) -//! * Scan a Parquet: [`ParquetExec`](physical_plan::file_format::ParquetExec) +//! * Scan CSV: [`CsvExec`](physical_plan::file_format::CsvExec) +//! * Scan Parquet: [`ParquetExec`](physical_plan::file_format::ParquetExec) +//! * Scan Avro: [`AvroExec`](physical_plan::file_format::AvroExec) +//! * Scan newline-delimited JSON: [`NdJsonExec`](physical_plan::file_format::NdJsonExec) //! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec) //! 
* Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec) //! From b106b02a362a011bd8666cd2ae146fb6cfa7c4e4 Mon Sep 17 00:00:00 2001 From: Kirk Mitchener Date: Sat, 17 Sep 2022 06:46:16 -0400 Subject: [PATCH 15/30] fix divide by zero not throwing proper error for decimal (#3517) * fix divide by zero for decimal, add tests * check for actual error strings * move scalar divide by zero check out of the loop. fix precision in test. --- .../src/expressions/binary/kernels_arrow.rs | 52 ++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs index e99fbf0c9630..8caed7191fb2 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs @@ -378,6 +378,9 @@ pub(crate) fn divide_decimal( ) -> Result { let mul = 10_f64.powi(left.scale() as i32); let array = arith_decimal(left, right, |left, right| { + if right == 0 { + return Err(DataFusionError::ArrowError(ArrowError::DivideByZero)); + } let l_value = left as f64; let r_value = right as f64; let result = ((l_value / r_value) * mul) as i128; @@ -391,6 +394,9 @@ pub(crate) fn divide_decimal_scalar( left: &Decimal128Array, right: i128, ) -> Result { + if right == 0 { + return Err(DataFusionError::ArrowError(ArrowError::DivideByZero)); + } let mul = 10_f64.powi(left.scale() as i32); let array = arith_decimal_scalar(left, right, |left, right| { let l_value = left as f64; @@ -615,34 +621,66 @@ mod tests { assert_eq!(expect, result); // divide let left_decimal_array = create_decimal_array( - &[Some(1234567), None, Some(1234567), Some(1234567)], + &[ + Some(1234567), + None, + Some(1234567), + Some(1234567), + Some(1234567), + ], + 25, + 3, + ); + let right_decimal_array = create_decimal_array( + &[Some(10), Some(100), Some(55), Some(-123), None], 25, 3, ); - let right_decimal_array = - create_decimal_array(&[Some(10), Some(100), Some(55), Some(-123)], 25, 3); let result = divide_decimal(&left_decimal_array, &right_decimal_array)?; let expect = create_decimal_array( - &[Some(123456700), None, Some(22446672), Some(-10037130)], + &[Some(123456700), None, Some(22446672), Some(-10037130), None], 25, 3, ); assert_eq!(expect, result); let result = divide_decimal_scalar(&left_decimal_array, 10)?; let expect = create_decimal_array( - &[Some(123456700), None, Some(123456700), Some(123456700)], + &[ + Some(123456700), + None, + Some(123456700), + Some(123456700), + Some(123456700), + ], 25, 3, ); assert_eq!(expect, result); // modulus let result = modulus_decimal(&left_decimal_array, &right_decimal_array)?; - let expect = create_decimal_array(&[Some(7), None, Some(37), Some(16)], 25, 3); + let expect = + create_decimal_array(&[Some(7), None, Some(37), Some(16), None], 25, 3); assert_eq!(expect, result); let result = modulus_decimal_scalar(&left_decimal_array, 10)?; - let expect = create_decimal_array(&[Some(7), None, Some(7), Some(7)], 25, 3); + let expect = + create_decimal_array(&[Some(7), None, Some(7), Some(7), Some(7)], 25, 3); assert_eq!(expect, result); Ok(()) } + + #[test] + fn arithmetic_decimal_divide_by_zero() { + let left_decimal_array = create_decimal_array(&[Some(101)], 10, 1); + let right_decimal_array = create_decimal_array(&[Some(0)], 1, 1); + + let err = divide_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + 
let err = divide_decimal_scalar(&left_decimal_array, 0).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + let err = modulus_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + let err = modulus_decimal_scalar(&left_decimal_array, 0).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + } } From 12f047e365a56399bf6fc1feaf25708c2afc5ba7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Sep 2022 07:00:07 -0400 Subject: [PATCH 16/30] Change `downcast_any!` macro so it does not need to use `use std::any::type_name;` (#3484) --- datafusion/common/src/lib.rs | 2 ++ datafusion/physical-expr/src/aggregate/approx_distinct.rs | 1 - .../physical-expr/src/aggregate/approx_percentile_cont.rs | 6 +----- datafusion/physical-expr/src/aggregate/average.rs | 1 - datafusion/physical-expr/src/aggregate/count.rs | 1 - datafusion/physical-expr/src/aggregate/covariance.rs | 1 - datafusion/physical-expr/src/aggregate/min_max.rs | 2 +- datafusion/physical-expr/src/aggregate/sum.rs | 2 +- datafusion/physical-expr/src/aggregate/variance.rs | 2 +- 9 files changed, 6 insertions(+), 12 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index d9fd2503e958..48fd792cf9fe 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -35,6 +35,7 @@ pub use scalar::{ScalarType, ScalarValue}; #[macro_export] macro_rules! downcast_value { ($Value: expr, $Type: ident) => {{ + use std::any::type_name; $Value.as_any().downcast_ref::<$Type>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", @@ -43,6 +44,7 @@ macro_rules! downcast_value { })? }}; ($Value: expr, $Type: ident, $T: tt) => {{ + use std::any::type_name; $Value.as_any().downcast_ref::<$Type<$T>>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", diff --git a/datafusion/physical-expr/src/aggregate/approx_distinct.rs b/datafusion/physical-expr/src/aggregate/approx_distinct.rs index 7252ac122f44..7302008ba825 100644 --- a/datafusion/physical-expr/src/aggregate/approx_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/approx_distinct.rs @@ -31,7 +31,6 @@ use arrow::datatypes::{ use datafusion_common::{downcast_value, ScalarValue}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{Accumulator, AggregateState}; -use std::any::type_name; use std::any::Any; use std::convert::TryFrom; use std::convert::TryInto; diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index b2457887b481..6dfa24746305 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -31,11 +31,7 @@ use datafusion_common::Result; use datafusion_common::{downcast_value, ScalarValue}; use datafusion_expr::{Accumulator, AggregateState}; use ordered_float::OrderedFloat; -use std::{ - any::{type_name, Any}, - iter, - sync::Arc, -}; +use std::{any::Any, iter, sync::Arc}; /// APPROX_PERCENTILE_CONT aggregate expression #[derive(Debug)] diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 623ccbb32b6b..501e62456414 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -17,7 +17,6 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 78a86a3cdcf7..96a8febd043d 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -17,7 +17,6 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/covariance.rs b/datafusion/physical-expr/src/aggregate/covariance.rs index d621b3bad4a0..c8488162042f 100644 --- a/datafusion/physical-expr/src/aggregate/covariance.rs +++ b/datafusion/physical-expr/src/aggregate/covariance.rs @@ -17,7 +17,6 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 3f2b698b8840..bdccdf522207 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -17,7 +17,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 0841cb6db9ff..d639a1170815 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -17,7 +17,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/variance.rs b/datafusion/physical-expr/src/aggregate/variance.rs index dde85f414266..07f8a34ce41d 100644 --- a/datafusion/physical-expr/src/aggregate/variance.rs +++ b/datafusion/physical-expr/src/aggregate/variance.rs @@ -17,7 +17,7 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::sync::Arc; use crate::aggregate::stats::StatsType; From 86a8236627577f19b175a0d8fe76ea982ef71254 Mon Sep 17 00:00:00 2001 From: Francis Du Date: Sun, 18 Sep 2022 23:32:04 +0800 Subject: [PATCH 17/30] [DataFrame] - Add cache function for DataFrame (#3512) * feat: add cache function for dataframe * fix: function doc typo * fix: test issue --- datafusion/core/src/dataframe.rs | 65 ++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index 9be7138dca15..a1b43cf531f1 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -21,7 +21,7 @@ use crate::arrow::datatypes::Schema; use crate::arrow::datatypes::SchemaRef; use crate::arrow::record_batch::RecordBatch; use crate::arrow::util::pretty; -use crate::datasource::TableProvider; +use crate::datasource::{MemTable, TableProvider}; use crate::error::Result; use crate::execution::{ context::{SessionState, TaskContext}, @@ -35,6 +35,7 @@ use crate::physical_plan::file_format::{plan_to_csv, plan_to_json, plan_to_parqu use crate::physical_plan::SendableRecordBatchStream; use crate::physical_plan::{collect, collect_partitioned}; use crate::physical_plan::{execute_stream, execute_stream_partitioned, ExecutionPlan}; +use crate::prelude::SessionContext; use crate::scalar::ScalarValue; use async_trait::async_trait; use parking_lot::RwLock; @@ -733,6 +734,29 @@ impl DataFrame { ))) } } + + /// Cache DataFrame as a memory table. + /// + /// ``` + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let ctx = SessionContext::new(); + /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?; + /// let df = df.cache().await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn cache(&self) -> Result> { + let mem_table = MemTable::try_new( + SchemaRef::from(self.schema().clone()), + self.collect_partitioned().await?, + )?; + + SessionContext::with_state(self.session_state.read().clone()) + .read_table(Arc::new(mem_table)) + } } // TODO: This will introduce a ref cycle (#2659) @@ -1082,6 +1106,7 @@ mod tests { ); Ok(()) } + /// Compare the formatted string representation of two plans for equality fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) { assert_eq!(format!("{:?}", plan1), format!("{:?}", plan2)); @@ -1265,7 +1290,7 @@ mod tests { \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1\ \n TableScan: t2", - format!("{:?}", df_renamed.to_unoptimized_plan()) + format!("{:?}", df_renamed.to_unoptimized_plan()) ); assert_eq!("\ @@ -1275,7 +1300,7 @@ mod tests { \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1 projection=[c1, c2, c3]\ \n TableScan: t2 projection=[c1, c2, c3]", - format!("{:?}", df_renamed.to_logical_plan()?) + format!("{:?}", df_renamed.to_logical_plan()?) ); let df_results = df_renamed.collect().await?; @@ -1303,6 +1328,7 @@ mod tests { .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?; let df_results = df.collect().await?; + df.show().await?; assert_batches_sorted_eq!( vec![ "+----+----+-----+", @@ -1353,4 +1379,37 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn cache_test() -> Result<()> { + let df = test_table() + .await? + .select_columns(&["c2", "c3"])? + .limit(0, Some(1))? 
+ .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?; + + let cached_df = df.cache().await?; + + assert_eq!( + "TableScan: ?table? projection=[c2, c3, sum]", + format!("{:?}", cached_df.to_logical_plan()?) + ); + + let df_results = df.collect().await?; + let cached_df_results = cached_df.collect().await?; + assert_batches_sorted_eq!( + vec![ + "+----+----+-----+", + "| c2 | c3 | sum |", + "+----+----+-----+", + "| 2 | 1 | 3 |", + "+----+----+-----+", + ], + &cached_df_results + ); + + assert_eq!(&df_results, &cached_df_results); + + Ok(()) + } } From 9b221001ee1bc0e91b54f5145a8423db865c6f34 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Sep 2022 01:57:44 -0400 Subject: [PATCH 18/30] MINOR: reduce pub in OptimizerConfig (#3525) --- datafusion/core/src/execution/context.rs | 18 ++++++----- datafusion/optimizer/src/optimizer.rs | 31 ++++++++++++++++--- .../optimizer/src/simplify_expressions.rs | 10 +++--- datafusion/optimizer/src/type_coercion.rs | 2 +- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index cb0e16f545ea..947a2adb08e0 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -1564,14 +1564,16 @@ impl SessionState { /// Optimizes the logical plan by applying optimizer rules. pub fn optimize(&self, plan: &LogicalPlan) -> Result { - let mut optimizer_config = OptimizerConfig::new().with_skip_failing_rules( - self.config - .config_options - .read() - .get_bool(OPT_OPTIMIZER_SKIP_FAILED_RULES), - ); - optimizer_config.query_execution_start_time = - self.execution_props.query_execution_start_time; + let mut optimizer_config = OptimizerConfig::new() + .with_skip_failing_rules( + self.config + .config_options + .read() + .get_bool(OPT_OPTIMIZER_SKIP_FAILED_RULES), + ) + .with_query_execution_start_time( + self.execution_props.query_execution_start_time, + ); if let LogicalPlan::Explain(e) = plan { let mut stringified_plans = e.stringified_plans.clone(); diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 9170035b7d6e..e2ccd4944892 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -38,12 +38,15 @@ pub trait OptimizerRule { fn name(&self) -> &str; } -/// Placeholder for optimizer configuration options +/// Options to control the DataFusion Optimizer. 
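+/// +/// A minimal, illustrative usage sketch of the builder API added in this patch (not part of the original diff; assumes `chrono::Utc` is in scope): +/// +/// ```ignore +/// let config = OptimizerConfig::new() +/// .with_skip_failing_rules(true) +/// .with_query_execution_start_time(chrono::Utc::now()); +/// ```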
#[derive(Debug)] pub struct OptimizerConfig { - /// Query execution start time that can be used to rewrite expressions such as `now()` - /// to use a literal value instead - pub query_execution_start_time: DateTime<Utc>, + /// Query execution start time that can be used to rewrite + /// expressions such as `now()` to use a literal value instead + query_execution_start_time: DateTime<Utc>, + /// id generator for optimizer passes + // TODO this should not be on the config, + // it should be its own 'OptimizerState' or something next_id: usize, /// Option to skip rules that produce errors skip_failing_rules: bool, } @@ -59,16 +62,34 @@ impl OptimizerConfig { } } - /// Specify whether the optimizer should skip rules that produce errors, or fail the query + /// Specify the query execution start time to use while rewriting + /// expressions such as `now()` + pub fn with_query_execution_start_time( + mut self, + query_execution_start_time: DateTime<Utc>, + ) -> Self { + self.query_execution_start_time = query_execution_start_time; + self + } + + /// Specify whether the optimizer should skip rules that produce + /// errors, or fail the query pub fn with_skip_failing_rules(mut self, b: bool) -> Self { self.skip_failing_rules = b; self } + /// Generate the next ID needed pub fn next_id(&mut self) -> usize { self.next_id += 1; self.next_id } + + /// Return the time at which the query execution started. This + /// time is used as the value for now() + pub fn query_execution_start_time(&self) -> DateTime<Utc> { + self.query_execution_start_time + } } impl Default for OptimizerConfig { diff --git a/datafusion/optimizer/src/simplify_expressions.rs b/datafusion/optimizer/src/simplify_expressions.rs index aa87c5318ae4..a9c0d4b1f4c8 100644 --- a/datafusion/optimizer/src/simplify_expressions.rs +++ b/datafusion/optimizer/src/simplify_expressions.rs @@ -260,7 +260,7 @@ impl OptimizerRule for SimplifyExpressions { ) -> Result<LogicalPlan> { let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = - optimizer_config.query_execution_start_time; + optimizer_config.query_execution_start_time(); self.optimize_internal(plan, &execution_props) } } @@ -1887,8 +1887,8 @@ mod tests { // expect optimizing will result in an error, returning the error string fn get_optimized_plan_err(plan: &LogicalPlan, date_time: &DateTime<Utc>) -> String { - let mut config = OptimizerConfig::new(); - config.query_execution_start_time = *date_time; + let mut config = + OptimizerConfig::new().with_query_execution_start_time(*date_time); let rule = SimplifyExpressions::new(); let err = rule @@ -1902,8 +1902,8 @@ mod tests { plan: &LogicalPlan, date_time: &DateTime<Utc>, ) -> String { - let mut config = OptimizerConfig::new(); - config.query_execution_start_time = *date_time; + let mut config = + OptimizerConfig::new().with_query_execution_start_time(*date_time); let rule = SimplifyExpressions::new(); let optimized_plan = rule diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 6cb711dfc87c..9d8e97f36a71 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -68,7 +68,7 @@ impl OptimizerRule for TypeCoercion { let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = - optimizer_config.query_execution_start_time; + optimizer_config.query_execution_start_time(); let const_evaluator = ConstEvaluator::try_new(&execution_props)?; let mut expr_rewrite = TypeCoercionRewriter { From 3a9e0d0e676836f9a9e23280a1b19b111024bf5d
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 19 Sep 2022 14:48:01 +0200 Subject: [PATCH 19/30] Execute sort in parallel when a limit is used after sort (#3527) * Parallel sort * Move it to optimization rule * Add rule * Improve rule * Remove bench * Fix doc * Fix indent --- datafusion/core/src/execution/context.rs | 3 + datafusion/core/src/physical_optimizer/mod.rs | 1 + .../src/physical_optimizer/parallel_sort.rs | 92 +++++++++++++++++++ datafusion/core/src/physical_plan/planner.rs | 2 +- .../core/src/physical_plan/sorts/sort.rs | 25 ++--- datafusion/core/tests/sql/explain_analyze.rs | 4 +- 6 files changed, 107 insertions(+), 20 deletions(-) create mode 100644 datafusion/core/src/physical_optimizer/parallel_sort.rs diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 947a2adb08e0..492782f53462 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -40,6 +40,7 @@ use crate::{ physical_optimizer::{ aggregate_statistics::AggregateStatistics, hash_build_probe_order::HashBuildProbeOrder, optimizer::PhysicalOptimizerRule, + parallel_sort::ParallelSort, }, }; pub use datafusion_physical_expr::execution_props::ExecutionProps; @@ -1469,6 +1470,8 @@ impl SessionState { .unwrap(), ))); } + physical_optimizers.push(Arc::new(ParallelSort::new())); + physical_optimizers.push(Arc::new(Repartition::new())); physical_optimizers.push(Arc::new(AddCoalescePartitionsExec::new())); diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index 55550bcd2cff..82b7087ab2c5 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -23,6 +23,7 @@ pub mod coalesce_batches; pub mod hash_build_probe_order; pub mod merge_exec; pub mod optimizer; +pub mod parallel_sort; pub mod pruning; pub mod repartition; mod utils; diff --git a/datafusion/core/src/physical_optimizer/parallel_sort.rs b/datafusion/core/src/physical_optimizer/parallel_sort.rs new file mode 100644 index 000000000000..3361d8155f7f --- /dev/null +++ b/datafusion/core/src/physical_optimizer/parallel_sort.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Parallel sort parallelizes sorts if a limit is present after a sort (`ORDER BY LIMIT N`) +use crate::{ + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + limit::GlobalLimitExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + with_new_children_if_necessary, ExecutionPlan, + }, +}; +use std::sync::Arc; + +/// Optimizer rule that makes sort parallel if a limit is used after sort (`ORDER BY LIMIT N`). +/// The plan will use `SortPreservingMergeExec` to merge the results +#[derive(Default)] +pub struct ParallelSort {} + +impl ParallelSort { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} +impl PhysicalOptimizerRule for ParallelSort { + fn optimize( + &self, + plan: Arc<dyn ExecutionPlan>, + config: &crate::execution::context::SessionConfig, + ) -> Result<Arc<dyn ExecutionPlan>> { + if plan.children().is_empty() { + // leaf node, children cannot be replaced + Ok(plan.clone()) + } else { + // recurse down first + let children = plan + .children() + .iter() + .map(|child| self.optimize(child.clone(), config)) + .collect::<Result<Vec<_>>>()?; + let plan = with_new_children_if_necessary(plan, children)?; + let children = plan.children(); + let plan_any = plan.as_any(); + // rewrite GlobalLimitExec (SortExec preserve_partitioning=False) + // -> GlobalLimitExec (SortPreservingMergeExec (SortExec preserve_partitioning=True)) + let parallel_sort = plan_any.downcast_ref::<GlobalLimitExec>().is_some() + && children.len() == 1 + && children[0].as_any().downcast_ref::<SortExec>().is_some() + && !children[0] + .as_any() + .downcast_ref::<SortExec>() + .unwrap() + .preserve_partitioning(); + + Ok(if parallel_sort { + let sort = children[0].as_any().downcast_ref::<SortExec>().unwrap(); + let new_sort = SortExec::new_with_partitioning( + sort.expr().to_vec(), + sort.input().clone(), + true, + ); + let merge = SortPreservingMergeExec::new( + sort.expr().to_vec(), + Arc::new(new_sort), + ); + with_new_children_if_necessary(plan, vec![Arc::new(merge)])? + } else { + plan.clone() + }) + } + } + + fn name(&self) -> &str { + "parallel_sort" + } +} diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index c87590059625..20a819622d5d 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -841,7 +841,7 @@ impl DefaultPhysicalPlanner { )), }) .collect::<Result<Vec<_>>>()?; - Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)
) + Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)) } LogicalPlan::Join(Join { left, diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 64312327b483..cc0501785324 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -668,6 +668,11 @@ impl SortExec { Ok(Self::new_with_partitioning(expr, input, false)) } + /// Whether this `SortExec` preserves partitioning of the children + pub fn preserve_partitioning(&self) -> bool { + self.preserve_partitioning + } + /// Create a new sort execution plan with the option to preserve /// the partitioning of the input plan pub fn new_with_partitioning( @@ -741,10 +746,11 @@ impl ExecutionPlan for SortExec { self: Arc, children: Vec>, ) -> Result> { - Ok(Arc::new(SortExec::try_new( + Ok(Arc::new(SortExec::new_with_partitioning( self.expr.clone(), children[0].clone(), - )?)) + self.preserve_partitioning, + ))) } fn execute( @@ -753,21 +759,6 @@ impl ExecutionPlan for SortExec { context: Arc, ) -> Result { debug!("Start SortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); - if !self.preserve_partitioning { - if 0 != partition { - return Err(DataFusionError::Internal(format!( - "SortExec invalid partition {}", - partition - ))); - } - - // sort needs to operate on a single partition currently - if 1 != self.input.output_partitioning().partition_count() { - return Err(DataFusionError::Internal( - "SortExec requires a single input partition".to_owned(), - )); - } - } debug!( "Start invoking SortExec's input.execute for partition: {}", diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 7f465c4c697f..a75e0e3fa515 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -686,8 +686,8 @@ async fn test_physical_plan_display_indent() { let physical_plan = ctx.create_physical_plan(&plan).await.unwrap(); let expected = vec![ "GlobalLimitExec: skip=0, fetch=10", - " SortExec: [the_min@2 DESC]", - " CoalescePartitionsExec", + " SortPreservingMergeExec: [the_min@2 DESC]", + " SortExec: [the_min@2 DESC]", " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)@2 as the_min]", " AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)]", " CoalesceBatchesExec: target_batch_size=4096", From a6fbf24ddb81e34258a940a48e371b1efe5f6e49 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Sep 2022 14:13:54 -0400 Subject: [PATCH 20/30] Add support for `ScalarValue::Dictionary` to datafusion-proto (#3532) --- datafusion/proto/proto/datafusion.proto | 7 +++++ datafusion/proto/src/from_proto.rs | 16 +++++++++++ datafusion/proto/src/lib.rs | 15 ++++++++-- datafusion/proto/src/to_proto.rs | 38 +++++++++++++++++++++++-- 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index baabc04cfff7..0e499ee48ffc 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -726,6 +726,12 @@ message ScalarTimestampValue { string timezone = 5; } +message ScalarDictionaryValue { + ArrowType index_type = 1; + ScalarValue value = 2; +} + + message ScalarValue{ oneof value { bool bool_value = 1; @@ -752,6 +758,7 
@@ message ScalarValue{ int32 interval_yearmonth_value = 24; int64 interval_daytime_value = 25; ScalarTimestampValue timestamp_value = 26; + ScalarDictionaryValue dictionary_value = 27; } } diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index 8c0124b9906b..95bfeb819a0c 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -776,6 +776,22 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { } } } + Value::DictionaryValue(v) => { + let index_type: DataType = v + .index_type + .as_ref() + .ok_or_else(|| Error::required("index_type"))? + .try_into()?; + + let value: Self = v + .value + .as_ref() + .ok_or_else(|| Error::required("value"))? + .as_ref() + .try_into()?; + + Self::Dictionary(Box::new(index_type), Box::new(value)) + } }) } } diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index ce2678025a88..4a1e782f3be1 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -451,11 +451,22 @@ mod roundtrip_tests { true, )), ), + ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("foo".into()))), + ), + ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(None)), + ), ]; for test_case in should_pass.into_iter() { - let proto: super::protobuf::ScalarValue = (&test_case).try_into().unwrap(); - let _roundtrip: ScalarValue = (&proto).try_into().unwrap(); + let proto: super::protobuf::ScalarValue = (&test_case) + .try_into() + .expect("failed conversion to protobuf"); + let _roundtrip: ScalarValue = + (&proto).try_into().expect("failed conversion to protobuf"); } } diff --git a/datafusion/proto/src/to_proto.rs b/datafusion/proto/src/to_proto.rs index 43d649029f8d..ed0b5ec0871c 100644 --- a/datafusion/proto/src/to_proto.rs +++ b/datafusion/proto/src/to_proto.rs @@ -1176,12 +1176,46 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue { Value::IntervalDaytimeValue(*s) }) } - ScalarValue::Null => protobuf::ScalarValue { + datafusion::scalar::ScalarValue::Null => protobuf::ScalarValue { value: Some(Value::NullValue(PrimitiveScalarType::Null as i32)), }, - _ => { + + datafusion::scalar::ScalarValue::Binary(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::LargeBinary(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::Time64(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::IntervalMonthDayNano(_) => { + // not yet implemented (TODO file ticket) return Err(Error::invalid_scalar_value(val)); } + + datafusion::scalar::ScalarValue::Struct(_, _) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::Dictionary(index_type, val) => { + let value: protobuf::ScalarValue = val.as_ref().try_into()?; + protobuf::ScalarValue { + value: Some(Value::DictionaryValue(Box::new( + protobuf::ScalarDictionaryValue { + index_type: Some(index_type.as_ref().try_into()?), + value: Some(Box::new(value)), + }, + ))), + } + } }; Ok(scalar_val) From f30fc4eccc4337fb0875dd731553a8389f6c6f65 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 19 Sep 2022 12:38:58 -0600 Subject: [PATCH 21/30] Add Debug to TableReference and ResolvedTableReference (#3533) --- datafusion/sql/src/table_reference.rs | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/table_reference.rs b/datafusion/sql/src/table_reference.rs index 44848eb99194..27905163dc7e 100644 --- a/datafusion/sql/src/table_reference.rs +++ b/datafusion/sql/src/table_reference.rs @@ -16,7 +16,7 @@ // under the License. /// Represents a resolved path to a table of the form "catalog.schema.table" -#[derive(Clone, Copy)] +#[derive(Debug, Clone, Copy)] pub struct ResolvedTableReference<'a> { /// The catalog (aka database) containing the table pub catalog: &'a str, @@ -27,7 +27,7 @@ pub struct ResolvedTableReference<'a> { } /// Represents a path to a table that may require further resolution -#[derive(Clone, Copy)] +#[derive(Debug, Clone, Copy)] pub enum TableReference<'a> { /// An unqualified table reference, e.g. "table" Bare { From 4b1e0444a6d82871c19ec826d61a4b33f3b52486 Mon Sep 17 00:00:00 2001 From: George Andronchik Date: Tue, 20 Sep 2022 05:49:37 +0800 Subject: [PATCH 22/30] feat: Union types coercion (#3513) 1 --- datafusion/core/src/physical_plan/union.rs | 32 +++++- datafusion/expr/src/expr_rewriter.rs | 38 ++++++- datafusion/expr/src/logical_plan/builder.rs | 73 ++++++++----- datafusion/sql/src/planner.rs | 108 +++++++++++++++++++- 4 files changed, 219 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/physical_plan/union.rs b/datafusion/core/src/physical_plan/union.rs index d57fbe0f3df3..bf9dfbd1b694 100644 --- a/datafusion/core/src/physical_plan/union.rs +++ b/datafusion/core/src/physical_plan/union.rs @@ -23,8 +23,12 @@ use std::{any::Any, sync::Arc}; -use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use arrow::{ + datatypes::{Field, Schema, SchemaRef}, + record_batch::RecordBatch, +}; use futures::StreamExt; +use itertools::Itertools; use log::debug; use super::{ @@ -46,14 +50,38 @@ pub struct UnionExec { inputs: Vec<Arc<dyn ExecutionPlan>>, /// Execution metrics metrics: ExecutionPlanMetricsSet, + /// Schema of Union + schema: SchemaRef, } impl UnionExec { /// Create a new UnionExec pub fn new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Self { + let fields: Vec<Field> = (0..inputs[0].schema().fields().len()) + .map(|i| { + inputs + .iter() + .filter_map(|input| { + if input.schema().fields().len() > i { + Some(input.schema().field(i).clone()) + } else { + None + } + }) + .find_or_first(|f| f.is_nullable()) + .unwrap() + }) + .collect(); + + let schema = Arc::new(Schema::new_with_metadata( + fields, + inputs[0].schema().metadata().clone(), + )); + + UnionExec { inputs, metrics: ExecutionPlanMetricsSet::new(), + schema, } } @@ -70,7 +98,7 @@ impl ExecutionPlan for UnionExec { } fn schema(&self) -> SchemaRef { - self.inputs[0].schema() + self.schema.clone() } fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { diff --git a/datafusion/expr/src/expr_rewriter.rs b/datafusion/expr/src/expr_rewriter.rs index 533f31ce1584..1c81b4c4a60c 100644 --- a/datafusion/expr/src/expr_rewriter.rs +++ b/datafusion/expr/src/expr_rewriter.rs @@ -18,8 +18,8 @@ //!
Expression rewriter use crate::expr::GroupingSet; -use crate::logical_plan::Aggregate; -use crate::utils::grouping_set_to_exprlist; +use crate::logical_plan::{Aggregate, Projection}; +use crate::utils::{from_plan, grouping_set_to_exprlist}; use crate::{Expr, ExprSchemable, LogicalPlan}; use datafusion_common::Result; use datafusion_common::{Column, DFSchema}; @@ -524,6 +524,40 @@ pub fn unnormalize_cols(exprs: impl IntoIterator) -> Vec { exprs.into_iter().map(unnormalize_col).collect() } +/// Returns plan with expressions coerced to types compatible with +/// schema types +pub fn coerce_plan_expr_for_schema( + plan: &LogicalPlan, + schema: &DFSchema, +) -> Result { + let new_expr = plan + .expressions() + .into_iter() + .enumerate() + .map(|(i, expr)| { + let new_type = schema.field(i).data_type(); + if plan.schema().field(i).data_type() != schema.field(i).data_type() { + match (plan, &expr) { + ( + LogicalPlan::Projection(Projection { input, .. }), + Expr::Alias(e, alias), + ) => Ok(Expr::Alias( + Box::new(e.clone().cast_to(new_type, input.schema())?), + alias.clone(), + )), + _ => expr.cast_to(new_type, plan.schema()), + } + } else { + Ok(expr) + } + }) + .collect::>>()?; + + let new_inputs = plan.inputs().into_iter().cloned().collect::>(); + + from_plan(plan, &new_expr, &new_inputs) +} + #[cfg(test)] mod test { use super::*; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 0125291fda0d..def9927ab6b3 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -17,7 +17,10 @@ //! This module provides a builder for creating LogicalPlans -use crate::expr_rewriter::{normalize_col, normalize_cols, rewrite_sort_cols_by_aggs}; +use crate::binary_rule::comparison_coercion; +use crate::expr_rewriter::{ + coerce_plan_expr_for_schema, normalize_col, normalize_cols, rewrite_sort_cols_by_aggs, +}; use crate::utils::{ columnize_expr, exprlist_to_fields, from_plan, grouping_set_to_exprlist, }; @@ -882,43 +885,59 @@ pub fn union_with_alias( right_plan: LogicalPlan, alias: Option, ) -> Result { - let union_schema = left_plan.schema().clone(); - let inputs_iter = vec![left_plan, right_plan] + let union_schema = (0..left_plan.schema().fields().len()) + .map(|i| { + let left_field = left_plan.schema().field(i); + let right_field = right_plan.schema().field(i); + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = + comparison_coercion(left_field.data_type(), right_field.data_type()) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "UNION Column {} (type: {}) is not compatible with column {} (type: {})", + right_field.name(), + right_field.data_type(), + left_field.name(), + left_field.data_type() + )) + })?; + + Ok(DFField::new( + alias.as_deref(), + left_field.name(), + data_type, + nullable, + )) + }) + .collect::>>()? + .to_dfschema()?; + + let inputs = vec![left_plan, right_plan] .into_iter() .flat_map(|p| match p { LogicalPlan::Union(Union { inputs, .. }) => inputs, x => vec![Arc::new(x)], - }); - - inputs_iter - .clone() - .skip(1) - .try_for_each(|input_plan| -> Result<()> { - union_schema.check_arrow_schema_type_compatible( - &((**input_plan.schema()).clone().into()), - ) - })?; - - let inputs = inputs_iter - .map(|p| match p.as_ref() { - LogicalPlan::Projection(Projection { - expr, input, alias, .. 
- }) => Ok(Arc::new(project_with_column_index_alias( - expr.to_vec(), - input.clone(), - union_schema.clone(), - alias.clone(), - )?)), - x => Ok(Arc::new(x.clone())), }) - .into_iter() + .map(|p| { + let plan = coerce_plan_expr_for_schema(&p, &union_schema)?; + match plan { + LogicalPlan::Projection(Projection { + expr, input, alias, .. + }) => Ok(Arc::new(project_with_column_index_alias( + expr.to_vec(), + input, + Arc::new(union_schema.clone()), + alias, + )?)), + x => Ok(Arc::new(x)), + } + }) .collect::>>()?; if inputs.is_empty() { return Err(DataFusionError::Plan("Empty UNION".to_string())); } - let union_schema = (**inputs[0].schema()).clone(); let union_schema = Arc::new(match alias { Some(ref alias) => union_schema.replace_qualifier(alias.as_str()), None => union_schema.strip_qualifiers(), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 04518d81e457..541dc2520e93 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -4163,7 +4163,7 @@ mod tests { let sql = "SELECT interval '1 year 1 day' UNION ALL SELECT 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Column Int64(1) (type: Int64) is \ + "Plan(\"UNION Column Int64(1) (type: Int64) is \ not compatible with column IntervalMonthDayNano\ (\\\"950737950189618795196236955648\\\") \ (type: Interval(MonthDayNano))\")", @@ -4171,6 +4171,112 @@ mod tests { ); } + #[test] + fn union_with_different_decimal_data_types() { + let sql = "SELECT 1 a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: CAST(Int64(1) AS Float64) AS a\ + \n EmptyRelation\ + \n Projection: Float64(1.1) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_null() { + let sql = "SELECT NULL a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: CAST(NULL AS Float64) AS a\ + \n EmptyRelation\ + \n Projection: Float64(1.1) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_float_and_string() { + let sql = "SELECT 'a' a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: Utf8(\"a\") AS a\ + \n EmptyRelation\ + \n Projection: CAST(Float64(1.1) AS Utf8) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_multiply_cols() { + let sql = "SELECT 'a' a, 1 b UNION ALL SELECT 1.1 a, 1.1 b"; + let expected = "Union\ + \n Projection: Utf8(\"a\") AS a, CAST(Int64(1) AS Float64) AS b\ + \n EmptyRelation\ + \n Projection: CAST(Float64(1.1) AS Utf8) AS a, Float64(1.1) AS b\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn sorted_union_with_different_types_and_group_by() { + let sql = "SELECT a FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a FROM (select 1.1 a) x GROUP BY 1) ORDER BY 1"; + let expected = "Sort: #a ASC NULLS LAST\ + \n Union\ + \n Projection: CAST(#x.a AS Float64) AS a\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #x.a\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Float64(1.1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_binary_expr_and_cast() { + let sql = "SELECT cast(0.0 + a as integer) FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT 2.1 + a FROM (select 1 a) x GROUP BY 1)"; + let expected = "Union\ + \n Projection: CAST(#Float64(0) + x.a AS Float64) AS Float64(0) + x.a\ + \n 
Aggregate: groupBy=[[CAST(Float64(0) + #x.a AS Int32)]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #Float64(2.1) + x.a\ + \n Aggregate: groupBy=[[Float64(2.1) + #x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_aliases() { + let sql = "SELECT a as a1 FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a as a1 FROM (select 1.1 a) x GROUP BY 1)"; + let expected = "Union\ + \n Projection: CAST(#x.a AS Float64) AS a1\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #x.a AS a1\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Float64(1.1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_incompatible_data_types() { + let sql = "SELECT 'a' a UNION ALL SELECT true a"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"UNION Column a (type: Boolean) is not compatible with column a (type: Utf8)\")", + format!("{:?}", err) + ); + } + #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; From e87342310e20b58ae2de2bb7cd0942dce089092e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 03:14:45 -0400 Subject: [PATCH 23/30] Actually test that `ScalarValue`s are the same after round trip serialization (#3537) --- datafusion/proto/src/lib.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 4a1e782f3be1..f34da705f94a 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -465,8 +465,17 @@ mod roundtrip_tests { let proto: super::protobuf::ScalarValue = (&test_case) .try_into() .expect("failed conversion to protobuf"); - let _roundtrip: ScalarValue = - (&proto).try_into().expect("failed conversion to protobuf"); + + let roundtrip: ScalarValue = (&proto) + .try_into() + .expect("failed conversion from protobuf"); + + assert_eq!( + test_case, roundtrip, + "ScalarValue was not the same after round trip!\n\n\ + Input: {:?}\n\nRoundtrip: {:?}", + test_case, roundtrip + ); } } From 67002a06039523e1356a5c91894b1387b021bbf7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 05:55:17 -0400 Subject: [PATCH 24/30] Upgrade to arrow 23.0.0 (#3483) * Changes for API * Update avro code for API changes * Use divide_opt` kernel * Update update_arrow_deps.py * Update arrow dependency to 23.0.0 * Use nicer RecordBatchOptions API * cleanups * fix: update --- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion-examples/examples/flight_server.rs | 5 ++++- datafusion/common/Cargo.toml | 4 ++-- datafusion/core/Cargo.toml | 6 +++--- datafusion/core/fuzz-utils/Cargo.toml | 2 +- .../core/src/avro_to_arrow/arrow_array_reader.rs | 7 ++----- .../core/src/physical_plan/coalesce_batches.rs | 3 +-- .../core/src/physical_plan/file_format/mod.rs | 3 +-- datafusion/expr/Cargo.toml | 2 +- datafusion/jit/Cargo.toml | 2 +- datafusion/optimizer/Cargo.toml | 2 +- datafusion/physical-expr/Cargo.toml | 2 +- .../physical-expr/src/expressions/binary.rs | 10 ++++------ .../src/expressions/binary/kernels_arrow.rs | 7 ++++--- datafusion/proto/Cargo.toml | 2 +- datafusion/row/Cargo.toml | 2 +- 
datafusion/sql/Cargo.toml | 2 +- dev/update_arrow_deps.py | 16 ++++++++++++++-- 19 files changed, 45 insertions(+), 36 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 394bf59a8421..de7bfe07094a 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.62" readme = "README.md" [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "12.0.0" } dirs = "4.0.0" diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index c769326f1996..5d2150848afb 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -34,7 +34,7 @@ path = "examples/avro_sql.rs" required-features = ["datafusion/avro"] [dev-dependencies] -arrow-flight = "22.0.0" +arrow-flight = "23.0.0" async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } futures = "0.3" diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 5dbd694ac0f5..88cc6f1c23be 100644 --- a/datafusion-examples/examples/flight_server.rs +++ b/datafusion-examples/examples/flight_server.rs @@ -19,6 +19,7 @@ use std::pin::Pin; use std::sync::Arc; use arrow_flight::SchemaAsIpc; +use datafusion::arrow::error::ArrowError; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; use futures::Stream; @@ -77,7 +78,9 @@ impl FlightService for FlightServiceImpl { .unwrap(); let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); - let schema_result = SchemaAsIpc::new(&schema, &options).into(); + let schema_result = SchemaAsIpc::new(&schema, &options) + .try_into() + .map_err(|e: ArrowError| tonic::Status::internal(e.to_string()))?; Ok(Response::new(schema_result)) } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 028a4874a8bc..3337747efe37 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -39,11 +39,11 @@ pyarrow = ["pyo3"] [dependencies] apache-avro = { version = "0.14", features = ["snappy"], optional = true } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } avro-rs = { version = "0.13", features = ["snappy"], optional = true } cranelift-module = { version = "0.87.0", optional = true } object_store = { version = "0.5.0", optional = true } ordered-float = "3.0" -parquet = { version = "22.0.0", features = ["arrow"], optional = true } +parquet = { version = "23.0.0", features = ["arrow"], optional = true } pyo3 = { version = "0.17.1", optional = true } sqlparser = "0.23" diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index e63f3100d6c9..f5f4350b072a 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -56,7 +56,7 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.14", optional = true } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } async-trait = "0.1.41" bytes = "1.1" chrono = { version = "0.4", default-features = false } @@ -78,7 +78,7 @@ num_cpus = "1.13.0" object_store = "0.5.0" ordered-float = "3.0" parking_lot = "0.12" -parquet = { version = "22.0.0", features = ["arrow", 
"async"] } +parquet = { version = "23.0.0", features = ["arrow", "async"] } paste = "^1.0" pin-project-lite = "^0.2.7" pyo3 = { version = "0.17.1", optional = true } @@ -93,7 +93,7 @@ url = "2.2" uuid = { version = "1.0", features = ["v4"] } [dev-dependencies] -arrow = { version = "22.0.0", features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { version = "23.0.0", features = ["prettyprint", "dyn_cmp_dict"] } async-trait = "0.1.53" criterion = "0.4" csv = "1.1.6" diff --git a/datafusion/core/fuzz-utils/Cargo.toml b/datafusion/core/fuzz-utils/Cargo.toml index 5f07570ad043..07f09b6362cc 100644 --- a/datafusion/core/fuzz-utils/Cargo.toml +++ b/datafusion/core/fuzz-utils/Cargo.toml @@ -23,6 +23,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } env_logger = "0.9.0" rand = "0.8" diff --git a/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs index bed5bbcdc885..6dee6f3d25a2 100644 --- a/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs +++ b/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs @@ -20,8 +20,7 @@ use crate::arrow::array::{ make_array, Array, ArrayBuilder, ArrayData, ArrayDataBuilder, ArrayRef, BooleanBuilder, LargeStringArray, ListBuilder, NullArray, OffsetSizeTrait, - PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, - StringDictionaryBuilder, + PrimitiveArray, StringArray, StringBuilder, StringDictionaryBuilder, }; use crate::arrow::buffer::{Buffer, MutableBuffer}; use crate::arrow::datatypes::{ @@ -171,9 +170,7 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> { where T: ArrowPrimitiveType + ArrowDictionaryKeyType, { - let key_builder = PrimitiveBuilder::::with_capacity(row_len); - let values_builder = StringBuilder::with_capacity(row_len, 5); - StringDictionaryBuilder::new(key_builder, values_builder) + StringDictionaryBuilder::with_capacity(row_len, row_len, row_len) } fn build_wrapped_list_array( diff --git a/datafusion/core/src/physical_plan/coalesce_batches.rs b/datafusion/core/src/physical_plan/coalesce_batches.rs index f2edacd90847..317500ddc904 100644 --- a/datafusion/core/src/physical_plan/coalesce_batches.rs +++ b/datafusion/core/src/physical_plan/coalesce_batches.rs @@ -292,8 +292,7 @@ pub fn concat_batches( row_count ); - let mut options = RecordBatchOptions::default(); - options.row_count = Some(row_count); + let options = RecordBatchOptions::new().with_row_count(Some(row_count)); RecordBatch::try_new_with_options(schema.clone(), arrays, &options) } diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 2d59570078bc..295fef27b291 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -286,8 +286,7 @@ impl SchemaAdapter { let projected_schema = Arc::new(self.table_schema.clone().project(projections)?); // Necessary to handle empty batches - let mut options = RecordBatchOptions::default(); - options.row_count = Some(batch.num_rows()); + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); Ok(RecordBatch::try_new_with_options( projected_schema, diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index b187c4d32665..6277091ed25b 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml 
@@ -36,6 +36,6 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } datafusion-common = { path = "../common", version = "12.0.0" } sqlparser = "0.23" diff --git a/datafusion/jit/Cargo.toml b/datafusion/jit/Cargo.toml index 32e99de53a68..428027cb4367 100644 --- a/datafusion/jit/Cargo.toml +++ b/datafusion/jit/Cargo.toml @@ -36,7 +36,7 @@ path = "src/lib.rs" jit = [] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" cranelift = "0.87.0" cranelift-jit = "0.87.0" cranelift-module = "0.87.0" diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index aa735f7aa6e1..409521088bf6 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -37,7 +37,7 @@ default = ["unicode_expressions"] unicode_expressions = [] [dependencies] -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } async-trait = "0.1.41" chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "12.0.0" } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 182eeb40bfbe..ba618acb64bf 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -40,7 +40,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 2395f85c1eb0..b078da5f0b34 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -24,7 +24,7 @@ use std::{any::Any, sync::Arc}; use arrow::array::*; use arrow::compute::kernels::arithmetic::{ - add, add_scalar, divide, divide_scalar, modulus, modulus_scalar, multiply, + add, add_scalar, divide_opt, divide_scalar, modulus, modulus_scalar, multiply, multiply_scalar, subtract, subtract_scalar, }; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; @@ -60,7 +60,7 @@ use kernels::{ bitwise_xor, bitwise_xor_scalar, }; use kernels_arrow::{ - add_decimal, add_decimal_scalar, divide_decimal, divide_decimal_scalar, + add_decimal, add_decimal_scalar, divide_decimal_scalar, divide_opt_decimal, eq_decimal_scalar, gt_decimal_scalar, gt_eq_decimal_scalar, is_distinct_from, is_distinct_from_bool, is_distinct_from_decimal, is_distinct_from_null, is_distinct_from_utf8, is_not_distinct_from, is_not_distinct_from_bool, @@ -844,7 +844,7 @@ impl BinaryExpr { Operator::Plus => binary_primitive_array_op!(left, right, add), Operator::Minus => binary_primitive_array_op!(left, right, subtract), Operator::Multiply => binary_primitive_array_op!(left, right, multiply), - Operator::Divide => binary_primitive_array_op!(left, right, divide), + Operator::Divide => binary_primitive_array_op!(left, right, divide_opt), Operator::Modulo => binary_primitive_array_op!(left, right, modulus), Operator::And => { if left_data_type == &DataType::Boolean { @@ -1326,9 +1326,7 @@ mod tests { let 
string_type = DataType::Utf8; // build dictionary - let keys_builder = PrimitiveBuilder::::with_capacity(10); - let values_builder = arrow::array::StringBuilder::with_capacity(10, 1024); - let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut dict_builder = StringDictionaryBuilder::::new(); dict_builder.append("one")?; dict_builder.append_null(); diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs index 8caed7191fb2..99b8a5a06dd6 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs @@ -372,7 +372,7 @@ pub(crate) fn multiply_decimal_scalar( Ok(array) } -pub(crate) fn divide_decimal( +pub(crate) fn divide_opt_decimal( left: &Decimal128Array, right: &Decimal128Array, ) -> Result { @@ -636,7 +636,7 @@ mod tests { 25, 3, ); - let result = divide_decimal(&left_decimal_array, &right_decimal_array)?; + let result = divide_opt_decimal(&left_decimal_array, &right_decimal_array)?; let expect = create_decimal_array( &[Some(123456700), None, Some(22446672), Some(-10037130), None], 25, @@ -674,7 +674,8 @@ mod tests { let left_decimal_array = create_decimal_array(&[Some(101)], 10, 1); let right_decimal_array = create_decimal_array(&[Some(0)], 1, 1); - let err = divide_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + let err = + divide_opt_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); assert_eq!("Arrow error: Divide by zero error", err.to_string()); let err = divide_decimal_scalar(&left_decimal_array, 0).unwrap_err(); assert_eq!("Arrow error: Divide by zero error", err.to_string()); diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index c4a3c4149148..c8711fa0b02f 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -37,7 +37,7 @@ default = [] json = ["pbjson", "pbjson-build", "serde", "serde_json"] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" datafusion = { path = "../core", version = "12.0.0" } datafusion-common = { path = "../common", version = "12.0.0" } datafusion-expr = { path = "../expr", version = "12.0.0" } diff --git a/datafusion/row/Cargo.toml b/datafusion/row/Cargo.toml index 3adff03b02fd..d1c7fdf0cf44 100644 --- a/datafusion/row/Cargo.toml +++ b/datafusion/row/Cargo.toml @@ -37,7 +37,7 @@ path = "src/lib.rs" jit = ["datafusion-jit"] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" datafusion-common = { path = "../common", version = "12.0.0" } datafusion-jit = { path = "../jit", version = "12.0.0", optional = true } paste = "^1.0" diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 2e1536559cc5..b2da631827a5 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -38,7 +38,7 @@ unicode_expressions = [] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } datafusion-common = { path = "../common", version = "12.0.0" } datafusion-expr = { path = "../expr", version = "12.0.0" } hashbrown = "0.12" diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py index a991de49eea3..b685ad2738b1 100755 --- a/dev/update_arrow_deps.py +++ b/dev/update_arrow_deps.py @@ -88,8 +88,20 @@ def update_version_cargo_toml(cargo_toml, new_version): for section in ("dependencies", "dev-dependencies"): 
for (dep_name, constraint) in doc.get(section, {}).items(): - if dep_name in ("arrow", "parquet", "arrow-flight") and constraint.get("version") is not None: - doc[section][dep_name]["version"] = new_version + if dep_name in ("arrow", "parquet", "arrow-flight"): + if type(constraint) == tomlkit.items.String: + # handle constraint that is (only) a string like '12', + doc[section][dep_name] = new_version + elif type(constraint) == dict: + # handle constraint that is itself a struct like + # {'version': '12', 'features': ['prettyprint']} + doc[section][dep_name]["version"] = new_version + elif type(constraint) == tomlkit.items.InlineTable: + # handle constraint that is itself a struct like + # {'version': '12', 'features': ['prettyprint']} + doc[section][dep_name]["version"] = new_version + else: + print("Unknown type for {} {}: {}", dep_name, constraint, type(constraint)) with open(cargo_toml, 'w') as f: f.write(tomlkit.dumps(doc)) From c7f3a70a79ee84070f7e8f770981fc982d8df0a4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 05:58:57 -0400 Subject: [PATCH 25/30] Add additional pruning tests with casts, handle unsupported predicates better (#3454) * Add tests for pruning, support pruning with constant expressions * Use downcast_any! * chore: Remove uneeded use --- .../core/src/physical_optimizer/pruning.rs | 184 ++++++++++++++---- datafusion/core/tests/parquet_pruning.rs | 12 +- 2 files changed, 147 insertions(+), 49 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index f5433d7e869c..9c53e0a6a2de 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -38,11 +38,13 @@ use crate::{ logical_plan::{Column, DFSchema, Expr, Operator}, physical_plan::{ColumnarValue, PhysicalExpr}, }; +use arrow::record_batch::RecordBatchOptions; use arrow::{ array::{new_null_array, ArrayRef, BooleanArray}, datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; +use datafusion_common::{downcast_value, ScalarValue}; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; use datafusion_expr::utils::expr_to_columns; @@ -169,38 +171,42 @@ impl PruningPredicate { /// simplified version `b`. The predicates are simplified via the /// ConstantFolding optimizer pass pub fn prune(&self, statistics: &S) -> Result> { - // build statistics record batch - let predicate_array = - build_statistics_record_batch(statistics, &self.required_columns) - .and_then(|statistics_batch| { - // execute predicate expression - self.predicate_expr.evaluate(&statistics_batch) - }) - .and_then(|v| match v { - ColumnarValue::Array(array) => Ok(array), - ColumnarValue::Scalar(_) => Err(DataFusionError::Internal( - "predicate expression didn't return an array".to_string(), - )), - })?; - - let predicate_array = predicate_array - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Internal(format!( - "Expected pruning predicate evaluation to be BooleanArray, \ - but was {:?}", - predicate_array - )) - })?; - - // when the result of the predicate expression for a row group is null / undefined, - // e.g. 
due to missing statistics, this row group can't be filtered out, - // so replace with true - Ok(predicate_array - .into_iter() - .map(|x| x.unwrap_or(true)) - .collect::>()) + // build a RecordBatch that contains the min/max values in the + // appropriate statistics columns + let statistics_batch = + build_statistics_record_batch(statistics, &self.required_columns)?; + + // Evaluate the pruning predicate on that record batch. + // + // Use true when the result of evaluating a predicate + // expression on a row group is null (aka `None`). Null can + // arise when the statistics are unknown or some calculation + // in the predicate means we don't know for sure if the row + // group can be filtered out or not. To maintain correctness + // the row group must be kept and thus `true` is returned. + match self.predicate_expr.evaluate(&statistics_batch)? { + ColumnarValue::Array(array) => { + let predicate_array = downcast_value!(array, BooleanArray); + + Ok(predicate_array + .into_iter() + .map(|x| x.unwrap_or(true)) // None -> true per comments above + .collect::>()) + + }, + // result was a column + ColumnarValue::Scalar(ScalarValue::Boolean(v)) => { + let v = v.unwrap_or(true); // None -> true per comments above + Ok(vec![v; statistics.num_containers()]) + } + other => { + Err(DataFusionError::Internal(format!( + "Unexpected result of pruning predicate evaluation. Expected Boolean array \ + or scalar but got {:?}", + other + ))) + } + } } /// Return a reference to the input schema @@ -391,8 +397,13 @@ fn build_statistics_record_batch( } let schema = Arc::new(Schema::new(fields)); - RecordBatch::try_new(schema, arrays) - .map_err(|err| DataFusionError::Plan(err.to_string())) + // provide the count in case there were no needed statistics + let mut options = RecordBatchOptions::default(); + options.row_count = Some(statistics.num_containers()); + + RecordBatch::try_new_with_options(schema, arrays, &options).map_err(|err| { + DataFusionError::Plan(format!("Can not create statistics record batch: {}", err)) + }) } struct PruningExpressionBuilder<'a> { @@ -1168,7 +1179,7 @@ mod tests { } #[test] - fn test_build_statistics_no_stats() { + fn test_build_statistics_no_required_stats() { let required_columns = RequiredStatColumns::new(); let statistics = OneContainerStats { @@ -1177,13 +1188,9 @@ mod tests { num_containers: 1, }; - let result = - build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); - assert!( - result.to_string().contains("Invalid argument error"), - "{}", - result - ); + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + assert_eq!(batch.num_rows(), 1); // had 1 container } #[test] @@ -1858,7 +1865,15 @@ mod tests { assert_eq!(result, expected_false); } - /// Creates setup for int32 chunk pruning + /// Creates a setup for chunk pruning, modeling a int32 column "i" + /// with 5 different containers (e.g. RowGroups). 
They have [min, + /// max]: + /// + /// i [-5, 5] + /// i [1, 11] + /// i [-11, -1] + /// i [NULL, NULL] + /// i [1, NULL] fn int32_setup() -> (SchemaRef, TestStatistics) { let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); @@ -1922,6 +1937,45 @@ mod tests { assert_eq!(result, expected_ret); } + #[test] + fn prune_int32_col_lte_zero_cast() { + let (schema, statistics) = int32_setup(); + + // Expression "cast(i as utf8) <= '0'" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass in theory, -0.22 (conservatively keep) + // i [-11, -1] ==> no rows could pass in theory (conservatively keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (conservatively keep) + let expected_ret = vec![true, true, true, true, true]; + + // cast(i as utf8) <= 0 + let expr = cast(col("i"), DataType::Utf8).lt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // try_cast(i as utf8) <= 0 + let expr = try_cast(col("i"), DataType::Utf8).lt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // cast(-i as utf8) >= 0 + let expr = + Expr::Negative(Box::new(cast(col("i"), DataType::Utf8))).gt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // try_cast(-i as utf8) >= 0 + let expr = + Expr::Negative(Box::new(try_cast(col("i"), DataType::Utf8))).gt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + #[test] fn prune_int32_col_eq_zero() { let (schema, statistics) = int32_setup(); @@ -1941,6 +1995,50 @@ mod tests { assert_eq!(result, expected_ret); } + #[test] + fn prune_int32_col_eq_zero_cast() { + let (schema, statistics) = int32_setup(); + + // Expression "cast(i as int64) = 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = vec![true, false, false, true, false]; + + let expr = cast(col("i"), DataType::Int64).eq(lit(0i64)); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + let expr = try_cast(col("i"), DataType::Int64).eq(lit(0i64)); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + + #[test] + fn prune_int32_col_eq_zero_cast_as_str() { + let (schema, statistics) = int32_setup(); + + // Note the cast is to a string where sorting properties are + // not the same as integers + // + // Expression "cast(i as utf8) = '0'" + // i [-5, 5] ==> some rows could pass (keep) + // i [1, 11] ==> no rows can pass (could keep) + // i [-11, -1] ==> no rows can pass (could keep) + // i [NULL, NULL] ==> unknown (keep) + // i [1, NULL] ==> no rows can pass (could keep) + let expected_ret = vec![true, true, true, true, true]; + + let expr = cast(col("i"), DataType::Utf8).eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = 
p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + #[test] fn prune_int32_col_lt_neg_one() { let (schema, statistics) = int32_setup(); diff --git a/datafusion/core/tests/parquet_pruning.rs b/datafusion/core/tests/parquet_pruning.rs index 1de1d7c8fe87..a7b918180016 100644 --- a/datafusion/core/tests/parquet_pruning.rs +++ b/datafusion/core/tests/parquet_pruning.rs @@ -237,7 +237,7 @@ async fn prune_int32_scalar_fun() { test_prune( Scenario::Int32, "SELECT * FROM t where abs(i) = 1", - Some(4), + Some(0), Some(0), 3, ) @@ -249,7 +249,7 @@ async fn prune_int32_complex_expr() { test_prune( Scenario::Int32, "SELECT * FROM t where i+1 = 1", - Some(4), + Some(0), Some(0), 2, ) @@ -261,7 +261,7 @@ async fn prune_int32_complex_expr_subtract() { test_prune( Scenario::Int32, "SELECT * FROM t where 1-i > 1", - Some(4), + Some(0), Some(0), 9, ) @@ -308,7 +308,7 @@ async fn prune_f64_scalar_fun() { test_prune( Scenario::Float64, "SELECT * FROM t where abs(f-1) <= 0.000001", - Some(4), + Some(0), Some(0), 1, ) @@ -321,7 +321,7 @@ async fn prune_f64_complex_expr() { test_prune( Scenario::Float64, "SELECT * FROM t where f+1 > 1.1", - Some(4), + Some(0), Some(0), 9, ) @@ -334,7 +334,7 @@ async fn prune_f64_complex_expr_subtract() { test_prune( Scenario::Float64, "SELECT * FROM t where 1-f > 1", - Some(4), + Some(0), Some(0), 9, ) From 81b57948778cfa485d8b3f2c1fe4485f3966e8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 20 Sep 2022 15:11:42 +0200 Subject: [PATCH 26/30] Push down limit to sort (#3530) * Push down limit to sort Support skip, fix test Fmt Add limit directly after sort Update comment Simplify parallel sort by using new pushdown Clippy * Update datafusion/core/src/physical_plan/sorts/sort.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe.rs | 2 +- .../src/physical_optimizer/parallel_sort.rs | 23 +++---- .../src/physical_optimizer/repartition.rs | 2 +- datafusion/core/src/physical_plan/planner.rs | 8 +-- .../core/src/physical_plan/sorts/sort.rs | 30 +++++++-- .../sorts/sort_preserving_merge.rs | 10 ++- datafusion/core/tests/order_spill_fuzz.rs | 2 +- datafusion/core/tests/user_defined_plan.rs | 1 + datafusion/expr/src/logical_plan/builder.rs | 2 + datafusion/expr/src/logical_plan/plan.rs | 8 ++- datafusion/expr/src/utils.rs | 3 +- .../optimizer/src/common_subexpr_eliminate.rs | 3 +- datafusion/optimizer/src/limit_push_down.rs | 63 ++++++++++++++++++- datafusion/proto/proto/datafusion.proto | 2 + datafusion/proto/src/logical_plan.rs | 3 +- 15 files changed, 132 insertions(+), 30 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index a1b43cf531f1..c9bf527091c0 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -1296,7 +1296,7 @@ mod tests { assert_eq!("\ Projection: #t1.c1 AS AAA, #t1.c2, #t1.c3, #t2.c1, #t2.c2, #t2.c3\ \n Limit: skip=0, fetch=1\ - \n Sort: #t1.c1 ASC NULLS FIRST, #t1.c2 ASC NULLS FIRST, #t1.c3 ASC NULLS FIRST, #t2.c1 ASC NULLS FIRST, #t2.c2 ASC NULLS FIRST, #t2.c3 ASC NULLS FIRST\ + \n Sort: #t1.c1 ASC NULLS FIRST, #t1.c2 ASC NULLS FIRST, #t1.c3 ASC NULLS FIRST, #t2.c1 ASC NULLS FIRST, #t2.c2 ASC NULLS FIRST, #t2.c3 ASC NULLS FIRST, fetch=1\ \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1 projection=[c1, c2, c3]\ \n TableScan: t2 projection=[c1, c2, c3]", diff --git a/datafusion/core/src/physical_optimizer/parallel_sort.rs b/datafusion/core/src/physical_optimizer/parallel_sort.rs index 
3361d8155f7f..e3ca60cb5cf7 100644 --- a/datafusion/core/src/physical_optimizer/parallel_sort.rs +++ b/datafusion/core/src/physical_optimizer/parallel_sort.rs @@ -20,7 +20,6 @@ use crate::{ error::Result, physical_optimizer::PhysicalOptimizerRule, physical_plan::{ - limit::GlobalLimitExec, sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, with_new_children_if_necessary, }, @@ -55,31 +54,33 @@ impl PhysicalOptimizerRule for ParallelSort { .map(|child| self.optimize(child.clone(), config)) .collect::>>()?; let plan = with_new_children_if_necessary(plan, children)?; - let children = plan.children(); let plan_any = plan.as_any(); - // GlobalLimitExec (SortExec preserve_partitioning=False) - // -> GlobalLimitExec (SortExec preserve_partitioning=True) - let parallel_sort = plan_any.downcast_ref::().is_some() - && children.len() == 1 - && children[0].as_any().downcast_ref::().is_some() - && !children[0] - .as_any() + // SortExec preserve_partitioning=False, fetch=Some(n)) + // -> SortPreservingMergeExec (SortExec preserve_partitioning=True, fetch=Some(n)) + let parallel_sort = plan_any.downcast_ref::().is_some() + && plan_any + .downcast_ref::() + .unwrap() + .fetch() + .is_some() + && !plan_any .downcast_ref::() .unwrap() .preserve_partitioning(); Ok(if parallel_sort { - let sort = children[0].as_any().downcast_ref::().unwrap(); + let sort = plan_any.downcast_ref::().unwrap(); let new_sort = SortExec::new_with_partitioning( sort.expr().to_vec(), sort.input().clone(), true, + sort.fetch(), ); let merge = SortPreservingMergeExec::new( sort.expr().to_vec(), Arc::new(new_sort), ); - with_new_children_if_necessary(plan, vec![Arc::new(merge)])? + Arc::new(merge) } else { plan.clone() }) diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 20aa59f0cd67..1d2b25908683 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -295,7 +295,7 @@ mod tests { expr: col("c1", &schema()).unwrap(), options: SortOptions::default(), }]; - Arc::new(SortExec::try_new(sort_exprs, input).unwrap()) + Arc::new(SortExec::try_new(sort_exprs, input, None).unwrap()) } fn projection_exec(input: Arc) -> Arc { diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 20a819622d5d..005a7943265a 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -590,9 +590,9 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; Arc::new(if can_repartition { - SortExec::new_with_partitioning(sort_keys, input_exec, true) + SortExec::new_with_partitioning(sort_keys, input_exec, true, None) } else { - SortExec::try_new(sort_keys, input_exec)? + SortExec::try_new(sort_keys, input_exec, None)? }) }; @@ -815,7 +815,7 @@ impl DefaultPhysicalPlanner { physical_partitioning, )?) ) } - LogicalPlan::Sort(Sort { expr, input, .. }) => { + LogicalPlan::Sort(Sort { expr, input, fetch, .. 
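+            // the Sort node's new `fetch` field is forwarded to SortExec::try_new below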
}) => { let physical_input = self.create_initial_plan(input, session_state).await?; let input_schema = physical_input.as_ref().schema(); let input_dfschema = input.as_ref().schema(); @@ -841,7 +841,7 @@ impl DefaultPhysicalPlanner { )), }) .collect::>>()?; - Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)) + Ok(Arc::new(SortExec::try_new(sort_expr, physical_input, *fetch)?)) } LogicalPlan::Join(Join { left, diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index cc0501785324..8e8457be2549 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -82,6 +82,7 @@ struct ExternalSorter { runtime: Arc, metrics_set: CompositeMetricsSet, metrics: BaselineMetrics, + fetch: Option, } impl ExternalSorter { @@ -92,6 +93,7 @@ impl ExternalSorter { metrics_set: CompositeMetricsSet, session_config: Arc, runtime: Arc, + fetch: Option, ) -> Self { let metrics = metrics_set.new_intermediate_baseline(partition_id); Self { @@ -104,6 +106,7 @@ impl ExternalSorter { runtime, metrics_set, metrics, + fetch, } } @@ -120,7 +123,7 @@ impl ExternalSorter { // NB timer records time taken on drop, so there are no // calls to `timer.done()` below. let _timer = tracking_metrics.elapsed_compute().timer(); - let partial = sort_batch(input, self.schema.clone(), &self.expr)?; + let partial = sort_batch(input, self.schema.clone(), &self.expr, self.fetch)?; in_mem_batches.push(partial); } Ok(()) @@ -657,6 +660,8 @@ pub struct SortExec { metrics_set: CompositeMetricsSet, /// Preserve partitions of input plan preserve_partitioning: bool, + /// Fetch highest/lowest n results + fetch: Option, } impl SortExec { @@ -664,8 +669,9 @@ impl SortExec { pub fn try_new( expr: Vec, input: Arc, + fetch: Option, ) -> Result { - Ok(Self::new_with_partitioning(expr, input, false)) + Ok(Self::new_with_partitioning(expr, input, false, fetch)) } /// Whether this `SortExec` preserves partitioning of the children @@ -679,12 +685,14 @@ impl SortExec { expr: Vec, input: Arc, preserve_partitioning: bool, + fetch: Option, ) -> Self { Self { expr, input, metrics_set: CompositeMetricsSet::new(), preserve_partitioning, + fetch, } } @@ -697,6 +705,11 @@ impl SortExec { pub fn expr(&self) -> &[PhysicalSortExpr] { &self.expr } + + /// If `Some(fetch)`, limits output to only the first "fetch" items + pub fn fetch(&self) -> Option { + self.fetch + } } impl ExecutionPlan for SortExec { @@ -750,6 +763,7 @@ impl ExecutionPlan for SortExec { self.expr.clone(), children[0].clone(), self.preserve_partitioning, + self.fetch, ))) } @@ -778,6 +792,7 @@ impl ExecutionPlan for SortExec { self.expr.clone(), self.metrics_set.clone(), context, + self.fetch(), ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -816,14 +831,14 @@ fn sort_batch( batch: RecordBatch, schema: SchemaRef, expr: &[PhysicalSortExpr], + fetch: Option, ) -> ArrowResult { - // TODO: pushup the limit expression to sort let sort_columns = expr .iter() .map(|e| e.evaluate_to_sort_column(&batch)) .collect::>>()?; - let indices = lexsort_to_indices(&sort_columns, None)?; + let indices = lexsort_to_indices(&sort_columns, fetch)?; // reorder all rows based on sorted indices let sorted_batch = RecordBatch::try_new( @@ -870,6 +885,7 @@ async fn do_sort( expr: Vec, metrics_set: CompositeMetricsSet, context: Arc, + fetch: Option, ) -> Result { debug!( "Start do_sort for partition {} of context session_id {} and task_id {:?}", @@ -887,6 +903,7 @@ async fn do_sort( 
metrics_set, Arc::new(context.session_config()), context.runtime_env(), + fetch, ); context.runtime_env().register_requester(sorter.id()); while let Some(batch) = input.next().await { @@ -949,6 +966,7 @@ mod tests { }, ], Arc::new(CoalescePartitionsExec::new(csv)), + None, )?); let result = collect(sort_exec, task_ctx).await?; @@ -1011,6 +1029,7 @@ mod tests { }, ], Arc::new(CoalescePartitionsExec::new(csv)), + None, )?); let task_ctx = session_ctx.task_ctx(); @@ -1083,6 +1102,7 @@ mod tests { options: SortOptions::default(), }], input, + None, )?); let result: Vec = collect(sort_exec, task_ctx).await?; @@ -1159,6 +1179,7 @@ mod tests { }, ], Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)?), + None, )?); assert_eq!(DataType::Float32, *sort_exec.schema().field(0).data_type()); @@ -1226,6 +1247,7 @@ mod tests { options: SortOptions::default(), }], blocking_exec, + None, )?); let fut = collect(sort_exec, task_ctx); diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index dc788be3380e..5db3c50e6c14 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -874,8 +874,12 @@ mod tests { sort: Vec, context: Arc, ) -> RecordBatch { - let sort_exec = - Arc::new(SortExec::new_with_partitioning(sort.clone(), input, true)); + let sort_exec = Arc::new(SortExec::new_with_partitioning( + sort.clone(), + input, + true, + None, + )); sorted_merge(sort_exec, sort, context).await } @@ -885,7 +889,7 @@ mod tests { context: Arc, ) -> RecordBatch { let merge = Arc::new(CoalescePartitionsExec::new(src)); - let sort_exec = Arc::new(SortExec::try_new(sort, merge).unwrap()); + let sort_exec = Arc::new(SortExec::try_new(sort, merge, None).unwrap()); let mut result = collect(sort_exec, context).await.unwrap(); assert_eq!(result.len(), 1); result.remove(0) diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index a2d47629b927..faaa2ae0bb3d 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -75,7 +75,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { }]; let exec = MemoryExec::try_new(&input, schema, None).unwrap(); - let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec)).unwrap()); + let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); let runtime_config = RuntimeConfig::new().with_memory_manager( MemoryManagerConfig::try_new_limit(pool_size, 1.0).unwrap(), diff --git a/datafusion/core/tests/user_defined_plan.rs b/datafusion/core/tests/user_defined_plan.rs index 13ddb1eb8da1..c577e48e7800 100644 --- a/datafusion/core/tests/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined_plan.rs @@ -299,6 +299,7 @@ impl OptimizerRule for TopKOptimizerRule { if let LogicalPlan::Sort(Sort { ref expr, ref input, + .. 
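+            // `..` ignores the Sort node's new `fetch` field; the TopK plan carries its own limit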
}) = **input { if expr.len() == 1 { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index def9927ab6b3..27837ff68ca8 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -396,6 +396,7 @@ impl LogicalPlanBuilder { return Ok(Self::from(LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &self.plan)?, input: Arc::new(self.plan.clone()), + fetch: None, }))); } @@ -403,6 +404,7 @@ impl LogicalPlanBuilder { let sort_plan = LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &plan)?, input: Arc::new(plan.clone()), + fetch: None, }); // remove pushed down sort columns let new_expr = schema diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f6b2b1b74f6a..049e6158ca8f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -806,7 +806,7 @@ impl LogicalPlan { "Aggregate: groupBy=[{:?}], aggr=[{:?}]", group_expr, aggr_expr ), - LogicalPlan::Sort(Sort { expr, .. }) => { + LogicalPlan::Sort(Sort { expr, fetch, .. }) => { write!(f, "Sort: ")?; for (i, expr_item) in expr.iter().enumerate() { if i > 0 { @@ -814,6 +814,10 @@ impl LogicalPlan { } write!(f, "{:?}", expr_item)?; } + if let Some(a) = fetch { + write!(f, ", fetch={}", a)?; + } + Ok(()) } LogicalPlan::Join(Join { @@ -1373,6 +1377,8 @@ pub struct Sort { pub expr: Vec, /// The incoming logical plan pub input: Arc, + /// Optional fetch limit + pub fetch: Option, } /// Join two logical plans on one or more join columns diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 65c4be24849b..4ace809b811d 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -420,9 +420,10 @@ pub fn from_plan( expr[group_expr.len()..].to_vec(), schema.clone(), )?)), - LogicalPlan::Sort(Sort { .. }) => Ok(LogicalPlan::Sort(Sort { + LogicalPlan::Sort(Sort { fetch, .. }) => Ok(LogicalPlan::Sort(Sort { expr: expr.to_vec(), input: Arc::new(inputs[0].clone()), + fetch: *fetch, })), LogicalPlan::Join(Join { join_type, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 978b79d375d3..b17dd9492efc 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -196,7 +196,7 @@ fn optimize( schema.clone(), )?)) } - LogicalPlan::Sort(Sort { expr, input }) => { + LogicalPlan::Sort(Sort { expr, input, fetch }) => { let arrays = to_arrays(expr, input, &mut expr_set)?; let (mut new_expr, new_input) = rewrite_expr( @@ -210,6 +210,7 @@ fn optimize( Ok(LogicalPlan::Sort(Sort { expr: pop_expr(&mut new_expr)?, input: Arc::new(new_input), + fetch: *fetch, })) } LogicalPlan::Join { .. 
} diff --git a/datafusion/optimizer/src/limit_push_down.rs b/datafusion/optimizer/src/limit_push_down.rs index 65bdd64ee0c5..cacbbfc5b2b1 100644 --- a/datafusion/optimizer/src/limit_push_down.rs +++ b/datafusion/optimizer/src/limit_push_down.rs @@ -20,7 +20,9 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ - logical_plan::{Join, JoinType, Limit, LogicalPlan, Projection, TableScan, Union}, + logical_plan::{ + Join, JoinType, Limit, LogicalPlan, Projection, Sort, TableScan, Union, + }, utils::from_plan, }; use std::sync::Arc; @@ -247,6 +249,25 @@ fn limit_push_down( ), } } + ( + LogicalPlan::Sort(Sort { expr, input, fetch }), + Ancestor::FromLimit { + skip: ancestor_skip, + fetch: Some(ancestor_fetch), + .. + }, + ) => { + // Update Sort `fetch`, but simply recurse through children (sort should receive all input for sorting) + let input = push_down_children_limit(_optimizer, _optimizer_config, input)?; + let sort_fetch = ancestor_skip + ancestor_fetch; + let plan = LogicalPlan::Sort(Sort { + expr: expr.clone(), + input: Arc::new(input), + fetch: Some(fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)), + }); + Ok(plan) + } + // For other nodes we can't push down the limit // But try to recurse and find other limit nodes to push down _ => push_down_children_limit(_optimizer, _optimizer_config, plan), @@ -340,6 +361,8 @@ impl OptimizerRule for LimitPushDown { #[cfg(test)] mod test { + use std::vec; + use super::*; use crate::test::*; use datafusion_expr::{ @@ -438,6 +461,44 @@ mod test { Ok(()) } + #[test] + fn limit_push_down_sort() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + // Should push down limit to sort + let expected = "Limit: skip=0, fetch=10\ + \n Sort: #test.a, fetch=10\ + \n TableScan: test"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_skip() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort(vec![col("a")])? + .limit(5, Some(10))? 
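+            // with skip=5 and fetch=10 the Sort must keep skip + fetch = 15 rows,
+            // hence the `fetch=15` in the expected plan below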
+ .build()?; + + // Should push down limit to sort + let expected = "Limit: skip=5, fetch=10\ + \n Sort: #test.a, fetch=15\ + \n TableScan: test"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + #[test] fn multi_stage_limit_recurses_to_deeper_limit() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 0e499ee48ffc..7ec4705b03b6 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -125,6 +125,8 @@ message SelectionNode { message SortNode { LogicalPlanNode input = 1; repeated datafusion.LogicalExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; } message RepartitionNode { diff --git a/datafusion/proto/src/logical_plan.rs b/datafusion/proto/src/logical_plan.rs index 45399554b46d..8f9f8a9c4661 100644 --- a/datafusion/proto/src/logical_plan.rs +++ b/datafusion/proto/src/logical_plan.rs @@ -959,7 +959,7 @@ impl AsLogicalPlan for LogicalPlanNode { ))), }) } - LogicalPlan::Sort(Sort { input, expr }) => { + LogicalPlan::Sort(Sort { input, expr, fetch }) => { let input: protobuf::LogicalPlanNode = protobuf::LogicalPlanNode::try_from_logical_plan( input.as_ref(), @@ -974,6 +974,7 @@ impl AsLogicalPlan for LogicalPlanNode { protobuf::SortNode { input: Some(Box::new(input)), expr: selection_expr, + fetch: fetch.map(|f| f as i64).unwrap_or(-1i64), }, ))), }) From 1261741af2a5e142fa0c7916e759859cc18ea59a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 20 Sep 2022 12:10:07 -0600 Subject: [PATCH 27/30] Bug fix: expr_visitor was not visiting aggregate filter expressions (#3548) --- datafusion/expr/src/expr_visitor.rs | 18 ++++++++--- .../optimizer/src/projection_push_down.rs | 30 +++++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/expr_visitor.rs b/datafusion/expr/src/expr_visitor.rs index 0f2f5077a9a9..3885456cc3a9 100644 --- a/datafusion/expr/src/expr_visitor.rs +++ b/datafusion/expr/src/expr_visitor.rs @@ -176,12 +176,22 @@ impl ExprVisitable for Expr { Ok(visitor) } } - Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } - | Expr::AggregateFunction { args, .. } - | Expr::AggregateUDF { args, .. } => args + Expr::ScalarFunction { args, .. } | Expr::ScalarUDF { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), + Expr::AggregateFunction { args, filter, .. } + | Expr::AggregateUDF { args, filter, .. 
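+            // visit the optional FILTER clause expression as well as the aggregate arguments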
} => { + if let Some(f) = filter { + let mut aggr_exprs = args.clone(); + aggr_exprs.push(f.as_ref().clone()); + aggr_exprs + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor)) + } else { + args.iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor)) + } + } Expr::WindowFunction { args, partition_by, diff --git a/datafusion/optimizer/src/projection_push_down.rs b/datafusion/optimizer/src/projection_push_down.rs index 5a0b38b575ef..86f73c061a50 100644 --- a/datafusion/optimizer/src/projection_push_down.rs +++ b/datafusion/optimizer/src/projection_push_down.rs @@ -532,9 +532,9 @@ mod tests { use crate::test::*; use arrow::datatypes::DataType; use datafusion_expr::{ - col, lit, + col, count, lit, logical_plan::{builder::LogicalPlanBuilder, JoinType}, - max, min, Expr, + max, min, AggregateFunction, Expr, }; use std::collections::HashMap; @@ -990,6 +990,32 @@ mod tests { Ok(()) } + #[test] + fn aggregate_filter_pushdown() -> Result<()> { + let table_scan = test_table_scan()?; + + let aggr_with_filter = Expr::AggregateFunction { + fun: AggregateFunction::Count, + args: vec![col("b")], + distinct: false, + filter: Some(Box::new(col("c").gt(lit(42)))), + }; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate( + vec![col("a")], + vec![count(col("b")), aggr_with_filter.alias("count2")], + )? + .build()?; + + let expected = "Aggregate: groupBy=[[#test.a]], aggr=[[COUNT(#test.b), COUNT(#test.b) FILTER (WHERE #c > Int32(42)) AS count2]]\ + \n TableScan: test projection=[a, b, c]"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let optimized_plan = optimize(plan).expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); From c590d2adcced18e73c0708c9abb371ed446a8879 Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Tue, 20 Sep 2022 14:44:29 -0400 Subject: [PATCH 28/30] Make ParquetScanOptions public and add method to get a reference from ParquetExec --- datafusion/core/src/physical_plan/file_format/mod.rs | 4 +++- datafusion/core/src/physical_plan/file_format/parquet.rs | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 295fef27b291..2926f18a2023 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -30,7 +30,9 @@ mod row_filter; pub(crate) use self::csv::plan_to_csv; pub use self::csv::CsvExec; pub(crate) use self::parquet::plan_to_parquet; -pub use self::parquet::{ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory}; +pub use self::parquet::{ + ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory, ParquetScanOptions, +}; use arrow::{ array::{ArrayData, ArrayRef, DictionaryArray}, buffer::Buffer, diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index a6ce44b22c8d..f4c52e5dc699 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -81,11 +81,13 @@ pub struct ParquetScanOptions { } impl ParquetScanOptions { + /// Set whether to pushdown pruning predicate to the parquet scan pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self { self.pushdown_filters = pushdown_filters; self } + /// Set whether to reorder pruning predicate expressions in order to minimize evaluation 
cost pub fn with_reorder_predicates(mut self, reorder_predicates: bool) -> Self { self.reorder_predicates = reorder_predicates; self @@ -182,6 +184,11 @@ impl ParquetExec { self.scan_options = scan_options; self } + + /// Ref to the `ParquetScanOptions` + pub fn parquet_scan_options(&self) -> &ParquetScanOptions { + &self.scan_options + } } /// Stores metrics about the parquet execution for a particular parquet file. From 3fbb189e83433f879497556fb257866ee6ed126c Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Wed, 21 Sep 2022 10:53:50 -0400 Subject: [PATCH 29/30] Fix issue in type coercion optimizer --- datafusion/core/tests/sql/explain_analyze.rs | 2 +- datafusion/optimizer/src/type_coercion.rs | 21 +++++++++++++- datafusion/proto/src/lib.rs | 30 ++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a75e0e3fa515..f2069126c5ff 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -653,7 +653,7 @@ order by let expected = "\ Sort: #revenue DESC NULLS FIRST\ \n Projection: #customer.c_custkey, #customer.c_name, #SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, #customer.c_acctbal, #nation.n_name, #customer.c_address, #customer.c_phone, #customer.c_comment\ - \n Aggregate: groupBy=[[#customer.c_custkey, #customer.c_name, #customer.c_acctbal, #customer.c_phone, #nation.n_name, #customer.c_address, #customer.c_comment]], aggr=[[SUM(CAST(#lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(#lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)))]]\ + \n Aggregate: groupBy=[[#customer.c_custkey, #customer.c_name, #customer.c_acctbal, #customer.c_phone, #nation.n_name, #customer.c_address, #customer.c_comment]], aggr=[[SUM(CAST(#lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(#lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]]\ \n Inner Join: #customer.c_nationkey = #nation.n_nationkey\ \n Inner Join: #orders.o_orderkey = #lineitem.l_orderkey\ \n Inner Join: #customer.c_custkey = #orders.o_custkey\ diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 9d8e97f36a71..a5195af6a4de 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -76,10 +76,29 @@ impl OptimizerRule for TypeCoercion { const_evaluator, }; + let original_expr_names: Vec> = plan + .expressions() + .iter() + .map(|expr| expr.name().ok()) + .collect(); + let new_expr = plan .expressions() .into_iter() - .map(|expr| expr.rewrite(&mut expr_rewrite)) + .zip(original_expr_names) + .map(|(expr, original_name)| { + let expr = expr.rewrite(&mut expr_rewrite)?; + + if matches!(expr, Expr::AggregateFunction { .. 
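+                    // coercion can rewrite aggregate arguments (e.g. insert casts), which
+                    // changes the auto-generated name; re-alias to keep the original output name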
}) { + if let Some((alias, name)) = original_name.zip(expr.name().ok()) { + if alias != name { + return Ok(expr.alias(&alias)); + } + } + } + + Ok(expr) + }) .collect::>>()?; from_plan(plan, &new_expr, &new_inputs) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index f34da705f94a..e4a28d51dd4d 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -54,6 +54,7 @@ mod roundtrip_tests { logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; use crate::logical_plan::LogicalExtensionCodec; + use arrow::datatypes::Schema; use arrow::{ array::ArrayRef, datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}, @@ -128,6 +129,35 @@ mod roundtrip_tests { Ok(()) } + #[tokio::test] + async fn roundtrip_logical_plan_aggregation() -> Result<(), DataFusionError> { + let ctx = SessionContext::new(); + + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Decimal128(15, 2), true), + ]); + + ctx.register_csv( + "t1", + "testdata/test.csv", + CsvReadOptions::default().schema(&schema), + ) + .await?; + + let query = + "SELECT a, SUM(b + 1) as b_sum FROM t1 GROUP BY a ORDER BY b_sum DESC"; + let plan = ctx.sql(query).await?.to_logical_plan()?; + + println!("{:?}", plan); + + let bytes = logical_plan_to_bytes(&plan)?; + let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); + + Ok(()) + } + #[tokio::test] async fn roundtrip_logical_plan_with_extension() -> Result<(), DataFusionError> { let ctx = SessionContext::new(); From 7f45d3d7c4c4b5badd18eb886a50a863482a45d1 Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Thu, 29 Sep 2022 10:07:01 -0400 Subject: [PATCH 30/30] Fix clippy warnings --- .github/workflows/rust.yml | 102 ------------------ .../file_format/delimited_stream.rs | 2 +- .../core/src/physical_plan/hash_join.rs | 4 +- 3 files changed, 3 insertions(+), 105 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b9a3311a7fb0..4325a0a060d4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -165,108 +165,6 @@ jobs: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres - windows: - name: cargo test (win64) - runs-on: windows-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Install protobuf compiler - shell: bash - run: | - mkdir -p $HOME/d/protoc - cd $HOME/d/protoc - export PROTO_ZIP="protoc-21.4-win64.zip" - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP - unzip $PROTO_ZIP - export PATH=$PATH:$HOME/d/protoc/bin - protoc.exe --version - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. 
- - name: Setup Rust toolchain - run: | - rustup toolchain install stable - rustup default stable - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - export PATH=$PATH:$HOME/d/protoc/bin - cargo test - env: - # do not produce debug symbols to keep memory usage down - RUSTFLAGS: "-C debuginfo=0" - - macos: - name: cargo test (mac) - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Install protobuf compiler - shell: bash - run: | - mkdir -p $HOME/d/protoc - cd $HOME/d/protoc - export PROTO_ZIP="protoc-21.4-osx-x86_64.zip" - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP - unzip $PROTO_ZIP - echo "$HOME/d/protoc/bin" >> $GITHUB_PATH - export PATH=$PATH:$HOME/d/protoc/bin - protoc --version - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. - - name: Setup Rust toolchain - run: | - rustup toolchain install stable - rustup default stable - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - cargo test - env: - # do not produce debug symbols to keep memory usage down - RUSTFLAGS: "-C debuginfo=0" - - test-datafusion-pyarrow: - name: cargo test pyarrow (amd64) - needs: [linux-build-lib] - runs-on: ubuntu-latest - container: - image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - name: Install PyArrow - run: | - echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - python -m pip install pyarrow - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable - - name: Run tests - run: | - cd datafusion - cargo test --features=pyarrow - lint: name: Lint runs-on: ubuntu-latest diff --git a/datafusion/core/src/physical_plan/file_format/delimited_stream.rs b/datafusion/core/src/physical_plan/file_format/delimited_stream.rs index 1b6e30a9464e..68878da08670 100644 --- a/datafusion/core/src/physical_plan/file_format/delimited_stream.rs +++ b/datafusion/core/src/physical_plan/file_format/delimited_stream.rs @@ -68,7 +68,7 @@ impl LineDelimiter { } else if *is_quote { None } else { - (*v == NEWLINE).then(|| idx + 1) + (*v == NEWLINE).then_some(idx + 1) } }); diff --git a/datafusion/core/src/physical_plan/hash_join.rs b/datafusion/core/src/physical_plan/hash_join.rs index 8cb7445a24dd..a22bcbc13c7e 100644 --- a/datafusion/core/src/physical_plan/hash_join.rs +++ b/datafusion/core/src/physical_plan/hash_join.rs @@ -1240,12 +1240,12 @@ fn produce_from_matched( let indices = if unmatched { UInt64Array::from_iter_values( (0..visited_left_side.len()) - .filter_map(|v| (!visited_left_side.get_bit(v)).then(|| v as u64)), + .filter_map(|v| (!visited_left_side.get_bit(v)).then_some(v as u64)), ) } else { UInt64Array::from_iter_values( (0..visited_left_side.len()) - .filter_map(|v| (visited_left_side.get_bit(v)).then(|| v as u64)), + .filter_map(|v| (visited_left_side.get_bit(v)).then_some(v as u64)), ) };
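
A minimal usage sketch (not part of the patches above; `sort_exprs`, `input`, and
`task_ctx` are assumed to be set up as in the sort.rs tests) showing how the new
`fetch` parameter on `SortExec` composes:

    // sort `input` by `sort_exprs` but produce at most the first 10 rows,
    // mirroring what the limit push-down now plans for `ORDER BY ... LIMIT 10`
    let sort_exec = Arc::new(SortExec::try_new(sort_exprs, input, Some(10))?);
    let results = collect(sort_exec, task_ctx).await?;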