From f3bb84f7ce8a08e40afa14db525b98e69fd4bda4 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Wed, 14 Sep 2022 23:04:48 +0800 Subject: [PATCH 01/30] inlist: move type coercion to logical phase (#3472) --- datafusion/core/src/physical_plan/planner.rs | 32 ++++--- datafusion/optimizer/src/type_coercion.rs | 94 ++++++++++++++++++- .../physical-expr/src/expressions/in_list.rs | 60 +++++++++++- datafusion/physical-expr/src/planner.rs | 54 +---------- 4 files changed, 171 insertions(+), 69 deletions(-) diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 40109eda2f11..c87590059625 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1696,9 +1696,11 @@ mod tests { async fn plan(logical_plan: &LogicalPlan) -> Result> { let mut session_state = make_session_state(); session_state.config.target_partitions = 4; + // optimize the logical plan + let logical_plan = session_state.optimize(logical_plan)?; let planner = DefaultPhysicalPlanner::default(); planner - .create_physical_plan(logical_plan, &session_state) + .create_physical_plan(&logical_plan, &session_state) .await } @@ -1714,12 +1716,12 @@ mod tests { .limit(3, Some(10))? .build()?; - let plan = plan(&logical_plan).await?; + let exec_plan = plan(&logical_plan).await?; // verify that the plan correctly casts u8 to i64 // the cast here is implicit so has CastOptions with safe=true - let expected = "BinaryExpr { left: Column { name: \"c7\", index: 6 }, op: Lt, right: TryCastExpr { expr: Literal { value: UInt8(5) }, cast_type: Int64 } }"; - assert!(format!("{:?}", plan).contains(expected)); + let expected = "BinaryExpr { left: Column { name: \"c7\", index: 2 }, op: Lt, right: Literal { value: Int64(5) } }"; + assert!(format!("{:?}", exec_plan).contains(expected)); Ok(()) } @@ -1821,8 +1823,7 @@ mod tests { async fn test_with_zero_offset_plan() -> Result<()> { let logical_plan = test_csv_scan().await?.limit(0, None)?.build()?; let plan = plan(&logical_plan).await?; - assert!(format!("{:?}", plan).contains("GlobalLimitExec")); - assert!(format!("{:?}", plan).contains("skip: 0")); + assert!(format!("{:?}", plan).contains("limit: None")); Ok(()) } @@ -1952,8 +1953,8 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? .build()?; let execution_plan = plan(&logical_plan).await?; - // verify that the plan correctly adds cast from Int64(1) to Utf8 - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }], negated: false, set: None }"; + // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }], negated: false, set: None }"; assert!(format!("{:?}", execution_plan).contains(expected)); // expression: "a in (struct::null, 'a')" @@ -1965,10 +1966,9 @@ mod tests { .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c12").lt_eq(lit(0.025)).in_list(list, false)])? 
.build()?; - let execution_plan = plan(&logical_plan).await; + let e = plan(&logical_plan).await.unwrap_err().to_string(); - let e = execution_plan.unwrap_err().to_string(); - assert_contains!(&e, "Can not find compatible types to compare Boolean with [Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }]), Utf8]"); + assert_contains!(&e, "The data type inlist should be same, the value type is Boolean, one of list expr type is Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }])"); Ok(()) } @@ -1996,7 +1996,10 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? .build()?; let execution_plan = plan(&logical_plan).await?; - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: Some(InSet { set: "; + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") },"; + assert!(format!("{:?}", execution_plan).contains(expected)); + let expected = + "Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet { set: "; assert!(format!("{:?}", execution_plan).contains(expected)); Ok(()) } @@ -2015,7 +2018,10 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? 
.build()?; let execution_plan = plan(&logical_plan).await?; - let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [TryCastExpr { expr: Literal { value: Int64(NULL) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: Some(InSet {"; + let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(NULL) }, Literal { value: Utf8(\"1\") }, Literal { value: Utf8(\"2\") }"; + assert!(format!("{:?}", execution_plan).contains(expected)); + let expected = + "Literal { value: Utf8(\"30\") }], negated: false, set: Some(InSet"; assert!(format!("{:?}", execution_plan).contains(expected)); Ok(()) } diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 72ee5d19adcd..6cb711dfc87c 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -124,7 +124,6 @@ impl ExprRewriter for TypeCoercionRewriter<'_> { right.clone().cast_to(&coerced_type, &self.schema)?, ), }; - expr.rewrite(&mut self.const_evaluator) } } @@ -164,11 +163,61 @@ impl ExprRewriter for TypeCoercionRewriter<'_> { }; expr.rewrite(&mut self.const_evaluator) } + Expr::InList { + expr, + list, + negated, + } => { + let expr_data_type = expr.get_type(&self.schema)?; + let list_data_types = list + .iter() + .map(|list_expr| list_expr.get_type(&self.schema)) + .collect::>>()?; + let result_type = + get_coerce_type_for_list(&expr_data_type, &list_data_types); + match 
result_type { + None => Err(DataFusionError::Plan(format!( + "Can not find compatible types to compare {:?} with {:?}", + expr_data_type, list_data_types + ))), + Some(coerced_type) => { + // find the coerced type + let cast_expr = expr.cast_to(&coerced_type, &self.schema)?; + let cast_list_expr = list + .into_iter() + .map(|list_expr| { + list_expr.cast_to(&coerced_type, &self.schema) + }) + .collect::>>()?; + let expr = Expr::InList { + expr: Box::new(cast_expr), + list: cast_list_expr, + negated, + }; + expr.rewrite(&mut self.const_evaluator) + } + } + } expr => Ok(expr), } } } +/// Attempts to coerce the types of `list_types` to be comparable with the +/// `expr_type`. +/// Returns the common data type for `expr_type` and `list_types` +fn get_coerce_type_for_list( + expr_type: &DataType, + list_types: &[DataType], +) -> Option { + list_types + .iter() + .fold(Some(expr_type.clone()), |left, right_type| match left { + None => None, + Some(left_type) => comparison_coercion(&left_type, right_type), + }) +} + /// Returns `expressions` coerced to types compatible with /// `signature`, if possible. /// @@ -348,6 +397,49 @@ mod test { Ok(()) } + #[test] + fn inlist_case() -> Result<()> { + // a in (1,4,8), a is int64 + let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); + let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new(None, "a", DataType::Int64, true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ), + })); + let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty, None)?); + let rule = TypeCoercion::new(); + let mut config = OptimizerConfig::default(); + let plan = rule.optimize(&plan, &mut config)?; + assert_eq!( + "Projection: #a IN ([Int64(1), Int64(4), Int64(8)])\n EmptyRelation", + &format!("{:?}", plan) + ); + // a in (1,4,8), a is decimal + let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); + let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new(None, "a", DataType::Decimal128(12, 4), true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ), + })); + let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty, None)?); + let plan = rule.optimize(&plan, &mut config)?; + assert_eq!( + "Projection: CAST(#a AS Decimal128(24, 4)) IN ([Decimal128(Some(10000),24,4), Decimal128(Some(40000),24,4), Decimal128(Some(80000),24,4)])\n EmptyRelation", + &format!("{:?}", plan) + ); + Ok(()) + } + fn empty() -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 1848473165ce..daad48193408 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -960,6 +960,17 @@ pub fn in_list( negated: &bool, schema: &Schema, ) -> Result> { + // check the data type + let expr_data_type = expr.data_type(schema)?; + for list_expr in list.iter() { + let list_expr_data_type = list_expr.data_type(schema)?; + if !expr_data_type.eq(&list_expr_data_type) { + return Err(DataFusionError::Internal(format!( + "The data type inlist should be same, the value type is {}, one of list expr type is {}", + expr_data_type, list_expr_data_type + ))); + } + } Ok(Arc::new(InListExpr::new(expr, list, 
*negated, schema))) } @@ -969,9 +980,54 @@ mod tests { use super::*; use crate::expressions; - use crate::expressions::{col, lit}; - use crate::planner::in_list_cast; + use crate::expressions::{col, lit, try_cast}; use datafusion_common::Result; + use datafusion_expr::binary_rule::comparison_coercion; + + type InListCastResult = (Arc, Vec>); + + // Try to do the type coercion for list physical expr. + // It's just used in the test + fn in_list_cast( + expr: Arc, + list: Vec>, + input_schema: &Schema, + ) -> Result { + let expr_type = &expr.data_type(input_schema)?; + let list_types: Vec = list + .iter() + .map(|list_expr| list_expr.data_type(input_schema).unwrap()) + .collect(); + let result_type = get_coerce_type(expr_type, &list_types); + match result_type { + None => Err(DataFusionError::Plan(format!( + "Can not find compatible types to compare {:?} with {:?}", + expr_type, list_types + ))), + Some(data_type) => { + // find the coerced type + let cast_expr = try_cast(expr, input_schema, data_type.clone())?; + let cast_list_expr = list + .into_iter() + .map(|list_expr| { + try_cast(list_expr, input_schema, data_type.clone()).unwrap() + }) + .collect(); + Ok((cast_expr, cast_list_expr)) + } + } + } + + // Attempts to coerce the types of `list_type` to be comparable with the + // `expr_type` + fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { + list_type + .iter() + .fold(Some(expr_type.clone()), |left, right_type| match left { + None => None, + Some(left_type) => comparison_coercion(&left_type, right_type), + }) + } // applies the in_list expr to an input batch and list macro_rules! in_list { diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 7e31dea0acb2..ba9664ef653d 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expressions::try_cast; use crate::var_provider::is_system_variables; use crate::{ execution_props::ExecutionProps, @@ -28,7 +27,6 @@ use crate::{ }; use arrow::datatypes::{DataType, Schema}; use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue}; -use datafusion_expr::binary_rule::comparison_coercion; use datafusion_expr::{binary_expr, Expr, Operator}; use std::sync::Arc; @@ -410,10 +408,7 @@ pub fn create_physical_expr( ) }) .collect::>>()?; - - let (cast_expr, cast_list_exprs) = - in_list_cast(value_expr, list_exprs, input_schema)?; - expressions::in_list(cast_expr, cast_list_exprs, negated, input_schema) + expressions::in_list(value_expr, list_exprs, negated, input_schema) } }, other => Err(DataFusionError::NotImplemented(format!( @@ -422,50 +417,3 @@ pub fn create_physical_expr( ))), } } - -type InListCastResult = (Arc, Vec>); - -pub(crate) fn in_list_cast( - expr: Arc, - list: Vec>, - input_schema: &Schema, -) -> Result { - let expr_type = &expr.data_type(input_schema)?; - let list_types: Vec = list - .iter() - .map(|list_expr| list_expr.data_type(input_schema).unwrap()) - .collect(); - let result_type = get_coerce_type(expr_type, &list_types); - match result_type { - None => Err(DataFusionError::Plan(format!( - "Can not find compatible types to compare {:?} with {:?}", - expr_type, list_types - ))), - Some(data_type) => { - // find the coerced type - let cast_expr = try_cast(expr, input_schema, data_type.clone())?; - let cast_list_expr = list - .into_iter() - .map(|list_expr| { - try_cast(list_expr, input_schema, data_type.clone()).unwrap() - }) - .collect(); - Ok((cast_expr, cast_list_expr)) - } - } -} - -/// Attempts to coerce the types of `list_type` to be comparable with the -/// `expr_type` -fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { - // get the equal coerced data type - list_type - .iter() - .fold(Some(expr_type.clone()), |left, right_type| { - match left { - None => None, - // TODO refactor a framework to do the data type coercion - Some(left_type) => comparison_coercion(&left_type, right_type), - } - }) -} From 916654032c02fb5697ab5622b3ce444e97fc64d2 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Thu, 15 Sep 2022 01:49:50 +0800 Subject: [PATCH 02/30] [minor] Remove unused arg in macro in Inlist (#3474) * remove unused arg in macro in Inlist * fix fmt --- .../physical-expr/src/expressions/in_list.rs | 90 ++++--------------- 1 file changed, 19 insertions(+), 71 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index daad48193408..ae49b4cec4ac 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -129,7 +129,7 @@ macro_rules! make_contains_primitive { } macro_rules! set_contains_for_float { - ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr, $PHY_TYPE:ty) => {{ + ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{ let contains_null = $SET_VALUES.iter().any(|s| s.is_null()); let bool_array = if $NEGATED { // Not in @@ -173,7 +173,7 @@ macro_rules! set_contains_for_float { } macro_rules! 
set_contains_for_primitive { - ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr, $PHY_TYPE:ty) => {{ + ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{ let contains_null = $SET_VALUES.iter().any(|s| s.is_null()); let native_set = $SET_VALUES .iter() @@ -538,59 +538,28 @@ impl PhysicalExpr for InListExpr { array, set, Boolean, - self.negated, - bool + self.negated )) } DataType::Int8 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int8, - self.negated, - i8 - )) + Ok(set_contains_for_primitive!(array, set, Int8, self.negated)) } DataType::Int16 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int16, - self.negated, - i16 - )) + Ok(set_contains_for_primitive!(array, set, Int16, self.negated)) } DataType::Int32 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int32, - self.negated, - i32 - )) + Ok(set_contains_for_primitive!(array, set, Int32, self.negated)) } DataType::Int64 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - Int64, - self.negated, - i64 - )) + Ok(set_contains_for_primitive!(array, set, Int64, self.negated)) } DataType::UInt8 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_primitive!( - array, - set, - UInt8, - self.negated, - u8 - )) + Ok(set_contains_for_primitive!(array, set, UInt8, self.negated)) } DataType::UInt16 => { let array = array.as_any().downcast_ref::().unwrap(); @@ -598,8 +567,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt16, - self.negated, - u16 + self.negated )) } DataType::UInt32 => { @@ -608,8 +576,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt32, - self.negated, - u32 + self.negated )) } DataType::UInt64 => { @@ -618,8 +585,7 @@ impl PhysicalExpr for InListExpr { array, set, UInt64, - self.negated, - u64 + self.negated )) } DataType::Date32 => { @@ -628,8 +594,7 @@ impl PhysicalExpr for InListExpr { array, set, Date32, - self.negated, - i32 + self.negated )) } DataType::Date64 => { @@ -638,29 +603,16 @@ impl PhysicalExpr for InListExpr { array, set, Date64, - self.negated, - i64 + self.negated )) } DataType::Float32 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_float!( - array, - set, - Float32, - self.negated, - f32 - )) + Ok(set_contains_for_float!(array, set, Float32, self.negated)) } DataType::Float64 => { let array = array.as_any().downcast_ref::().unwrap(); - Ok(set_contains_for_float!( - array, - set, - Float64, - self.negated, - f64 - )) + Ok(set_contains_for_float!(array, set, Float64, self.negated)) } DataType::Utf8 => { let array = array @@ -704,8 +656,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampSecond, - self.negated, - i64 + self.negated )) } TimeUnit::Millisecond => { @@ -717,8 +668,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampMillisecond, - self.negated, - i64 + self.negated )) } TimeUnit::Microsecond => { @@ -730,8 +680,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampMicrosecond, - self.negated, - i64 + self.negated )) } TimeUnit::Nanosecond => { @@ -743,8 +692,7 @@ impl PhysicalExpr for InListExpr { array, set, TimestampNanosecond, - self.negated, - i64 + self.negated )) } }, From 9d028b3ac3e1d9e702af088a7620826c58f3c115 Mon Sep 17 00:00:00 2001 From: Morgan Cassels Date: Wed, 14 Sep 2022 10:52:25 -0700 Subject: [PATCH 
03/30] add FixedSizeBinary support to create_hashes (#3458) * add FixedSizeBinary support to create_hashes * remove equality and inequality assertions about FixedSizeBinary hashes because feature force_hash_collisions changes hash values Co-authored-by: Morgan Cassels --- .../core/src/physical_plan/hash_utils.rs | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 315c3db3bf31..61e72b07c551 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -21,10 +21,10 @@ use crate::error::{DataFusionError, Result}; use ahash::RandomState; use arrow::array::{ Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array, - DictionaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + DictionaryArray, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::datatypes::{ ArrowDictionaryKeyType, ArrowNativeType, DataType, Int16Type, Int32Type, Int64Type, @@ -529,6 +529,16 @@ pub fn create_hashes<'a>( multi_col ); } + DataType::FixedSizeBinary(_) => { + hash_array!( + FixedSizeBinaryArray, + col, + &[u8], + hashes_buffer, + random_state, + multi_col + ); + } DataType::LargeBinary => { hash_array!( LargeBinaryArray, @@ -627,7 +637,7 @@ pub fn create_hashes<'a>( mod tests { use crate::from_slice::FromSlice; use arrow::{ - array::{BinaryArray, DictionaryArray}, + array::{BinaryArray, DictionaryArray, FixedSizeBinaryArray}, datatypes::Int8Type, }; use std::sync::Arc; @@ -682,6 +692,21 @@ mod tests { Ok(()) } + #[test] + fn create_hashes_fixed_size_binary() -> Result<()> { + let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]]; + let fixed_size_binary_array = + Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap()); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; fixed_size_binary_array.len()]; + let hashes = + create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 3,); + + Ok(()) + } + #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] From 5f029cc73755b6800217e370ebe0a7b5e8a6a224 Mon Sep 17 00:00:00 2001 From: Nitish Tiwari Date: Thu, 15 Sep 2022 02:24:13 +0530 Subject: [PATCH 04/30] Add Parseable as Datafusion user (#3471) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b468089e264a..cd92f71eaa8f 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Here are some of the projects known to use DataFusion: - [delta-rs](https://github.com/delta-io/delta-rs) Native Rust implementation of Delta Lake - [Flock](https://github.com/flock-lab/flock) - [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database +- [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform - [qv](https://github.com/timvw/qv) Quickly view your data - 
[ROAPI](https://github.com/roapi/roapi) - [Tensorbase](https://github.com/tensorbase/tensorbase) From 06f01eaf65b78d73ffc7fb0bf95619e1e884e850 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Thu, 15 Sep 2022 08:56:22 +0800 Subject: [PATCH 05/30] change the null type in the row filter (#3470) --- .../physical_plan/file_format/row_filter.rs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/row_filter.rs b/datafusion/core/src/physical_plan/file_format/row_filter.rs index 56bdba5577e9..473856b7b3d0 100644 --- a/datafusion/core/src/physical_plan/file_format/row_filter.rs +++ b/datafusion/core/src/physical_plan/file_format/row_filter.rs @@ -19,7 +19,7 @@ use arrow::array::{Array, BooleanArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::{ArrowError, Result as ArrowResult}; use arrow::record_batch::RecordBatch; -use datafusion_common::{Column, Result, ScalarValue, ToDFSchema}; +use datafusion_common::{Column, DataFusionError, Result, ScalarValue, ToDFSchema}; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}; use datafusion_expr::{uncombine_filter, Expr}; @@ -202,7 +202,18 @@ impl<'a> ExprRewriter for FilterCandidateBuilder<'a> { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::Column(Column { name, .. }) = &expr { if self.file_schema.field_with_name(name).is_err() { - return Ok(Expr::Literal(ScalarValue::Null)); + // the column expr must be in the table schema + return match self.table_schema.field_with_name(name) { + Ok(field) => { + // return the null value corresponding to the data type + let null_value = ScalarValue::try_from(field.data_type())?; + Ok(Expr::Literal(null_value)) + } + Err(e) => { + // If the column is not in the table schema, should throw the error + Err(DataFusionError::ArrowError(e)) + } + }; } } @@ -314,12 +325,13 @@ mod test { use crate::physical_plan::file_format::row_filter::FilterCandidateBuilder; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::ScalarValue; - use datafusion_expr::{col, lit}; + use datafusion_expr::{cast, col, lit}; use parquet::arrow::parquet_to_arrow_schema; use parquet::file::reader::{FileReader, SerializedFileReader}; // Assume a column expression for a column not in the table schema is a projected column and ignore it #[test] + #[should_panic(expected = "building candidate failed")] fn test_filter_candidate_builder_ignore_projected_columns() { let testdata = crate::test_util::parquet_test_data(); let file = std::fs::File::open(&format!("{}/alltypes_plain.parquet", testdata)) @@ -337,7 +349,7 @@ mod test { let candidate = FilterCandidateBuilder::new(expr, &table_schema, &table_schema) .build(metadata) - .expect("building candidate"); + .expect("building candidate failed"); assert!(candidate.is_none()); } @@ -386,8 +398,10 @@ mod test { Field::new("float_col", DataType::Float32, true), ]); - let expr = col("bigint_col").eq(col("int_col")); - let expected_candidate_expr = col("bigint_col").eq(lit(ScalarValue::Null)); + // The parquet file with `file_schema` just has `bigint_col` and `float_col` column, and don't have the `int_col` + let expr = col("bigint_col").eq(cast(col("int_col"), DataType::Int64)); + let expected_candidate_expr = + col("bigint_col").eq(cast(lit(ScalarValue::Int32(None)), DataType::Int64)); let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) .build(metadata) From f52d8aff2e7975403699268225bca0a9efbe7076 Mon Sep 17 00:00:00 2001 From: 
Andrew Lamb Date: Wed, 14 Sep 2022 21:44:35 -0400 Subject: [PATCH 06/30] minor: fix bug in `downcast_value!` macro (#3486) --- datafusion/common/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 2992596c564c..d9fd2503e958 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -43,10 +43,10 @@ macro_rules! downcast_value { })? }}; ($Value: expr, $Type: ident, $T: tt) => {{ - $Value.as_any().downcast_ref::<$Type>().ok_or_else(|| { + $Value.as_any().downcast_ref::<$Type<$T>>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", - type_name::<$Type>() + type_name::<$Type<$T>>() )) })? }}; From 84bee899958aaf70372ef84811c6787f53fa25eb Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 14 Sep 2022 21:44:57 -0400 Subject: [PATCH 07/30] make the function from_proto_binary_op public (#3490) Co-authored-by: askoa --- datafusion/proto/src/from_proto.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index 5402b03ce330..b82237788a50 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -1515,7 +1515,7 @@ fn unwrap_timezone(proto_value: &str) -> Option { } } -fn from_proto_binary_op(op: &str) -> Result { +pub fn from_proto_binary_op(op: &str) -> Result { match op { "And" => Ok(Operator::And), "Or" => Ok(Operator::Or), From 6ffda6885c7acbbddeb5d336116b035f9a9fa30b Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 15 Sep 2022 08:43:22 -0400 Subject: [PATCH 08/30] Add BitwiseXor in function from_proto_binary_op (#3496) --- datafusion/proto/src/from_proto.rs | 1 + datafusion/proto/src/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index b82237788a50..8c0124b9906b 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -1536,6 +1536,7 @@ pub fn from_proto_binary_op(op: &str) -> Result { "IsNotDistinctFrom" => Ok(Operator::IsNotDistinctFrom), "BitwiseAnd" => Ok(Operator::BitwiseAnd), "BitwiseOr" => Ok(Operator::BitwiseOr), + "BitwiseXor" => Ok(Operator::BitwiseXor), "BitwiseShiftLeft" => Ok(Operator::BitwiseShiftLeft), "BitwiseShiftRight" => Ok(Operator::BitwiseShiftRight), "RegexIMatch" => Ok(Operator::RegexIMatch), diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index cce778be2e24..ce2678025a88 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -858,6 +858,7 @@ mod roundtrip_tests { test(Operator::BitwiseShiftLeft); test(Operator::BitwiseAnd); test(Operator::BitwiseOr); + test(Operator::BitwiseXor); test(Operator::IsDistinctFrom); test(Operator::IsNotDistinctFrom); test(Operator::And); From 331922036a23265084ab091fa2ab5c57da42c841 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 16 Sep 2022 01:04:17 +0800 Subject: [PATCH 09/30] add time_zone into ConfigOptions (#3485) * add time_zone into ConfigOptions * fix debug leftover * Update datafusion/core/src/config.rs Co-authored-by: Kun Liu Co-authored-by: Kun Liu --- datafusion/core/src/config.rs | 34 ++++++++++++++++- .../core/tests/sql/information_schema.rs | 38 +++++++++++++++++++ datafusion/sql/src/planner.rs | 7 +++- 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/config.rs 
b/datafusion/core/src/config.rs index 3c828079e241..00ccdd523c1d 100644 --- a/datafusion/core/src/config.rs +++ b/datafusion/core/src/config.rs @@ -47,6 +47,9 @@ pub const OPT_COALESCE_TARGET_BATCH_SIZE: &str = pub const OPT_OPTIMIZER_SKIP_FAILED_RULES: &str = "datafusion.optimizer.skip_failed_rules"; +/// Configuration option "datafusion.execution.time_zone" +pub const OPT_TIME_ZONE: &str = "datafusion.execution.time_zone"; + /// Definition of a configuration option pub struct ConfigDefinition { /// key used to identifier this configuration option @@ -102,6 +105,20 @@ impl ConfigDefinition { ScalarValue::UInt64(Some(default_value)), ) } + + /// Create a configuration option definition with a string value + pub fn new_string( + key: impl Into, + description: impl Into, + default_value: String, + ) -> Self { + Self::new( + key, + description, + DataType::Utf8, + ScalarValue::Utf8(Some(default_value)), + ) + } } /// Contains definitions for all built-in configuration options @@ -167,7 +184,14 @@ impl BuiltInConfigs { messages if any optimization rules produce errors and then proceed to the next \ rule. When set to false, any rules that produce errors will cause the query to fail.", true - )], + ), + ConfigDefinition::new_string( + OPT_TIME_ZONE, + "The session time zone which some function require \ + e.g. EXTRACT(HOUR from SOME_TIME) shift the underline datetime according to the time zone, + then extract the hour", + "UTC".into() + )] } } @@ -279,6 +303,14 @@ impl ConfigOptions { } } + /// get a string configuration option + pub fn get_string(&self, key: &str) -> String { + match self.get(key) { + Some(ScalarValue::Utf8(Some(s))) => s, + _ => "".into(), + } + } + /// Access the underlying hashmap pub fn options(&self) -> &HashMap { &self.options diff --git a/datafusion/core/tests/sql/information_schema.rs b/datafusion/core/tests/sql/information_schema.rs index c9a5c9aa9cd5..854166155161 100644 --- a/datafusion/core/tests/sql/information_schema.rs +++ b/datafusion/core/tests/sql/information_schema.rs @@ -703,3 +703,41 @@ async fn show_all() { .len(); assert_eq!(expected_length, results[0].num_rows()); } + +#[tokio::test] +async fn show_time_zone_default_utc() { + // https://github.com/apache/arrow-datafusion/issues/3255 + let ctx = + SessionContext::with_config(SessionConfig::new().with_information_schema(true)); + let sql = "SHOW TIME ZONE"; + let results = plan_and_collect(&ctx, sql).await.unwrap(); + + let expected = vec![ + "+--------------------------------+---------+", + "| name | setting |", + "+--------------------------------+---------+", + "| datafusion.execution.time_zone | UTC |", + "+--------------------------------+---------+", + ]; + + assert_batches_eq!(expected, &results); +} + +#[tokio::test] +async fn show_timezone_default_utc() { + // https://github.com/apache/arrow-datafusion/issues/3255 + let ctx = + SessionContext::with_config(SessionConfig::new().with_information_schema(true)); + let sql = "SHOW TIMEZONE"; + let results = plan_and_collect(&ctx, sql).await.unwrap(); + + let expected = vec![ + "+--------------------------------+---------+", + "| name | setting |", + "+--------------------------------+---------+", + "| datafusion.execution.time_zone | UTC |", + "+--------------------------------+---------+", + ]; + + assert_batches_eq!(expected, &results); +} diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 0338d713a3c6..04518d81e457 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -2386,8 +2386,13 @@ 
impl<'a, S: ContextProvider> SqlToRel<'a, S> { )); } - let query = if variable.to_lowercase() == "all" { + let variable_lower = variable.to_lowercase(); + + let query = if variable_lower == "all" { String::from("SELECT name, setting FROM information_schema.df_settings") + } else if variable_lower == "timezone" || variable_lower == "time.zone" { + // we could introduce alias in OptionDefinition if this string matching thing grows + String::from("SELECT name, setting FROM information_schema.df_settings WHERE name = 'datafusion.execution.time_zone'") } else { format!( "SELECT name, setting FROM information_schema.df_settings WHERE name = '{}'", From 66dd253d2e0cdd00c8d3611f2ca470b9cde48abc Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 16 Sep 2022 15:55:44 +0800 Subject: [PATCH 10/30] MINOR: remove unused dependencies (#3508) * MINOR: remove unused dependencies Signed-off-by: Ruihang Xia * revert rand * ad newline Signed-off-by: Ruihang Xia Signed-off-by: Ruihang Xia --- datafusion/common/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 72ffbb22b144..028a4874a8bc 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -46,5 +46,4 @@ object_store = { version = "0.5.0", optional = true } ordered-float = "3.0" parquet = { version = "22.0.0", features = ["arrow"], optional = true } pyo3 = { version = "0.17.1", optional = true } -serde_json = "1.0" sqlparser = "0.23" From 25c029c9ec275f53edf29189a0c94a88ad0c9618 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Sep 2022 04:45:16 -0600 Subject: [PATCH 11/30] prettier (#3504) --- dev/release/README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/dev/release/README.md b/dev/release/README.md index 3dadf615b85e..06023ce08c3f 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -307,21 +307,20 @@ dot -Tsvg dev/release/crate-deps.dot > dev/release/crate-deps.svg (cd datafusion/optimizer && cargo publish) (cd datafusion/core && cargo publish) (cd datafusion/proto && cargo publish) -(cd datafusion-cli && cargo publish) ``` -### Publish datafusion-cli on Homebrew and crates.io +The CLI needs a `--no-verify` argument because `build.rs` generates source into the `src` directory. + +```shell +(cd datafusion-cli && cargo publish --no-verify) +``` + +### Publish datafusion-cli on Homebrew For Homebrew, Send a simple PR to update tag and commit hash for the datafusion formula in homebrew-core. Here is an example PR: https://github.com/Homebrew/homebrew-core/pull/89562. -For crates.io, run - -```shell -(cd datafusion-cli && cargo publish) -``` - ### Call the vote Call the vote on the Arrow dev list by replying to the RC voting thread. The @@ -389,7 +388,17 @@ Delete a release: svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist/release/arrow/arrow-datafusion-7.0.0 ``` -### Write a blog post announcing the release +### Publish the User Guide to the Arrow Site + +- Run the `build.sh` in the `docs` directory from the release tarball. +- Clone the [arrow-site](https://github.com/apache/arrow-site) repository +- Checkout the `asf-site` branch +- Copy content from `docs/build/html/*` to the `datafusion` directory in arrow-site +- Create a PR against the `asf-site` branch ([example](https://github.com/apache/arrow-site/pull/237)) +- Once the PR is merged, the content will be published to https://arrow.apache.org/datafusion/ by GitHub Pages (this + can take some time). 
+ +### Optional: Write a blog post announcing the release We typically crowdsource release announcements by collaborating on a Google document, usually starting with a copy of the previous release announcement. From c59a76754e7e45063cb51c273f2a53aa325676a4 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 16 Sep 2022 10:03:08 -0400 Subject: [PATCH 12/30] Add additional DATE_PART units (#3503) * Add dow support to date_part * Add doy support to date_part * Add quarter support to date_part --- datafusion/core/tests/sql/expr.rs | 15 +++++++++++++++ .../physical-expr/src/datetime_expressions.rs | 3 +++ 2 files changed, 18 insertions(+) diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 3ca2c4738f45..a5288d92a464 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -1233,6 +1233,11 @@ async fn test_extract_date_part() -> Result<()> { "EXTRACT(year FROM to_timestamp('2020-09-08T12:00:00+00:00'))", "2020" ); + test_expression!("date_part('QUARTER', CAST('2000-01-01' AS DATE))", "1"); + test_expression!( + "EXTRACT(quarter FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "3" + ); test_expression!("date_part('MONTH', CAST('2000-01-01' AS DATE))", "1"); test_expression!( "EXTRACT(month FROM to_timestamp('2020-09-08T12:00:00+00:00'))", @@ -1248,6 +1253,16 @@ async fn test_extract_date_part() -> Result<()> { "EXTRACT(day FROM to_timestamp('2020-09-08T12:00:00+00:00'))", "8" ); + test_expression!("date_part('DOY', CAST('2000-01-01' AS DATE))", "1"); + test_expression!( + "EXTRACT(doy FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "252" + ); + test_expression!("date_part('DOW', CAST('2000-01-01' AS DATE))", "6"); + test_expression!( + "EXTRACT(dow FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "2" + ); test_expression!("date_part('HOUR', CAST('2000-01-01' AS DATE))", "0"); test_expression!( "EXTRACT(hour FROM to_timestamp('2020-09-08T12:03:03+00:00'))", diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index a702273790f3..0214d1bfb9be 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -449,9 +449,12 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { let arr = match date_part.to_lowercase().as_str() { "year" => extract_date_part!(array, temporal::year), + "quarter" => extract_date_part!(array, temporal::quarter), "month" => extract_date_part!(array, temporal::month), "week" => extract_date_part!(array, temporal::week), "day" => extract_date_part!(array, temporal::day), + "doy" => extract_date_part!(array, temporal::doy), + "dow" => extract_date_part!(array, temporal::num_days_from_sunday), "hour" => extract_date_part!(array, temporal::hour), "minute" => extract_date_part!(array, temporal::minute), "second" => extract_date_part!(array, temporal::second), From 4380f355e4102e937b26d13645891bf69deca995 Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Fri, 16 Sep 2022 14:37:12 -0400 Subject: [PATCH 13/30] Make FileStream and FileOpener public (#3514) --- datafusion/core/src/physical_plan/file_format/file_stream.rs | 5 +++++ datafusion/core/src/physical_plan/file_format/mod.rs | 1 + 2 files changed, 6 insertions(+) diff --git a/datafusion/core/src/physical_plan/file_format/file_stream.rs b/datafusion/core/src/physical_plan/file_format/file_stream.rs index 1a54d685228c..609e3b3a9b20 100644 --- 
a/datafusion/core/src/physical_plan/file_format/file_stream.rs +++ b/datafusion/core/src/physical_plan/file_format/file_stream.rs @@ -51,7 +51,11 @@ use crate::physical_plan::RecordBatchStream; pub type FileOpenFuture = BoxFuture<'static, Result>>>; +/// Generic API for opening a file using an [`ObjectStore`] and resolving to a +/// stream of [`RecordBatch`] pub trait FileOpener: Unpin { + /// Asynchronously open the specified file and return a stream + /// of [`RecordBatch`] fn open( &self, store: Arc, @@ -167,6 +171,7 @@ impl FileStreamMetrics { } impl FileStream { + /// Create a new `FileStream` using the give `FileOpener` to scan underlying files pub fn new( config: &FileScanConfig, partition: usize, diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 707ed039c26c..2d59570078bc 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -39,6 +39,7 @@ use arrow::{ record_batch::RecordBatch, }; pub use avro::AvroExec; +pub use file_stream::{FileOpenFuture, FileOpener, FileStream}; pub(crate) use json::plan_to_json; pub use json::NdJsonExec; From c6ebf7bcc73067fa285af847da3bf0c5c04c270a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Sep 2022 21:49:39 -0600 Subject: [PATCH 14/30] MINOR: Add more execs to list of supported execs (#3519) * prettier * add more execs to docs * Revert * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb * Update datafusion/core/src/lib.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/core/src/lib.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 790f03411127..74782359ec8e 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -161,11 +161,17 @@ //! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec) //! * Filter: [`FilterExec`](physical_plan::filter::FilterExec) //! * Grouped and non-grouped aggregations: [`AggregateExec`](physical_plan::aggregates::AggregateExec) +//! * Hash Join: [`HashJoinExec`](physical_plan::hash_join::HashJoinExec) +//! * Cross Join: [`CrossJoinExec`](physical_plan::cross_join::CrossJoinExec) +//! * Sort Merge Join: [`SortMergeJoinExec`](physical_plan::sort_merge_join::SortMergeJoinExec) +//! * Union: [`UnionExec`](physical_plan::union::UnionExec) //! * Sort: [`SortExec`](physical_plan::sorts::sort::SortExec) //! * Coalesce partitions: [`CoalescePartitionsExec`](physical_plan::coalesce_partitions::CoalescePartitionsExec) //! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec) -//! * Scan a CSV: [`CsvExec`](physical_plan::file_format::CsvExec) -//! * Scan a Parquet: [`ParquetExec`](physical_plan::file_format::ParquetExec) +//! * Scan CSV: [`CsvExec`](physical_plan::file_format::CsvExec) +//! * Scan Parquet: [`ParquetExec`](physical_plan::file_format::ParquetExec) +//! * Scan Avro: [`AvroExec`](physical_plan::file_format::AvroExec) +//! * Scan newline-delimited JSON: [`NdJsonExec`](physical_plan::file_format::NdJsonExec) //! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec) //! 
* Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec) //! From b106b02a362a011bd8666cd2ae146fb6cfa7c4e4 Mon Sep 17 00:00:00 2001 From: Kirk Mitchener Date: Sat, 17 Sep 2022 06:46:16 -0400 Subject: [PATCH 15/30] fix divide by zero not throwing proper error for decimal (#3517) * fix divide by zero for decimal, add tests * check for actual error strings * move scalar divide by zero check out of the loop. fix precision in test. --- .../src/expressions/binary/kernels_arrow.rs | 52 ++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs index e99fbf0c9630..8caed7191fb2 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs @@ -378,6 +378,9 @@ pub(crate) fn divide_decimal( ) -> Result { let mul = 10_f64.powi(left.scale() as i32); let array = arith_decimal(left, right, |left, right| { + if right == 0 { + return Err(DataFusionError::ArrowError(ArrowError::DivideByZero)); + } let l_value = left as f64; let r_value = right as f64; let result = ((l_value / r_value) * mul) as i128; @@ -391,6 +394,9 @@ pub(crate) fn divide_decimal_scalar( left: &Decimal128Array, right: i128, ) -> Result { + if right == 0 { + return Err(DataFusionError::ArrowError(ArrowError::DivideByZero)); + } let mul = 10_f64.powi(left.scale() as i32); let array = arith_decimal_scalar(left, right, |left, right| { let l_value = left as f64; @@ -615,34 +621,66 @@ mod tests { assert_eq!(expect, result); // divide let left_decimal_array = create_decimal_array( - &[Some(1234567), None, Some(1234567), Some(1234567)], + &[ + Some(1234567), + None, + Some(1234567), + Some(1234567), + Some(1234567), + ], + 25, + 3, + ); + let right_decimal_array = create_decimal_array( + &[Some(10), Some(100), Some(55), Some(-123), None], 25, 3, ); - let right_decimal_array = - create_decimal_array(&[Some(10), Some(100), Some(55), Some(-123)], 25, 3); let result = divide_decimal(&left_decimal_array, &right_decimal_array)?; let expect = create_decimal_array( - &[Some(123456700), None, Some(22446672), Some(-10037130)], + &[Some(123456700), None, Some(22446672), Some(-10037130), None], 25, 3, ); assert_eq!(expect, result); let result = divide_decimal_scalar(&left_decimal_array, 10)?; let expect = create_decimal_array( - &[Some(123456700), None, Some(123456700), Some(123456700)], + &[ + Some(123456700), + None, + Some(123456700), + Some(123456700), + Some(123456700), + ], 25, 3, ); assert_eq!(expect, result); // modulus let result = modulus_decimal(&left_decimal_array, &right_decimal_array)?; - let expect = create_decimal_array(&[Some(7), None, Some(37), Some(16)], 25, 3); + let expect = + create_decimal_array(&[Some(7), None, Some(37), Some(16), None], 25, 3); assert_eq!(expect, result); let result = modulus_decimal_scalar(&left_decimal_array, 10)?; - let expect = create_decimal_array(&[Some(7), None, Some(7), Some(7)], 25, 3); + let expect = + create_decimal_array(&[Some(7), None, Some(7), Some(7), Some(7)], 25, 3); assert_eq!(expect, result); Ok(()) } + + #[test] + fn arithmetic_decimal_divide_by_zero() { + let left_decimal_array = create_decimal_array(&[Some(101)], 10, 1); + let right_decimal_array = create_decimal_array(&[Some(0)], 1, 1); + + let err = divide_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + 
let err = divide_decimal_scalar(&left_decimal_array, 0).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + let err = modulus_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + let err = modulus_decimal_scalar(&left_decimal_array, 0).unwrap_err(); + assert_eq!("Arrow error: Divide by zero error", err.to_string()); + } } From 12f047e365a56399bf6fc1feaf25708c2afc5ba7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Sep 2022 07:00:07 -0400 Subject: [PATCH 16/30] Change `downcast_any!` macro so it does not need to use `use std::any::type_name;` (#3484) --- datafusion/common/src/lib.rs | 2 ++ datafusion/physical-expr/src/aggregate/approx_distinct.rs | 1 - .../physical-expr/src/aggregate/approx_percentile_cont.rs | 6 +----- datafusion/physical-expr/src/aggregate/average.rs | 1 - datafusion/physical-expr/src/aggregate/count.rs | 1 - datafusion/physical-expr/src/aggregate/covariance.rs | 1 - datafusion/physical-expr/src/aggregate/min_max.rs | 2 +- datafusion/physical-expr/src/aggregate/sum.rs | 2 +- datafusion/physical-expr/src/aggregate/variance.rs | 2 +- 9 files changed, 6 insertions(+), 12 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index d9fd2503e958..48fd792cf9fe 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -35,6 +35,7 @@ pub use scalar::{ScalarType, ScalarValue}; #[macro_export] macro_rules! downcast_value { ($Value: expr, $Type: ident) => {{ + use std::any::type_name; $Value.as_any().downcast_ref::<$Type>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", @@ -43,6 +44,7 @@ macro_rules! downcast_value { })? }}; ($Value: expr, $Type: ident, $T: tt) => {{ + use std::any::type_name; $Value.as_any().downcast_ref::<$Type<$T>>().ok_or_else(|| { DataFusionError::Internal(format!( "could not cast value to {}", diff --git a/datafusion/physical-expr/src/aggregate/approx_distinct.rs b/datafusion/physical-expr/src/aggregate/approx_distinct.rs index 7252ac122f44..7302008ba825 100644 --- a/datafusion/physical-expr/src/aggregate/approx_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/approx_distinct.rs @@ -31,7 +31,6 @@ use arrow::datatypes::{ use datafusion_common::{downcast_value, ScalarValue}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{Accumulator, AggregateState}; -use std::any::type_name; use std::any::Any; use std::convert::TryFrom; use std::convert::TryInto; diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index b2457887b481..6dfa24746305 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -31,11 +31,7 @@ use datafusion_common::Result; use datafusion_common::{downcast_value, ScalarValue}; use datafusion_expr::{Accumulator, AggregateState}; use ordered_float::OrderedFloat; -use std::{ - any::{type_name, Any}, - iter, - sync::Arc, -}; +use std::{any::Any, iter, sync::Arc}; /// APPROX_PERCENTILE_CONT aggregate expression #[derive(Debug)] diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 623ccbb32b6b..501e62456414 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -17,7 +17,6 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 78a86a3cdcf7..96a8febd043d 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -17,7 +17,6 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/covariance.rs b/datafusion/physical-expr/src/aggregate/covariance.rs index d621b3bad4a0..c8488162042f 100644 --- a/datafusion/physical-expr/src/aggregate/covariance.rs +++ b/datafusion/physical-expr/src/aggregate/covariance.rs @@ -17,7 +17,6 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::type_name; use std::any::Any; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 3f2b698b8840..bdccdf522207 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -17,7 +17,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 0841cb6db9ff..d639a1170815 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -17,7 +17,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/variance.rs b/datafusion/physical-expr/src/aggregate/variance.rs index dde85f414266..07f8a34ce41d 100644 --- a/datafusion/physical-expr/src/aggregate/variance.rs +++ b/datafusion/physical-expr/src/aggregate/variance.rs @@ -17,7 +17,7 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution -use std::any::{type_name, Any}; +use std::any::Any; use std::sync::Arc; use crate::aggregate::stats::StatsType; From 86a8236627577f19b175a0d8fe76ea982ef71254 Mon Sep 17 00:00:00 2001 From: Francis Du Date: Sun, 18 Sep 2022 23:32:04 +0800 Subject: [PATCH 17/30] [DataFrame] - Add cache function for DataFrame (#3512) * feat: add cache function for dataframe * fix: function doc typo * fix: test issue --- datafusion/core/src/dataframe.rs | 65 ++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index 9be7138dca15..a1b43cf531f1 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -21,7 +21,7 @@ use crate::arrow::datatypes::Schema; use crate::arrow::datatypes::SchemaRef; use crate::arrow::record_batch::RecordBatch; use crate::arrow::util::pretty; -use crate::datasource::TableProvider; +use crate::datasource::{MemTable, TableProvider}; use crate::error::Result; use crate::execution::{ context::{SessionState, TaskContext}, @@ -35,6 +35,7 @@ use crate::physical_plan::file_format::{plan_to_csv, plan_to_json, plan_to_parqu use crate::physical_plan::SendableRecordBatchStream; use crate::physical_plan::{collect, collect_partitioned}; use crate::physical_plan::{execute_stream, execute_stream_partitioned, ExecutionPlan}; +use crate::prelude::SessionContext; use crate::scalar::ScalarValue; use async_trait::async_trait; use parking_lot::RwLock; @@ -733,6 +734,29 @@ impl DataFrame { ))) } } + + /// Cache DataFrame as a memory table. + /// + /// ``` + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let ctx = SessionContext::new(); + /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?; + /// let df = df.cache().await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn cache(&self) -> Result> { + let mem_table = MemTable::try_new( + SchemaRef::from(self.schema().clone()), + self.collect_partitioned().await?, + )?; + + SessionContext::with_state(self.session_state.read().clone()) + .read_table(Arc::new(mem_table)) + } } // TODO: This will introduce a ref cycle (#2659) @@ -1082,6 +1106,7 @@ mod tests { ); Ok(()) } + /// Compare the formatted string representation of two plans for equality fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) { assert_eq!(format!("{:?}", plan1), format!("{:?}", plan2)); @@ -1265,7 +1290,7 @@ mod tests { \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1\ \n TableScan: t2", - format!("{:?}", df_renamed.to_unoptimized_plan()) + format!("{:?}", df_renamed.to_unoptimized_plan()) ); assert_eq!("\ @@ -1275,7 +1300,7 @@ mod tests { \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1 projection=[c1, c2, c3]\ \n TableScan: t2 projection=[c1, c2, c3]", - format!("{:?}", df_renamed.to_logical_plan()?) + format!("{:?}", df_renamed.to_logical_plan()?) ); let df_results = df_renamed.collect().await?; @@ -1303,6 +1328,7 @@ mod tests { .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?; let df_results = df.collect().await?; + df.show().await?; assert_batches_sorted_eq!( vec![ "+----+----+-----+", @@ -1353,4 +1379,37 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn cache_test() -> Result<()> { + let df = test_table() + .await? + .select_columns(&["c2", "c3"])? + .limit(0, Some(1))? 
+ .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?; + + let cached_df = df.cache().await?; + + assert_eq!( + "TableScan: ?table? projection=[c2, c3, sum]", + format!("{:?}", cached_df.to_logical_plan()?) + ); + + let df_results = df.collect().await?; + let cached_df_results = cached_df.collect().await?; + assert_batches_sorted_eq!( + vec![ + "+----+----+-----+", + "| c2 | c3 | sum |", + "+----+----+-----+", + "| 2 | 1 | 3 |", + "+----+----+-----+", + ], + &cached_df_results + ); + + assert_eq!(&df_results, &cached_df_results); + + Ok(()) + } } From 9b221001ee1bc0e91b54f5145a8423db865c6f34 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Sep 2022 01:57:44 -0400 Subject: [PATCH 18/30] MINOR: reduce pub in OptimizerConfig (#3525) --- datafusion/core/src/execution/context.rs | 18 ++++++----- datafusion/optimizer/src/optimizer.rs | 31 ++++++++++++++++--- .../optimizer/src/simplify_expressions.rs | 10 +++--- datafusion/optimizer/src/type_coercion.rs | 2 +- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index cb0e16f545ea..947a2adb08e0 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -1564,14 +1564,16 @@ impl SessionState { /// Optimizes the logical plan by applying optimizer rules. pub fn optimize(&self, plan: &LogicalPlan) -> Result { - let mut optimizer_config = OptimizerConfig::new().with_skip_failing_rules( - self.config - .config_options - .read() - .get_bool(OPT_OPTIMIZER_SKIP_FAILED_RULES), - ); - optimizer_config.query_execution_start_time = - self.execution_props.query_execution_start_time; + let mut optimizer_config = OptimizerConfig::new() + .with_skip_failing_rules( + self.config + .config_options + .read() + .get_bool(OPT_OPTIMIZER_SKIP_FAILED_RULES), + ) + .with_query_execution_start_time( + self.execution_props.query_execution_start_time, + ); if let LogicalPlan::Explain(e) = plan { let mut stringified_plans = e.stringified_plans.clone(); diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 9170035b7d6e..e2ccd4944892 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -38,12 +38,15 @@ pub trait OptimizerRule { fn name(&self) -> &str; } -/// Placeholder for optimizer configuration options +/// Options to control the DataFusion Optimizer. 
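+/// +/// A minimal, illustrative usage sketch of the builder API added in this patch (not part of the original diff; assumes `chrono::Utc` is in scope): +/// +/// ```ignore +/// let config = OptimizerConfig::new() +/// .with_skip_failing_rules(true) +/// .with_query_execution_start_time(chrono::Utc::now()); +/// ```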
#[derive(Debug)] pub struct OptimizerConfig { - /// Query execution start time that can be used to rewrite expressions such as `now()` - /// to use a literal value instead - pub query_execution_start_time: DateTime<Utc>, + /// Query execution start time that can be used to rewrite + /// expressions such as `now()` to use a literal value instead + query_execution_start_time: DateTime<Utc>, + /// id generator for optimizer passes + // TODO this should not be on the config, + // it should be its own 'OptimizerState' or something next_id: usize, /// Option to skip rules that produce errors skip_failing_rules: bool, } @@ -59,16 +62,34 @@ impl OptimizerConfig { } } - /// Specify whether the optimizer should skip rules that produce errors, or fail the query + /// Specify the query execution start time to use while rewriting + /// expressions such as `now()` + pub fn with_query_execution_start_time( + mut self, + query_execution_start_time: DateTime<Utc>, + ) -> Self { + self.query_execution_start_time = query_execution_start_time; + self + } + + /// Specify whether the optimizer should skip rules that produce + /// errors, or fail the query pub fn with_skip_failing_rules(mut self, b: bool) -> Self { self.skip_failing_rules = b; self } + /// Generate the next ID needed pub fn next_id(&mut self) -> usize { self.next_id += 1; self.next_id } + + /// Return the time at which the query execution started. This + /// time is used as the value for now() + pub fn query_execution_start_time(&self) -> DateTime<Utc> { + self.query_execution_start_time + } } impl Default for OptimizerConfig { diff --git a/datafusion/optimizer/src/simplify_expressions.rs b/datafusion/optimizer/src/simplify_expressions.rs index aa87c5318ae4..a9c0d4b1f4c8 100644 --- a/datafusion/optimizer/src/simplify_expressions.rs +++ b/datafusion/optimizer/src/simplify_expressions.rs @@ -260,7 +260,7 @@ impl OptimizerRule for SimplifyExpressions { ) -> Result<LogicalPlan> { let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = - optimizer_config.query_execution_start_time; + optimizer_config.query_execution_start_time(); self.optimize_internal(plan, &execution_props) } } @@ -1887,8 +1887,8 @@ mod tests { // expect optimizing will result in an error, returning the error string fn get_optimized_plan_err(plan: &LogicalPlan, date_time: &DateTime<Utc>) -> String { - let mut config = OptimizerConfig::new(); - config.query_execution_start_time = *date_time; + let mut config = + OptimizerConfig::new().with_query_execution_start_time(*date_time); let rule = SimplifyExpressions::new(); let err = rule @@ -1902,8 +1902,8 @@ mod tests { plan: &LogicalPlan, date_time: &DateTime<Utc>, ) -> String { - let mut config = OptimizerConfig::new(); - config.query_execution_start_time = *date_time; + let mut config = + OptimizerConfig::new().with_query_execution_start_time(*date_time); let rule = SimplifyExpressions::new(); let optimized_plan = rule diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 6cb711dfc87c..9d8e97f36a71 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -68,7 +68,7 @@ impl OptimizerRule for TypeCoercion { let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = - optimizer_config.query_execution_start_time; + optimizer_config.query_execution_start_time(); let const_evaluator = ConstEvaluator::try_new(&execution_props)?; let mut expr_rewrite = TypeCoercionRewriter { From 3a9e0d0e676836f9a9e23280a1b19b111024bf5d
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 19 Sep 2022 14:48:01 +0200 Subject: [PATCH 19/30] Execute sort in parallel when a limit is used after sort (#3527) * Parallel sort * Move it to optimization rule * Add rule * Improve rule * Remove bench * Fix doc * Fix indent --- datafusion/core/src/execution/context.rs | 3 + datafusion/core/src/physical_optimizer/mod.rs | 1 + .../src/physical_optimizer/parallel_sort.rs | 92 +++++++++++++++++++ datafusion/core/src/physical_plan/planner.rs | 2 +- .../core/src/physical_plan/sorts/sort.rs | 25 ++--- datafusion/core/tests/sql/explain_analyze.rs | 4 +- 6 files changed, 107 insertions(+), 20 deletions(-) create mode 100644 datafusion/core/src/physical_optimizer/parallel_sort.rs diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 947a2adb08e0..492782f53462 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -40,6 +40,7 @@ use crate::{ physical_optimizer::{ aggregate_statistics::AggregateStatistics, hash_build_probe_order::HashBuildProbeOrder, optimizer::PhysicalOptimizerRule, + parallel_sort::ParallelSort, }, }; pub use datafusion_physical_expr::execution_props::ExecutionProps; @@ -1469,6 +1470,8 @@ impl SessionState { .unwrap(), ))); } + physical_optimizers.push(Arc::new(ParallelSort::new())); + physical_optimizers.push(Arc::new(Repartition::new())); physical_optimizers.push(Arc::new(AddCoalescePartitionsExec::new())); diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index 55550bcd2cff..82b7087ab2c5 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -23,6 +23,7 @@ pub mod coalesce_batches; pub mod hash_build_probe_order; pub mod merge_exec; pub mod optimizer; +pub mod parallel_sort; pub mod pruning; pub mod repartition; mod utils; diff --git a/datafusion/core/src/physical_optimizer/parallel_sort.rs b/datafusion/core/src/physical_optimizer/parallel_sort.rs new file mode 100644 index 000000000000..3361d8155f7f --- /dev/null +++ b/datafusion/core/src/physical_optimizer/parallel_sort.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Parallel sort parallelizes sorts if a limit is present after a sort (`ORDER BY LIMIT N`) +use crate::{ + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + limit::GlobalLimitExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + with_new_children_if_necessary, ExecutionPlan, + }, +}; +use std::sync::Arc; + +/// Optimizer rule that makes sort parallel if a limit is used after sort (`ORDER BY LIMIT N`). +/// The plan will use `SortPreservingMergeExec` to merge the results +#[derive(Default)] +pub struct ParallelSort {} + +impl ParallelSort { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} +impl PhysicalOptimizerRule for ParallelSort { + fn optimize( + &self, + plan: Arc<dyn ExecutionPlan>, + config: &crate::execution::context::SessionConfig, + ) -> Result<Arc<dyn ExecutionPlan>> { + if plan.children().is_empty() { + // leaf node, children cannot be replaced + Ok(plan.clone()) + } else { + // recurse down first + let children = plan + .children() + .iter() + .map(|child| self.optimize(child.clone(), config)) + .collect::<Result<Vec<_>>>()?; + let plan = with_new_children_if_necessary(plan, children)?; + let children = plan.children(); + let plan_any = plan.as_any(); + // rewrite GlobalLimitExec (SortExec preserve_partitioning=False) + // -> GlobalLimitExec (SortPreservingMergeExec (SortExec preserve_partitioning=True)) + let parallel_sort = plan_any.downcast_ref::<GlobalLimitExec>().is_some() + && children.len() == 1 + && children[0].as_any().downcast_ref::<SortExec>().is_some() + && !children[0] + .as_any() + .downcast_ref::<SortExec>() + .unwrap() + .preserve_partitioning(); + + Ok(if parallel_sort { + let sort = children[0].as_any().downcast_ref::<SortExec>().unwrap(); + let new_sort = SortExec::new_with_partitioning( + sort.expr().to_vec(), + sort.input().clone(), + true, + ); + let merge = SortPreservingMergeExec::new( + sort.expr().to_vec(), + Arc::new(new_sort), + ); + with_new_children_if_necessary(plan, vec![Arc::new(merge)])? + } else { + plan.clone() + }) + } + } + + fn name(&self) -> &str { + "parallel_sort" + } +} diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index c87590059625..20a819622d5d 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -841,7 +841,7 @@ impl DefaultPhysicalPlanner { )), }) .collect::<Result<Vec<_>>>()?; - Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)
) + Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)) } LogicalPlan::Join(Join { left, diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 64312327b483..cc0501785324 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -668,6 +668,11 @@ impl SortExec { Ok(Self::new_with_partitioning(expr, input, false)) } + /// Whether this `SortExec` preserves partitioning of the children + pub fn preserve_partitioning(&self) -> bool { + self.preserve_partitioning + } + /// Create a new sort execution plan with the option to preserve /// the partitioning of the input plan pub fn new_with_partitioning( @@ -741,10 +746,11 @@ impl ExecutionPlan for SortExec { self: Arc, children: Vec>, ) -> Result> { - Ok(Arc::new(SortExec::try_new( + Ok(Arc::new(SortExec::new_with_partitioning( self.expr.clone(), children[0].clone(), - )?)) + self.preserve_partitioning, + ))) } fn execute( @@ -753,21 +759,6 @@ impl ExecutionPlan for SortExec { context: Arc, ) -> Result { debug!("Start SortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); - if !self.preserve_partitioning { - if 0 != partition { - return Err(DataFusionError::Internal(format!( - "SortExec invalid partition {}", - partition - ))); - } - - // sort needs to operate on a single partition currently - if 1 != self.input.output_partitioning().partition_count() { - return Err(DataFusionError::Internal( - "SortExec requires a single input partition".to_owned(), - )); - } - } debug!( "Start invoking SortExec's input.execute for partition: {}", diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 7f465c4c697f..a75e0e3fa515 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -686,8 +686,8 @@ async fn test_physical_plan_display_indent() { let physical_plan = ctx.create_physical_plan(&plan).await.unwrap(); let expected = vec![ "GlobalLimitExec: skip=0, fetch=10", - " SortExec: [the_min@2 DESC]", - " CoalescePartitionsExec", + " SortPreservingMergeExec: [the_min@2 DESC]", + " SortExec: [the_min@2 DESC]", " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)@2 as the_min]", " AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)]", " CoalesceBatchesExec: target_batch_size=4096", From a6fbf24ddb81e34258a940a48e371b1efe5f6e49 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Sep 2022 14:13:54 -0400 Subject: [PATCH 20/30] Add support for `ScalarValue::Dictionary` to datafusion-proto (#3532) --- datafusion/proto/proto/datafusion.proto | 7 +++++ datafusion/proto/src/from_proto.rs | 16 +++++++++++ datafusion/proto/src/lib.rs | 15 ++++++++-- datafusion/proto/src/to_proto.rs | 38 +++++++++++++++++++++++-- 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index baabc04cfff7..0e499ee48ffc 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -726,6 +726,12 @@ message ScalarTimestampValue { string timezone = 5; } +message ScalarDictionaryValue { + ArrowType index_type = 1; + ScalarValue value = 2; +} + + message ScalarValue{ oneof value { bool bool_value = 1; @@ -752,6 +758,7 
@@ message ScalarValue{ int32 interval_yearmonth_value = 24; int64 interval_daytime_value = 25; ScalarTimestampValue timestamp_value = 26; + ScalarDictionaryValue dictionary_value = 27; } } diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index 8c0124b9906b..95bfeb819a0c 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -776,6 +776,22 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { } } } + Value::DictionaryValue(v) => { + let index_type: DataType = v + .index_type + .as_ref() + .ok_or_else(|| Error::required("index_type"))? + .try_into()?; + + let value: Self = v + .value + .as_ref() + .ok_or_else(|| Error::required("value"))? + .as_ref() + .try_into()?; + + Self::Dictionary(Box::new(index_type), Box::new(value)) + } }) } } diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index ce2678025a88..4a1e782f3be1 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -451,11 +451,22 @@ mod roundtrip_tests { true, )), ), + ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("foo".into()))), + ), + ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(None)), + ), ]; for test_case in should_pass.into_iter() { - let proto: super::protobuf::ScalarValue = (&test_case).try_into().unwrap(); - let _roundtrip: ScalarValue = (&proto).try_into().unwrap(); + let proto: super::protobuf::ScalarValue = (&test_case) + .try_into() + .expect("failed conversion to protobuf"); + let _roundtrip: ScalarValue = + (&proto).try_into().expect("failed conversion to protobuf"); } } diff --git a/datafusion/proto/src/to_proto.rs b/datafusion/proto/src/to_proto.rs index 43d649029f8d..ed0b5ec0871c 100644 --- a/datafusion/proto/src/to_proto.rs +++ b/datafusion/proto/src/to_proto.rs @@ -1176,12 +1176,46 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue { Value::IntervalDaytimeValue(*s) }) } - ScalarValue::Null => protobuf::ScalarValue { + datafusion::scalar::ScalarValue::Null => protobuf::ScalarValue { value: Some(Value::NullValue(PrimitiveScalarType::Null as i32)), }, - _ => { + + datafusion::scalar::ScalarValue::Binary(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::LargeBinary(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::Time64(_) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::IntervalMonthDayNano(_) => { + // not yet implemented (TODO file ticket) return Err(Error::invalid_scalar_value(val)); } + + datafusion::scalar::ScalarValue::Struct(_, _) => { + // not yet implemented (TODO file ticket) + return Err(Error::invalid_scalar_value(val)); + } + + datafusion::scalar::ScalarValue::Dictionary(index_type, val) => { + let value: protobuf::ScalarValue = val.as_ref().try_into()?; + protobuf::ScalarValue { + value: Some(Value::DictionaryValue(Box::new( + protobuf::ScalarDictionaryValue { + index_type: Some(index_type.as_ref().try_into()?), + value: Some(Box::new(value)), + }, + ))), + } + } }; Ok(scalar_val) From f30fc4eccc4337fb0875dd731553a8389f6c6f65 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 19 Sep 2022 12:38:58 -0600 Subject: [PATCH 21/30] Add Debug to TableReference and ResolvedTableReference (#3533) --- datafusion/sql/src/table_reference.rs | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/table_reference.rs b/datafusion/sql/src/table_reference.rs index 44848eb99194..27905163dc7e 100644 --- a/datafusion/sql/src/table_reference.rs +++ b/datafusion/sql/src/table_reference.rs @@ -16,7 +16,7 @@ // under the License. /// Represents a resolved path to a table of the form "catalog.schema.table" -#[derive(Clone, Copy)] +#[derive(Debug, Clone, Copy)] pub struct ResolvedTableReference<'a> { /// The catalog (aka database) containing the table pub catalog: &'a str, @@ -27,7 +27,7 @@ pub struct ResolvedTableReference<'a> { } /// Represents a path to a table that may require further resolution -#[derive(Clone, Copy)] +#[derive(Debug, Clone, Copy)] pub enum TableReference<'a> { /// An unqualified table reference, e.g. "table" Bare { From 4b1e0444a6d82871c19ec826d61a4b33f3b52486 Mon Sep 17 00:00:00 2001 From: George Andronchik Date: Tue, 20 Sep 2022 05:49:37 +0800 Subject: [PATCH 22/30] feat: Union types coercion (#3513) 1 --- datafusion/core/src/physical_plan/union.rs | 32 +++++- datafusion/expr/src/expr_rewriter.rs | 38 ++++++- datafusion/expr/src/logical_plan/builder.rs | 73 ++++++++----- datafusion/sql/src/planner.rs | 108 +++++++++++++++++++- 4 files changed, 219 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/physical_plan/union.rs b/datafusion/core/src/physical_plan/union.rs index d57fbe0f3df3..bf9dfbd1b694 100644 --- a/datafusion/core/src/physical_plan/union.rs +++ b/datafusion/core/src/physical_plan/union.rs @@ -23,8 +23,12 @@ use std::{any::Any, sync::Arc}; -use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use arrow::{ + datatypes::{Field, Schema, SchemaRef}, + record_batch::RecordBatch, +}; use futures::StreamExt; +use itertools::Itertools; use log::debug; use super::{ @@ -46,14 +50,38 @@ pub struct UnionExec { inputs: Vec<Arc<dyn ExecutionPlan>>, /// Execution metrics metrics: ExecutionPlanMetricsSet, + /// Schema of Union + schema: SchemaRef, } impl UnionExec { /// Create a new UnionExec pub fn new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Self { + let fields: Vec<Field> = (0..inputs[0].schema().fields().len()) + .map(|i| { + inputs + .iter() + .filter_map(|input| { + if input.schema().fields().len() > i { + Some(input.schema().field(i).clone()) + } else { + None + } + }) + .find_or_first(|f| f.is_nullable()) + .unwrap() + }) + .collect(); + + let schema = Arc::new(Schema::new_with_metadata( + fields, + inputs[0].schema().metadata().clone(), + )); + + UnionExec { inputs, metrics: ExecutionPlanMetricsSet::new(), + schema, } } @@ -70,7 +98,7 @@ impl ExecutionPlan for UnionExec { } fn schema(&self) -> SchemaRef { - self.inputs[0].schema() + self.schema.clone() } fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { diff --git a/datafusion/expr/src/expr_rewriter.rs b/datafusion/expr/src/expr_rewriter.rs index 533f31ce1584..1c81b4c4a60c 100644 --- a/datafusion/expr/src/expr_rewriter.rs +++ b/datafusion/expr/src/expr_rewriter.rs @@ -18,8 +18,8 @@ //!
Expression rewriter use crate::expr::GroupingSet; -use crate::logical_plan::Aggregate; -use crate::utils::grouping_set_to_exprlist; +use crate::logical_plan::{Aggregate, Projection}; +use crate::utils::{from_plan, grouping_set_to_exprlist}; use crate::{Expr, ExprSchemable, LogicalPlan}; use datafusion_common::Result; use datafusion_common::{Column, DFSchema}; @@ -524,6 +524,40 @@ pub fn unnormalize_cols(exprs: impl IntoIterator) -> Vec { exprs.into_iter().map(unnormalize_col).collect() } +/// Returns plan with expressions coerced to types compatible with +/// schema types +pub fn coerce_plan_expr_for_schema( + plan: &LogicalPlan, + schema: &DFSchema, +) -> Result { + let new_expr = plan + .expressions() + .into_iter() + .enumerate() + .map(|(i, expr)| { + let new_type = schema.field(i).data_type(); + if plan.schema().field(i).data_type() != schema.field(i).data_type() { + match (plan, &expr) { + ( + LogicalPlan::Projection(Projection { input, .. }), + Expr::Alias(e, alias), + ) => Ok(Expr::Alias( + Box::new(e.clone().cast_to(new_type, input.schema())?), + alias.clone(), + )), + _ => expr.cast_to(new_type, plan.schema()), + } + } else { + Ok(expr) + } + }) + .collect::>>()?; + + let new_inputs = plan.inputs().into_iter().cloned().collect::>(); + + from_plan(plan, &new_expr, &new_inputs) +} + #[cfg(test)] mod test { use super::*; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 0125291fda0d..def9927ab6b3 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -17,7 +17,10 @@ //! This module provides a builder for creating LogicalPlans -use crate::expr_rewriter::{normalize_col, normalize_cols, rewrite_sort_cols_by_aggs}; +use crate::binary_rule::comparison_coercion; +use crate::expr_rewriter::{ + coerce_plan_expr_for_schema, normalize_col, normalize_cols, rewrite_sort_cols_by_aggs, +}; use crate::utils::{ columnize_expr, exprlist_to_fields, from_plan, grouping_set_to_exprlist, }; @@ -882,43 +885,59 @@ pub fn union_with_alias( right_plan: LogicalPlan, alias: Option, ) -> Result { - let union_schema = left_plan.schema().clone(); - let inputs_iter = vec![left_plan, right_plan] + let union_schema = (0..left_plan.schema().fields().len()) + .map(|i| { + let left_field = left_plan.schema().field(i); + let right_field = right_plan.schema().field(i); + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = + comparison_coercion(left_field.data_type(), right_field.data_type()) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "UNION Column {} (type: {}) is not compatible with column {} (type: {})", + right_field.name(), + right_field.data_type(), + left_field.name(), + left_field.data_type() + )) + })?; + + Ok(DFField::new( + alias.as_deref(), + left_field.name(), + data_type, + nullable, + )) + }) + .collect::>>()? + .to_dfschema()?; + + let inputs = vec![left_plan, right_plan] .into_iter() .flat_map(|p| match p { LogicalPlan::Union(Union { inputs, .. }) => inputs, x => vec![Arc::new(x)], - }); - - inputs_iter - .clone() - .skip(1) - .try_for_each(|input_plan| -> Result<()> { - union_schema.check_arrow_schema_type_compatible( - &((**input_plan.schema()).clone().into()), - ) - })?; - - let inputs = inputs_iter - .map(|p| match p.as_ref() { - LogicalPlan::Projection(Projection { - expr, input, alias, .. 
- }) => Ok(Arc::new(project_with_column_index_alias( - expr.to_vec(), - input.clone(), - union_schema.clone(), - alias.clone(), - )?)), - x => Ok(Arc::new(x.clone())), }) - .into_iter() + .map(|p| { + let plan = coerce_plan_expr_for_schema(&p, &union_schema)?; + match plan { + LogicalPlan::Projection(Projection { + expr, input, alias, .. + }) => Ok(Arc::new(project_with_column_index_alias( + expr.to_vec(), + input, + Arc::new(union_schema.clone()), + alias, + )?)), + x => Ok(Arc::new(x)), + } + }) .collect::>>()?; if inputs.is_empty() { return Err(DataFusionError::Plan("Empty UNION".to_string())); } - let union_schema = (**inputs[0].schema()).clone(); let union_schema = Arc::new(match alias { Some(ref alias) => union_schema.replace_qualifier(alias.as_str()), None => union_schema.strip_qualifiers(), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 04518d81e457..541dc2520e93 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -4163,7 +4163,7 @@ mod tests { let sql = "SELECT interval '1 year 1 day' UNION ALL SELECT 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Column Int64(1) (type: Int64) is \ + "Plan(\"UNION Column Int64(1) (type: Int64) is \ not compatible with column IntervalMonthDayNano\ (\\\"950737950189618795196236955648\\\") \ (type: Interval(MonthDayNano))\")", @@ -4171,6 +4171,112 @@ mod tests { ); } + #[test] + fn union_with_different_decimal_data_types() { + let sql = "SELECT 1 a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: CAST(Int64(1) AS Float64) AS a\ + \n EmptyRelation\ + \n Projection: Float64(1.1) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_null() { + let sql = "SELECT NULL a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: CAST(NULL AS Float64) AS a\ + \n EmptyRelation\ + \n Projection: Float64(1.1) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_float_and_string() { + let sql = "SELECT 'a' a UNION ALL SELECT 1.1 a"; + let expected = "Union\ + \n Projection: Utf8(\"a\") AS a\ + \n EmptyRelation\ + \n Projection: CAST(Float64(1.1) AS Utf8) AS a\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_multiply_cols() { + let sql = "SELECT 'a' a, 1 b UNION ALL SELECT 1.1 a, 1.1 b"; + let expected = "Union\ + \n Projection: Utf8(\"a\") AS a, CAST(Int64(1) AS Float64) AS b\ + \n EmptyRelation\ + \n Projection: CAST(Float64(1.1) AS Utf8) AS a, Float64(1.1) AS b\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn sorted_union_with_different_types_and_group_by() { + let sql = "SELECT a FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a FROM (select 1.1 a) x GROUP BY 1) ORDER BY 1"; + let expected = "Sort: #a ASC NULLS LAST\ + \n Union\ + \n Projection: CAST(#x.a AS Float64) AS a\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #x.a\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Float64(1.1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_binary_expr_and_cast() { + let sql = "SELECT cast(0.0 + a as integer) FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT 2.1 + a FROM (select 1 a) x GROUP BY 1)"; + let expected = "Union\ + \n Projection: CAST(#Float64(0) + x.a AS Float64) AS Float64(0) + x.a\ + \n 
Aggregate: groupBy=[[CAST(Float64(0) + #x.a AS Int32)]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #Float64(2.1) + x.a\ + \n Aggregate: groupBy=[[Float64(2.1) + #x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_aliases() { + let sql = "SELECT a as a1 FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a as a1 FROM (select 1.1 a) x GROUP BY 1)"; + let expected = "Union\ + \n Projection: CAST(#x.a AS Float64) AS a1\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Int64(1) AS a, alias=x\ + \n EmptyRelation\ + \n Projection: #x.a AS a1\ + \n Aggregate: groupBy=[[#x.a]], aggr=[[]]\ + \n Projection: #x.a, alias=x\ + \n Projection: Float64(1.1) AS a, alias=x\ + \n EmptyRelation"; + quick_test(sql, expected); + } + + #[test] + fn union_with_incompatible_data_types() { + let sql = "SELECT 'a' a UNION ALL SELECT true a"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"UNION Column a (type: Boolean) is not compatible with column a (type: Utf8)\")", + format!("{:?}", err) + ); + } + #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; From e87342310e20b58ae2de2bb7cd0942dce089092e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 03:14:45 -0400 Subject: [PATCH 23/30] Actually test that `ScalarValue`s are the same after round trip serialization (#3537) --- datafusion/proto/src/lib.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 4a1e782f3be1..f34da705f94a 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -465,8 +465,17 @@ mod roundtrip_tests { let proto: super::protobuf::ScalarValue = (&test_case) .try_into() .expect("failed conversion to protobuf"); - let _roundtrip: ScalarValue = - (&proto).try_into().expect("failed conversion to protobuf"); + + let roundtrip: ScalarValue = (&proto) + .try_into() + .expect("failed conversion from protobuf"); + + assert_eq!( + test_case, roundtrip, + "ScalarValue was not the same after round trip!\n\n\ + Input: {:?}\n\nRoundtrip: {:?}", + test_case, roundtrip + ); } } From 67002a06039523e1356a5c91894b1387b021bbf7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 05:55:17 -0400 Subject: [PATCH 24/30] Upgrade to arrow 23.0.0 (#3483) * Changes for API * Update avro code for API changes * Use divide_opt` kernel * Update update_arrow_deps.py * Update arrow dependency to 23.0.0 * Use nicer RecordBatchOptions API * cleanups * fix: update --- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion-examples/examples/flight_server.rs | 5 ++++- datafusion/common/Cargo.toml | 4 ++-- datafusion/core/Cargo.toml | 6 +++--- datafusion/core/fuzz-utils/Cargo.toml | 2 +- .../core/src/avro_to_arrow/arrow_array_reader.rs | 7 ++----- .../core/src/physical_plan/coalesce_batches.rs | 3 +-- .../core/src/physical_plan/file_format/mod.rs | 3 +-- datafusion/expr/Cargo.toml | 2 +- datafusion/jit/Cargo.toml | 2 +- datafusion/optimizer/Cargo.toml | 2 +- datafusion/physical-expr/Cargo.toml | 2 +- .../physical-expr/src/expressions/binary.rs | 10 ++++------ .../src/expressions/binary/kernels_arrow.rs | 7 ++++--- datafusion/proto/Cargo.toml | 2 +- datafusion/row/Cargo.toml | 2 +- 
datafusion/sql/Cargo.toml | 2 +- dev/update_arrow_deps.py | 16 ++++++++++++++-- 19 files changed, 45 insertions(+), 36 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 394bf59a8421..de7bfe07094a 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.62" readme = "README.md" [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "12.0.0" } dirs = "4.0.0" diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index c769326f1996..5d2150848afb 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -34,7 +34,7 @@ path = "examples/avro_sql.rs" required-features = ["datafusion/avro"] [dev-dependencies] -arrow-flight = "22.0.0" +arrow-flight = "23.0.0" async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } futures = "0.3" diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 5dbd694ac0f5..88cc6f1c23be 100644 --- a/datafusion-examples/examples/flight_server.rs +++ b/datafusion-examples/examples/flight_server.rs @@ -19,6 +19,7 @@ use std::pin::Pin; use std::sync::Arc; use arrow_flight::SchemaAsIpc; +use datafusion::arrow::error::ArrowError; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; use futures::Stream; @@ -77,7 +78,9 @@ impl FlightService for FlightServiceImpl { .unwrap(); let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); - let schema_result = SchemaAsIpc::new(&schema, &options).into(); + let schema_result = SchemaAsIpc::new(&schema, &options) + .try_into() + .map_err(|e: ArrowError| tonic::Status::internal(e.to_string()))?; Ok(Response::new(schema_result)) } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 028a4874a8bc..3337747efe37 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -39,11 +39,11 @@ pyarrow = ["pyo3"] [dependencies] apache-avro = { version = "0.14", features = ["snappy"], optional = true } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } avro-rs = { version = "0.13", features = ["snappy"], optional = true } cranelift-module = { version = "0.87.0", optional = true } object_store = { version = "0.5.0", optional = true } ordered-float = "3.0" -parquet = { version = "22.0.0", features = ["arrow"], optional = true } +parquet = { version = "23.0.0", features = ["arrow"], optional = true } pyo3 = { version = "0.17.1", optional = true } sqlparser = "0.23" diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index e63f3100d6c9..f5f4350b072a 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -56,7 +56,7 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.14", optional = true } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } async-trait = "0.1.41" bytes = "1.1" chrono = { version = "0.4", default-features = false } @@ -78,7 +78,7 @@ num_cpus = "1.13.0" object_store = "0.5.0" ordered-float = "3.0" parking_lot = "0.12" -parquet = { version = "22.0.0", features = ["arrow", 
"async"] } +parquet = { version = "23.0.0", features = ["arrow", "async"] } paste = "^1.0" pin-project-lite = "^0.2.7" pyo3 = { version = "0.17.1", optional = true } @@ -93,7 +93,7 @@ url = "2.2" uuid = { version = "1.0", features = ["v4"] } [dev-dependencies] -arrow = { version = "22.0.0", features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { version = "23.0.0", features = ["prettyprint", "dyn_cmp_dict"] } async-trait = "0.1.53" criterion = "0.4" csv = "1.1.6" diff --git a/datafusion/core/fuzz-utils/Cargo.toml b/datafusion/core/fuzz-utils/Cargo.toml index 5f07570ad043..07f09b6362cc 100644 --- a/datafusion/core/fuzz-utils/Cargo.toml +++ b/datafusion/core/fuzz-utils/Cargo.toml @@ -23,6 +23,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } env_logger = "0.9.0" rand = "0.8" diff --git a/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs index bed5bbcdc885..6dee6f3d25a2 100644 --- a/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs +++ b/datafusion/core/src/avro_to_arrow/arrow_array_reader.rs @@ -20,8 +20,7 @@ use crate::arrow::array::{ make_array, Array, ArrayBuilder, ArrayData, ArrayDataBuilder, ArrayRef, BooleanBuilder, LargeStringArray, ListBuilder, NullArray, OffsetSizeTrait, - PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, - StringDictionaryBuilder, + PrimitiveArray, StringArray, StringBuilder, StringDictionaryBuilder, }; use crate::arrow::buffer::{Buffer, MutableBuffer}; use crate::arrow::datatypes::{ @@ -171,9 +170,7 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> { where T: ArrowPrimitiveType + ArrowDictionaryKeyType, { - let key_builder = PrimitiveBuilder::::with_capacity(row_len); - let values_builder = StringBuilder::with_capacity(row_len, 5); - StringDictionaryBuilder::new(key_builder, values_builder) + StringDictionaryBuilder::with_capacity(row_len, row_len, row_len) } fn build_wrapped_list_array( diff --git a/datafusion/core/src/physical_plan/coalesce_batches.rs b/datafusion/core/src/physical_plan/coalesce_batches.rs index f2edacd90847..317500ddc904 100644 --- a/datafusion/core/src/physical_plan/coalesce_batches.rs +++ b/datafusion/core/src/physical_plan/coalesce_batches.rs @@ -292,8 +292,7 @@ pub fn concat_batches( row_count ); - let mut options = RecordBatchOptions::default(); - options.row_count = Some(row_count); + let options = RecordBatchOptions::new().with_row_count(Some(row_count)); RecordBatch::try_new_with_options(schema.clone(), arrays, &options) } diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 2d59570078bc..295fef27b291 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -286,8 +286,7 @@ impl SchemaAdapter { let projected_schema = Arc::new(self.table_schema.clone().project(projections)?); // Necessary to handle empty batches - let mut options = RecordBatchOptions::default(); - options.row_count = Some(batch.num_rows()); + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); Ok(RecordBatch::try_new_with_options( projected_schema, diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index b187c4d32665..6277091ed25b 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml 
@@ -36,6 +36,6 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } datafusion-common = { path = "../common", version = "12.0.0" } sqlparser = "0.23" diff --git a/datafusion/jit/Cargo.toml b/datafusion/jit/Cargo.toml index 32e99de53a68..428027cb4367 100644 --- a/datafusion/jit/Cargo.toml +++ b/datafusion/jit/Cargo.toml @@ -36,7 +36,7 @@ path = "src/lib.rs" jit = [] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" cranelift = "0.87.0" cranelift-jit = "0.87.0" cranelift-module = "0.87.0" diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index aa735f7aa6e1..409521088bf6 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -37,7 +37,7 @@ default = ["unicode_expressions"] unicode_expressions = [] [dependencies] -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } async-trait = "0.1.41" chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "12.0.0" } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 182eeb40bfbe..ba618acb64bf 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -40,7 +40,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 2395f85c1eb0..b078da5f0b34 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -24,7 +24,7 @@ use std::{any::Any, sync::Arc}; use arrow::array::*; use arrow::compute::kernels::arithmetic::{ - add, add_scalar, divide, divide_scalar, modulus, modulus_scalar, multiply, + add, add_scalar, divide_opt, divide_scalar, modulus, modulus_scalar, multiply, multiply_scalar, subtract, subtract_scalar, }; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; @@ -60,7 +60,7 @@ use kernels::{ bitwise_xor, bitwise_xor_scalar, }; use kernels_arrow::{ - add_decimal, add_decimal_scalar, divide_decimal, divide_decimal_scalar, + add_decimal, add_decimal_scalar, divide_decimal_scalar, divide_opt_decimal, eq_decimal_scalar, gt_decimal_scalar, gt_eq_decimal_scalar, is_distinct_from, is_distinct_from_bool, is_distinct_from_decimal, is_distinct_from_null, is_distinct_from_utf8, is_not_distinct_from, is_not_distinct_from_bool, @@ -844,7 +844,7 @@ impl BinaryExpr { Operator::Plus => binary_primitive_array_op!(left, right, add), Operator::Minus => binary_primitive_array_op!(left, right, subtract), Operator::Multiply => binary_primitive_array_op!(left, right, multiply), - Operator::Divide => binary_primitive_array_op!(left, right, divide), + Operator::Divide => binary_primitive_array_op!(left, right, divide_opt), Operator::Modulo => binary_primitive_array_op!(left, right, modulus), Operator::And => { if left_data_type == &DataType::Boolean { @@ -1326,9 +1326,7 @@ mod tests { let 
string_type = DataType::Utf8; // build dictionary - let keys_builder = PrimitiveBuilder::::with_capacity(10); - let values_builder = arrow::array::StringBuilder::with_capacity(10, 1024); - let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut dict_builder = StringDictionaryBuilder::::new(); dict_builder.append("one")?; dict_builder.append_null(); diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs index 8caed7191fb2..99b8a5a06dd6 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs @@ -372,7 +372,7 @@ pub(crate) fn multiply_decimal_scalar( Ok(array) } -pub(crate) fn divide_decimal( +pub(crate) fn divide_opt_decimal( left: &Decimal128Array, right: &Decimal128Array, ) -> Result { @@ -636,7 +636,7 @@ mod tests { 25, 3, ); - let result = divide_decimal(&left_decimal_array, &right_decimal_array)?; + let result = divide_opt_decimal(&left_decimal_array, &right_decimal_array)?; let expect = create_decimal_array( &[Some(123456700), None, Some(22446672), Some(-10037130), None], 25, @@ -674,7 +674,8 @@ mod tests { let left_decimal_array = create_decimal_array(&[Some(101)], 10, 1); let right_decimal_array = create_decimal_array(&[Some(0)], 1, 1); - let err = divide_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); + let err = + divide_opt_decimal(&left_decimal_array, &right_decimal_array).unwrap_err(); assert_eq!("Arrow error: Divide by zero error", err.to_string()); let err = divide_decimal_scalar(&left_decimal_array, 0).unwrap_err(); assert_eq!("Arrow error: Divide by zero error", err.to_string()); diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index c4a3c4149148..c8711fa0b02f 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -37,7 +37,7 @@ default = [] json = ["pbjson", "pbjson-build", "serde", "serde_json"] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" datafusion = { path = "../core", version = "12.0.0" } datafusion-common = { path = "../common", version = "12.0.0" } datafusion-expr = { path = "../expr", version = "12.0.0" } diff --git a/datafusion/row/Cargo.toml b/datafusion/row/Cargo.toml index 3adff03b02fd..d1c7fdf0cf44 100644 --- a/datafusion/row/Cargo.toml +++ b/datafusion/row/Cargo.toml @@ -37,7 +37,7 @@ path = "src/lib.rs" jit = ["datafusion-jit"] [dependencies] -arrow = "22.0.0" +arrow = "23.0.0" datafusion-common = { path = "../common", version = "12.0.0" } datafusion-jit = { path = "../jit", version = "12.0.0", optional = true } paste = "^1.0" diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 2e1536559cc5..b2da631827a5 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -38,7 +38,7 @@ unicode_expressions = [] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "22.0.0", features = ["prettyprint"] } +arrow = { version = "23.0.0", features = ["prettyprint"] } datafusion-common = { path = "../common", version = "12.0.0" } datafusion-expr = { path = "../expr", version = "12.0.0" } hashbrown = "0.12" diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py index a991de49eea3..b685ad2738b1 100755 --- a/dev/update_arrow_deps.py +++ b/dev/update_arrow_deps.py @@ -88,8 +88,20 @@ def update_version_cargo_toml(cargo_toml, new_version): for section in ("dependencies", "dev-dependencies"): 
for (dep_name, constraint) in doc.get(section, {}).items(): - if dep_name in ("arrow", "parquet", "arrow-flight") and constraint.get("version") is not None: - doc[section][dep_name]["version"] = new_version + if dep_name in ("arrow", "parquet", "arrow-flight"): + if type(constraint) == tomlkit.items.String: + # handle constraint that is (only) a string like '12', + doc[section][dep_name] = new_version + elif type(constraint) == dict: + # handle constraint that is itself a struct like + # {'version': '12', 'features': ['prettyprint']} + doc[section][dep_name]["version"] = new_version + elif type(constraint) == tomlkit.items.InlineTable: + # handle constraint that is itself a struct like + # {'version': '12', 'features': ['prettyprint']} + doc[section][dep_name]["version"] = new_version + else: + print("Unknown type for {} {}: {}", dep_name, constraint, type(constraint)) with open(cargo_toml, 'w') as f: f.write(tomlkit.dumps(doc)) From c7f3a70a79ee84070f7e8f770981fc982d8df0a4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 05:58:57 -0400 Subject: [PATCH 25/30] Add additional pruning tests with casts, handle unsupported predicates better (#3454) * Add tests for pruning, support pruning with constant expressions * Use downcast_any! * chore: Remove uneeded use --- .../core/src/physical_optimizer/pruning.rs | 184 ++++++++++++++---- datafusion/core/tests/parquet_pruning.rs | 12 +- 2 files changed, 147 insertions(+), 49 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index f5433d7e869c..9c53e0a6a2de 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -38,11 +38,13 @@ use crate::{ logical_plan::{Column, DFSchema, Expr, Operator}, physical_plan::{ColumnarValue, PhysicalExpr}, }; +use arrow::record_batch::RecordBatchOptions; use arrow::{ array::{new_null_array, ArrayRef, BooleanArray}, datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; +use datafusion_common::{downcast_value, ScalarValue}; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; use datafusion_expr::utils::expr_to_columns; @@ -169,38 +171,42 @@ impl PruningPredicate { /// simplified version `b`. The predicates are simplified via the /// ConstantFolding optimizer pass pub fn prune(&self, statistics: &S) -> Result> { - // build statistics record batch - let predicate_array = - build_statistics_record_batch(statistics, &self.required_columns) - .and_then(|statistics_batch| { - // execute predicate expression - self.predicate_expr.evaluate(&statistics_batch) - }) - .and_then(|v| match v { - ColumnarValue::Array(array) => Ok(array), - ColumnarValue::Scalar(_) => Err(DataFusionError::Internal( - "predicate expression didn't return an array".to_string(), - )), - })?; - - let predicate_array = predicate_array - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Internal(format!( - "Expected pruning predicate evaluation to be BooleanArray, \ - but was {:?}", - predicate_array - )) - })?; - - // when the result of the predicate expression for a row group is null / undefined, - // e.g. 
due to missing statistics, this row group can't be filtered out, - // so replace with true - Ok(predicate_array - .into_iter() - .map(|x| x.unwrap_or(true)) - .collect::>()) + // build a RecordBatch that contains the min/max values in the + // appropriate statistics columns + let statistics_batch = + build_statistics_record_batch(statistics, &self.required_columns)?; + + // Evaluate the pruning predicate on that record batch. + // + // Use true when the result of evaluating a predicate + // expression on a row group is null (aka `None`). Null can + // arise when the statistics are unknown or some calculation + // in the predicate means we don't know for sure if the row + // group can be filtered out or not. To maintain correctness + // the row group must be kept and thus `true` is returned. + match self.predicate_expr.evaluate(&statistics_batch)? { + ColumnarValue::Array(array) => { + let predicate_array = downcast_value!(array, BooleanArray); + + Ok(predicate_array + .into_iter() + .map(|x| x.unwrap_or(true)) // None -> true per comments above + .collect::>()) + + }, + // result was a column + ColumnarValue::Scalar(ScalarValue::Boolean(v)) => { + let v = v.unwrap_or(true); // None -> true per comments above + Ok(vec![v; statistics.num_containers()]) + } + other => { + Err(DataFusionError::Internal(format!( + "Unexpected result of pruning predicate evaluation. Expected Boolean array \ + or scalar but got {:?}", + other + ))) + } + } } /// Return a reference to the input schema @@ -391,8 +397,13 @@ fn build_statistics_record_batch( } let schema = Arc::new(Schema::new(fields)); - RecordBatch::try_new(schema, arrays) - .map_err(|err| DataFusionError::Plan(err.to_string())) + // provide the count in case there were no needed statistics + let mut options = RecordBatchOptions::default(); + options.row_count = Some(statistics.num_containers()); + + RecordBatch::try_new_with_options(schema, arrays, &options).map_err(|err| { + DataFusionError::Plan(format!("Can not create statistics record batch: {}", err)) + }) } struct PruningExpressionBuilder<'a> { @@ -1168,7 +1179,7 @@ mod tests { } #[test] - fn test_build_statistics_no_stats() { + fn test_build_statistics_no_required_stats() { let required_columns = RequiredStatColumns::new(); let statistics = OneContainerStats { @@ -1177,13 +1188,9 @@ mod tests { num_containers: 1, }; - let result = - build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); - assert!( - result.to_string().contains("Invalid argument error"), - "{}", - result - ); + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + assert_eq!(batch.num_rows(), 1); // had 1 container } #[test] @@ -1858,7 +1865,15 @@ mod tests { assert_eq!(result, expected_false); } - /// Creates setup for int32 chunk pruning + /// Creates a setup for chunk pruning, modeling a int32 column "i" + /// with 5 different containers (e.g. RowGroups). 
They have [min, + /// max]: + /// + /// i [-5, 5] + /// i [1, 11] + /// i [-11, -1] + /// i [NULL, NULL] + /// i [1, NULL] fn int32_setup() -> (SchemaRef, TestStatistics) { let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); @@ -1922,6 +1937,45 @@ mod tests { assert_eq!(result, expected_ret); } + #[test] + fn prune_int32_col_lte_zero_cast() { + let (schema, statistics) = int32_setup(); + + // Expression "cast(i as utf8) <= '0'" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass in theory, -0.22 (conservatively keep) + // i [-11, -1] ==> no rows could pass in theory (conservatively keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (conservatively keep) + let expected_ret = vec![true, true, true, true, true]; + + // cast(i as utf8) <= 0 + let expr = cast(col("i"), DataType::Utf8).lt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // try_cast(i as utf8) <= 0 + let expr = try_cast(col("i"), DataType::Utf8).lt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // cast(-i as utf8) >= 0 + let expr = + Expr::Negative(Box::new(cast(col("i"), DataType::Utf8))).gt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // try_cast(-i as utf8) >= 0 + let expr = + Expr::Negative(Box::new(try_cast(col("i"), DataType::Utf8))).gt_eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + #[test] fn prune_int32_col_eq_zero() { let (schema, statistics) = int32_setup(); @@ -1941,6 +1995,50 @@ mod tests { assert_eq!(result, expected_ret); } + #[test] + fn prune_int32_col_eq_zero_cast() { + let (schema, statistics) = int32_setup(); + + // Expression "cast(i as int64) = 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = vec![true, false, false, true, false]; + + let expr = cast(col("i"), DataType::Int64).eq(lit(0i64)); + let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + let expr = try_cast(col("i"), DataType::Int64).eq(lit(0i64)); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + + #[test] + fn prune_int32_col_eq_zero_cast_as_str() { + let (schema, statistics) = int32_setup(); + + // Note the cast is to a string where sorting properties are + // not the same as integers + // + // Expression "cast(i as utf8) = '0'" + // i [-5, 5] ==> some rows could pass (keep) + // i [1, 11] ==> no rows can pass (could keep) + // i [-11, -1] ==> no rows can pass (could keep) + // i [NULL, NULL] ==> unknown (keep) + // i [1, NULL] ==> no rows can pass (could keep) + let expected_ret = vec![true, true, true, true, true]; + + let expr = cast(col("i"), DataType::Utf8).eq(lit("0")); + let p = PruningPredicate::try_new(expr, schema).unwrap(); + let result = 
p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + #[test] fn prune_int32_col_lt_neg_one() { let (schema, statistics) = int32_setup(); diff --git a/datafusion/core/tests/parquet_pruning.rs b/datafusion/core/tests/parquet_pruning.rs index 1de1d7c8fe87..a7b918180016 100644 --- a/datafusion/core/tests/parquet_pruning.rs +++ b/datafusion/core/tests/parquet_pruning.rs @@ -237,7 +237,7 @@ async fn prune_int32_scalar_fun() { test_prune( Scenario::Int32, "SELECT * FROM t where abs(i) = 1", - Some(4), + Some(0), Some(0), 3, ) @@ -249,7 +249,7 @@ async fn prune_int32_complex_expr() { test_prune( Scenario::Int32, "SELECT * FROM t where i+1 = 1", - Some(4), + Some(0), Some(0), 2, ) @@ -261,7 +261,7 @@ async fn prune_int32_complex_expr_subtract() { test_prune( Scenario::Int32, "SELECT * FROM t where 1-i > 1", - Some(4), + Some(0), Some(0), 9, ) @@ -308,7 +308,7 @@ async fn prune_f64_scalar_fun() { test_prune( Scenario::Float64, "SELECT * FROM t where abs(f-1) <= 0.000001", - Some(4), + Some(0), Some(0), 1, ) @@ -321,7 +321,7 @@ async fn prune_f64_complex_expr() { test_prune( Scenario::Float64, "SELECT * FROM t where f+1 > 1.1", - Some(4), + Some(0), Some(0), 9, ) @@ -334,7 +334,7 @@ async fn prune_f64_complex_expr_subtract() { test_prune( Scenario::Float64, "SELECT * FROM t where 1-f > 1", - Some(4), + Some(0), Some(0), 9, ) From 81b57948778cfa485d8b3f2c1fe4485f3966e8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 20 Sep 2022 15:11:42 +0200 Subject: [PATCH 26/30] Push down limit to sort (#3530) * Push down limit to sort Support skip, fix test Fmt Add limit directly after sort Update comment Simplify parallel sort by using new pushdown Clippy * Update datafusion/core/src/physical_plan/sorts/sort.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe.rs | 2 +- .../src/physical_optimizer/parallel_sort.rs | 23 +++---- .../src/physical_optimizer/repartition.rs | 2 +- datafusion/core/src/physical_plan/planner.rs | 8 +-- .../core/src/physical_plan/sorts/sort.rs | 30 +++++++-- .../sorts/sort_preserving_merge.rs | 10 ++- datafusion/core/tests/order_spill_fuzz.rs | 2 +- datafusion/core/tests/user_defined_plan.rs | 1 + datafusion/expr/src/logical_plan/builder.rs | 2 + datafusion/expr/src/logical_plan/plan.rs | 8 ++- datafusion/expr/src/utils.rs | 3 +- .../optimizer/src/common_subexpr_eliminate.rs | 3 +- datafusion/optimizer/src/limit_push_down.rs | 63 ++++++++++++++++++- datafusion/proto/proto/datafusion.proto | 2 + datafusion/proto/src/logical_plan.rs | 3 +- 15 files changed, 132 insertions(+), 30 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index a1b43cf531f1..c9bf527091c0 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -1296,7 +1296,7 @@ mod tests { assert_eq!("\ Projection: #t1.c1 AS AAA, #t1.c2, #t1.c3, #t2.c1, #t2.c2, #t2.c3\ \n Limit: skip=0, fetch=1\ - \n Sort: #t1.c1 ASC NULLS FIRST, #t1.c2 ASC NULLS FIRST, #t1.c3 ASC NULLS FIRST, #t2.c1 ASC NULLS FIRST, #t2.c2 ASC NULLS FIRST, #t2.c3 ASC NULLS FIRST\ + \n Sort: #t1.c1 ASC NULLS FIRST, #t1.c2 ASC NULLS FIRST, #t1.c3 ASC NULLS FIRST, #t2.c1 ASC NULLS FIRST, #t2.c2 ASC NULLS FIRST, #t2.c3 ASC NULLS FIRST, fetch=1\ \n Inner Join: #t1.c1 = #t2.c1\ \n TableScan: t1 projection=[c1, c2, c3]\ \n TableScan: t2 projection=[c1, c2, c3]", diff --git a/datafusion/core/src/physical_optimizer/parallel_sort.rs b/datafusion/core/src/physical_optimizer/parallel_sort.rs index 
3361d8155f7f..e3ca60cb5cf7 100644 --- a/datafusion/core/src/physical_optimizer/parallel_sort.rs +++ b/datafusion/core/src/physical_optimizer/parallel_sort.rs @@ -20,7 +20,6 @@ use crate::{ error::Result, physical_optimizer::PhysicalOptimizerRule, physical_plan::{ - limit::GlobalLimitExec, sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, with_new_children_if_necessary, }, @@ -55,31 +54,33 @@ impl PhysicalOptimizerRule for ParallelSort { .map(|child| self.optimize(child.clone(), config)) .collect::>>()?; let plan = with_new_children_if_necessary(plan, children)?; - let children = plan.children(); let plan_any = plan.as_any(); - // GlobalLimitExec (SortExec preserve_partitioning=False) - // -> GlobalLimitExec (SortExec preserve_partitioning=True) - let parallel_sort = plan_any.downcast_ref::().is_some() - && children.len() == 1 - && children[0].as_any().downcast_ref::().is_some() - && !children[0] - .as_any() + // SortExec preserve_partitioning=False, fetch=Some(n)) + // -> SortPreservingMergeExec (SortExec preserve_partitioning=True, fetch=Some(n)) + let parallel_sort = plan_any.downcast_ref::().is_some() + && plan_any + .downcast_ref::() + .unwrap() + .fetch() + .is_some() + && !plan_any .downcast_ref::() .unwrap() .preserve_partitioning(); Ok(if parallel_sort { - let sort = children[0].as_any().downcast_ref::().unwrap(); + let sort = plan_any.downcast_ref::().unwrap(); let new_sort = SortExec::new_with_partitioning( sort.expr().to_vec(), sort.input().clone(), true, + sort.fetch(), ); let merge = SortPreservingMergeExec::new( sort.expr().to_vec(), Arc::new(new_sort), ); - with_new_children_if_necessary(plan, vec![Arc::new(merge)])? + Arc::new(merge) } else { plan.clone() }) diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 20aa59f0cd67..1d2b25908683 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -295,7 +295,7 @@ mod tests { expr: col("c1", &schema()).unwrap(), options: SortOptions::default(), }]; - Arc::new(SortExec::try_new(sort_exprs, input).unwrap()) + Arc::new(SortExec::try_new(sort_exprs, input, None).unwrap()) } fn projection_exec(input: Arc) -> Arc { diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 20a819622d5d..005a7943265a 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -590,9 +590,9 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; Arc::new(if can_repartition { - SortExec::new_with_partitioning(sort_keys, input_exec, true) + SortExec::new_with_partitioning(sort_keys, input_exec, true, None) } else { - SortExec::try_new(sort_keys, input_exec)? + SortExec::try_new(sort_keys, input_exec, None)? }) }; @@ -815,7 +815,7 @@ impl DefaultPhysicalPlanner { physical_partitioning, )?) ) } - LogicalPlan::Sort(Sort { expr, input, .. }) => { + LogicalPlan::Sort(Sort { expr, input, fetch, .. 
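+            // the Sort node's new `fetch` field is forwarded to SortExec::try_new below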
}) => { let physical_input = self.create_initial_plan(input, session_state).await?; let input_schema = physical_input.as_ref().schema(); let input_dfschema = input.as_ref().schema(); @@ -841,7 +841,7 @@ impl DefaultPhysicalPlanner { )), }) .collect::>>()?; - Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)) + Ok(Arc::new(SortExec::try_new(sort_expr, physical_input, *fetch)?)) } LogicalPlan::Join(Join { left, diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index cc0501785324..8e8457be2549 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -82,6 +82,7 @@ struct ExternalSorter { runtime: Arc, metrics_set: CompositeMetricsSet, metrics: BaselineMetrics, + fetch: Option, } impl ExternalSorter { @@ -92,6 +93,7 @@ impl ExternalSorter { metrics_set: CompositeMetricsSet, session_config: Arc, runtime: Arc, + fetch: Option, ) -> Self { let metrics = metrics_set.new_intermediate_baseline(partition_id); Self { @@ -104,6 +106,7 @@ impl ExternalSorter { runtime, metrics_set, metrics, + fetch, } } @@ -120,7 +123,7 @@ impl ExternalSorter { // NB timer records time taken on drop, so there are no // calls to `timer.done()` below. let _timer = tracking_metrics.elapsed_compute().timer(); - let partial = sort_batch(input, self.schema.clone(), &self.expr)?; + let partial = sort_batch(input, self.schema.clone(), &self.expr, self.fetch)?; in_mem_batches.push(partial); } Ok(()) @@ -657,6 +660,8 @@ pub struct SortExec { metrics_set: CompositeMetricsSet, /// Preserve partitions of input plan preserve_partitioning: bool, + /// Fetch highest/lowest n results + fetch: Option, } impl SortExec { @@ -664,8 +669,9 @@ impl SortExec { pub fn try_new( expr: Vec, input: Arc, + fetch: Option, ) -> Result { - Ok(Self::new_with_partitioning(expr, input, false)) + Ok(Self::new_with_partitioning(expr, input, false, fetch)) } /// Whether this `SortExec` preserves partitioning of the children @@ -679,12 +685,14 @@ impl SortExec { expr: Vec, input: Arc, preserve_partitioning: bool, + fetch: Option, ) -> Self { Self { expr, input, metrics_set: CompositeMetricsSet::new(), preserve_partitioning, + fetch, } } @@ -697,6 +705,11 @@ impl SortExec { pub fn expr(&self) -> &[PhysicalSortExpr] { &self.expr } + + /// If `Some(fetch)`, limits output to only the first "fetch" items + pub fn fetch(&self) -> Option { + self.fetch + } } impl ExecutionPlan for SortExec { @@ -750,6 +763,7 @@ impl ExecutionPlan for SortExec { self.expr.clone(), children[0].clone(), self.preserve_partitioning, + self.fetch, ))) } @@ -778,6 +792,7 @@ impl ExecutionPlan for SortExec { self.expr.clone(), self.metrics_set.clone(), context, + self.fetch(), ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -816,14 +831,14 @@ fn sort_batch( batch: RecordBatch, schema: SchemaRef, expr: &[PhysicalSortExpr], + fetch: Option, ) -> ArrowResult { - // TODO: pushup the limit expression to sort let sort_columns = expr .iter() .map(|e| e.evaluate_to_sort_column(&batch)) .collect::>>()?; - let indices = lexsort_to_indices(&sort_columns, None)?; + let indices = lexsort_to_indices(&sort_columns, fetch)?; // reorder all rows based on sorted indices let sorted_batch = RecordBatch::try_new( @@ -870,6 +885,7 @@ async fn do_sort( expr: Vec, metrics_set: CompositeMetricsSet, context: Arc, + fetch: Option, ) -> Result { debug!( "Start do_sort for partition {} of context session_id {} and task_id {:?}", @@ -887,6 +903,7 @@ async fn do_sort( 
metrics_set, Arc::new(context.session_config()), context.runtime_env(), + fetch, ); context.runtime_env().register_requester(sorter.id()); while let Some(batch) = input.next().await { @@ -949,6 +966,7 @@ mod tests { }, ], Arc::new(CoalescePartitionsExec::new(csv)), + None, )?); let result = collect(sort_exec, task_ctx).await?; @@ -1011,6 +1029,7 @@ mod tests { }, ], Arc::new(CoalescePartitionsExec::new(csv)), + None, )?); let task_ctx = session_ctx.task_ctx(); @@ -1083,6 +1102,7 @@ mod tests { options: SortOptions::default(), }], input, + None, )?); let result: Vec = collect(sort_exec, task_ctx).await?; @@ -1159,6 +1179,7 @@ mod tests { }, ], Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)?), + None, )?); assert_eq!(DataType::Float32, *sort_exec.schema().field(0).data_type()); @@ -1226,6 +1247,7 @@ mod tests { options: SortOptions::default(), }], blocking_exec, + None, )?); let fut = collect(sort_exec, task_ctx); diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index dc788be3380e..5db3c50e6c14 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -874,8 +874,12 @@ mod tests { sort: Vec, context: Arc, ) -> RecordBatch { - let sort_exec = - Arc::new(SortExec::new_with_partitioning(sort.clone(), input, true)); + let sort_exec = Arc::new(SortExec::new_with_partitioning( + sort.clone(), + input, + true, + None, + )); sorted_merge(sort_exec, sort, context).await } @@ -885,7 +889,7 @@ mod tests { context: Arc, ) -> RecordBatch { let merge = Arc::new(CoalescePartitionsExec::new(src)); - let sort_exec = Arc::new(SortExec::try_new(sort, merge).unwrap()); + let sort_exec = Arc::new(SortExec::try_new(sort, merge, None).unwrap()); let mut result = collect(sort_exec, context).await.unwrap(); assert_eq!(result.len(), 1); result.remove(0) diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index a2d47629b927..faaa2ae0bb3d 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -75,7 +75,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { }]; let exec = MemoryExec::try_new(&input, schema, None).unwrap(); - let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec)).unwrap()); + let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); let runtime_config = RuntimeConfig::new().with_memory_manager( MemoryManagerConfig::try_new_limit(pool_size, 1.0).unwrap(), diff --git a/datafusion/core/tests/user_defined_plan.rs b/datafusion/core/tests/user_defined_plan.rs index 13ddb1eb8da1..c577e48e7800 100644 --- a/datafusion/core/tests/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined_plan.rs @@ -299,6 +299,7 @@ impl OptimizerRule for TopKOptimizerRule { if let LogicalPlan::Sort(Sort { ref expr, ref input, + .. 
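+            // `..` ignores the Sort node's new `fetch` field; the TopK plan carries its own limit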
}) = **input { if expr.len() == 1 { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index def9927ab6b3..27837ff68ca8 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -396,6 +396,7 @@ impl LogicalPlanBuilder { return Ok(Self::from(LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &self.plan)?, input: Arc::new(self.plan.clone()), + fetch: None, }))); } @@ -403,6 +404,7 @@ impl LogicalPlanBuilder { let sort_plan = LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &plan)?, input: Arc::new(plan.clone()), + fetch: None, }); // remove pushed down sort columns let new_expr = schema diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f6b2b1b74f6a..049e6158ca8f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -806,7 +806,7 @@ impl LogicalPlan { "Aggregate: groupBy=[{:?}], aggr=[{:?}]", group_expr, aggr_expr ), - LogicalPlan::Sort(Sort { expr, .. }) => { + LogicalPlan::Sort(Sort { expr, fetch, .. }) => { write!(f, "Sort: ")?; for (i, expr_item) in expr.iter().enumerate() { if i > 0 { @@ -814,6 +814,10 @@ impl LogicalPlan { } write!(f, "{:?}", expr_item)?; } + if let Some(a) = fetch { + write!(f, ", fetch={}", a)?; + } + Ok(()) } LogicalPlan::Join(Join { @@ -1373,6 +1377,8 @@ pub struct Sort { pub expr: Vec, /// The incoming logical plan pub input: Arc, + /// Optional fetch limit + pub fetch: Option, } /// Join two logical plans on one or more join columns diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 65c4be24849b..4ace809b811d 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -420,9 +420,10 @@ pub fn from_plan( expr[group_expr.len()..].to_vec(), schema.clone(), )?)), - LogicalPlan::Sort(Sort { .. }) => Ok(LogicalPlan::Sort(Sort { + LogicalPlan::Sort(Sort { fetch, .. }) => Ok(LogicalPlan::Sort(Sort { expr: expr.to_vec(), input: Arc::new(inputs[0].clone()), + fetch: *fetch, })), LogicalPlan::Join(Join { join_type, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 978b79d375d3..b17dd9492efc 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -196,7 +196,7 @@ fn optimize( schema.clone(), )?)) } - LogicalPlan::Sort(Sort { expr, input }) => { + LogicalPlan::Sort(Sort { expr, input, fetch }) => { let arrays = to_arrays(expr, input, &mut expr_set)?; let (mut new_expr, new_input) = rewrite_expr( @@ -210,6 +210,7 @@ fn optimize( Ok(LogicalPlan::Sort(Sort { expr: pop_expr(&mut new_expr)?, input: Arc::new(new_input), + fetch: *fetch, })) } LogicalPlan::Join { .. 
} diff --git a/datafusion/optimizer/src/limit_push_down.rs b/datafusion/optimizer/src/limit_push_down.rs index 65bdd64ee0c5..cacbbfc5b2b1 100644 --- a/datafusion/optimizer/src/limit_push_down.rs +++ b/datafusion/optimizer/src/limit_push_down.rs @@ -20,7 +20,9 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ - logical_plan::{Join, JoinType, Limit, LogicalPlan, Projection, TableScan, Union}, + logical_plan::{ + Join, JoinType, Limit, LogicalPlan, Projection, Sort, TableScan, Union, + }, utils::from_plan, }; use std::sync::Arc; @@ -247,6 +249,25 @@ fn limit_push_down( ), } } + ( + LogicalPlan::Sort(Sort { expr, input, fetch }), + Ancestor::FromLimit { + skip: ancestor_skip, + fetch: Some(ancestor_fetch), + .. + }, + ) => { + // Update Sort `fetch`, but simply recurse through children (sort should receive all input for sorting) + let input = push_down_children_limit(_optimizer, _optimizer_config, input)?; + let sort_fetch = ancestor_skip + ancestor_fetch; + let plan = LogicalPlan::Sort(Sort { + expr: expr.clone(), + input: Arc::new(input), + fetch: Some(fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)), + }); + Ok(plan) + } + // For other nodes we can't push down the limit // But try to recurse and find other limit nodes to push down _ => push_down_children_limit(_optimizer, _optimizer_config, plan), @@ -340,6 +361,8 @@ impl OptimizerRule for LimitPushDown { #[cfg(test)] mod test { + use std::vec; + use super::*; use crate::test::*; use datafusion_expr::{ @@ -438,6 +461,44 @@ mod test { Ok(()) } + #[test] + fn limit_push_down_sort() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + // Should push down limit to sort + let expected = "Limit: skip=0, fetch=10\ + \n Sort: #test.a, fetch=10\ + \n TableScan: test"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_skip() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort(vec![col("a")])? + .limit(5, Some(10))? 
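+            // with skip=5 and fetch=10 the Sort must keep skip + fetch = 15 rows,
+            // hence the `fetch=15` in the expected plan below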
+ .build()?; + + // Should push down limit to sort + let expected = "Limit: skip=5, fetch=10\ + \n Sort: #test.a, fetch=15\ + \n TableScan: test"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + #[test] fn multi_stage_limit_recurses_to_deeper_limit() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 0e499ee48ffc..7ec4705b03b6 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -125,6 +125,8 @@ message SelectionNode { message SortNode { LogicalPlanNode input = 1; repeated datafusion.LogicalExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; } message RepartitionNode { diff --git a/datafusion/proto/src/logical_plan.rs b/datafusion/proto/src/logical_plan.rs index 45399554b46d..8f9f8a9c4661 100644 --- a/datafusion/proto/src/logical_plan.rs +++ b/datafusion/proto/src/logical_plan.rs @@ -959,7 +959,7 @@ impl AsLogicalPlan for LogicalPlanNode { ))), }) } - LogicalPlan::Sort(Sort { input, expr }) => { + LogicalPlan::Sort(Sort { input, expr, fetch }) => { let input: protobuf::LogicalPlanNode = protobuf::LogicalPlanNode::try_from_logical_plan( input.as_ref(), @@ -974,6 +974,7 @@ impl AsLogicalPlan for LogicalPlanNode { protobuf::SortNode { input: Some(Box::new(input)), expr: selection_expr, + fetch: fetch.map(|f| f as i64).unwrap_or(-1i64), }, ))), }) From 1261741af2a5e142fa0c7916e759859cc18ea59a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 20 Sep 2022 12:10:07 -0600 Subject: [PATCH 27/30] Bug fix: expr_visitor was not visiting aggregate filter expressions (#3548) --- datafusion/expr/src/expr_visitor.rs | 18 ++++++++--- .../optimizer/src/projection_push_down.rs | 30 +++++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/expr_visitor.rs b/datafusion/expr/src/expr_visitor.rs index 0f2f5077a9a9..3885456cc3a9 100644 --- a/datafusion/expr/src/expr_visitor.rs +++ b/datafusion/expr/src/expr_visitor.rs @@ -176,12 +176,22 @@ impl ExprVisitable for Expr { Ok(visitor) } } - Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } - | Expr::AggregateFunction { args, .. } - | Expr::AggregateUDF { args, .. } => args + Expr::ScalarFunction { args, .. } | Expr::ScalarUDF { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), + Expr::AggregateFunction { args, filter, .. } + | Expr::AggregateUDF { args, filter, .. 
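+            // visit the optional FILTER clause expression as well as the aggregate arguments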
} => { + if let Some(f) = filter { + let mut aggr_exprs = args.clone(); + aggr_exprs.push(f.as_ref().clone()); + aggr_exprs + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor)) + } else { + args.iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor)) + } + } Expr::WindowFunction { args, partition_by, diff --git a/datafusion/optimizer/src/projection_push_down.rs b/datafusion/optimizer/src/projection_push_down.rs index 5a0b38b575ef..86f73c061a50 100644 --- a/datafusion/optimizer/src/projection_push_down.rs +++ b/datafusion/optimizer/src/projection_push_down.rs @@ -532,9 +532,9 @@ mod tests { use crate::test::*; use arrow::datatypes::DataType; use datafusion_expr::{ - col, lit, + col, count, lit, logical_plan::{builder::LogicalPlanBuilder, JoinType}, - max, min, Expr, + max, min, AggregateFunction, Expr, }; use std::collections::HashMap; @@ -990,6 +990,32 @@ mod tests { Ok(()) } + #[test] + fn aggregate_filter_pushdown() -> Result<()> { + let table_scan = test_table_scan()?; + + let aggr_with_filter = Expr::AggregateFunction { + fun: AggregateFunction::Count, + args: vec![col("b")], + distinct: false, + filter: Some(Box::new(col("c").gt(lit(42)))), + }; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate( + vec![col("a")], + vec![count(col("b")), aggr_with_filter.alias("count2")], + )? + .build()?; + + let expected = "Aggregate: groupBy=[[#test.a]], aggr=[[COUNT(#test.b), COUNT(#test.b) FILTER (WHERE #c > Int32(42)) AS count2]]\ + \n TableScan: test projection=[a, b, c]"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let optimized_plan = optimize(plan).expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); From c590d2adcced18e73c0708c9abb371ed446a8879 Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Tue, 20 Sep 2022 14:44:29 -0400 Subject: [PATCH 28/30] Make ParquetScanOptions public and add method to get a reference from ParquetExec --- datafusion/core/src/physical_plan/file_format/mod.rs | 4 +++- datafusion/core/src/physical_plan/file_format/parquet.rs | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index 295fef27b291..2926f18a2023 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -30,7 +30,9 @@ mod row_filter; pub(crate) use self::csv::plan_to_csv; pub use self::csv::CsvExec; pub(crate) use self::parquet::plan_to_parquet; -pub use self::parquet::{ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory}; +pub use self::parquet::{ + ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory, ParquetScanOptions, +}; use arrow::{ array::{ArrayData, ArrayRef, DictionaryArray}, buffer::Buffer, diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index a6ce44b22c8d..f4c52e5dc699 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -81,11 +81,13 @@ pub struct ParquetScanOptions { } impl ParquetScanOptions { + /// Set whether to pushdown pruning predicate to the parquet scan pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self { self.pushdown_filters = pushdown_filters; self } + /// Set whether to reorder pruning predicate expressions in order to minimize evaluation 
cost pub fn with_reorder_predicates(mut self, reorder_predicates: bool) -> Self { self.reorder_predicates = reorder_predicates; self @@ -182,6 +184,11 @@ impl ParquetExec { self.scan_options = scan_options; self } + + /// Ref to the `ParquetScanOptions` + pub fn parquet_scan_options(&self) -> &ParquetScanOptions { + &self.scan_options + } } /// Stores metrics about the parquet execution for a particular parquet file. From 3fbb189e83433f879497556fb257866ee6ed126c Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Wed, 21 Sep 2022 10:53:50 -0400 Subject: [PATCH 29/30] Fix issue in type coercion optimizer --- datafusion/core/tests/sql/explain_analyze.rs | 2 +- datafusion/optimizer/src/type_coercion.rs | 21 +++++++++++++- datafusion/proto/src/lib.rs | 30 ++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a75e0e3fa515..f2069126c5ff 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -653,7 +653,7 @@ order by let expected = "\ Sort: #revenue DESC NULLS FIRST\ \n Projection: #customer.c_custkey, #customer.c_name, #SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, #customer.c_acctbal, #nation.n_name, #customer.c_address, #customer.c_phone, #customer.c_comment\ - \n Aggregate: groupBy=[[#customer.c_custkey, #customer.c_name, #customer.c_acctbal, #customer.c_phone, #nation.n_name, #customer.c_address, #customer.c_comment]], aggr=[[SUM(CAST(#lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(#lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)))]]\ + \n Aggregate: groupBy=[[#customer.c_custkey, #customer.c_name, #customer.c_acctbal, #customer.c_phone, #nation.n_name, #customer.c_address, #customer.c_comment]], aggr=[[SUM(CAST(#lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(#lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]]\ \n Inner Join: #customer.c_nationkey = #nation.n_nationkey\ \n Inner Join: #orders.o_orderkey = #lineitem.l_orderkey\ \n Inner Join: #customer.c_custkey = #orders.o_custkey\ diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 9d8e97f36a71..a5195af6a4de 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -76,10 +76,29 @@ impl OptimizerRule for TypeCoercion { const_evaluator, }; + let original_expr_names: Vec> = plan + .expressions() + .iter() + .map(|expr| expr.name().ok()) + .collect(); + let new_expr = plan .expressions() .into_iter() - .map(|expr| expr.rewrite(&mut expr_rewrite)) + .zip(original_expr_names) + .map(|(expr, original_name)| { + let expr = expr.rewrite(&mut expr_rewrite)?; + + if matches!(expr, Expr::AggregateFunction { .. 
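+                    // coercion can rewrite aggregate arguments (e.g. insert casts), which
+                    // changes the auto-generated name; re-alias to keep the original output name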
}) { + if let Some((alias, name)) = original_name.zip(expr.name().ok()) { + if alias != name { + return Ok(expr.alias(&alias)); + } + } + } + + Ok(expr) + }) .collect::>>()?; from_plan(plan, &new_expr, &new_inputs) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index f34da705f94a..e4a28d51dd4d 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -54,6 +54,7 @@ mod roundtrip_tests { logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; use crate::logical_plan::LogicalExtensionCodec; + use arrow::datatypes::Schema; use arrow::{ array::ArrayRef, datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}, @@ -128,6 +129,35 @@ mod roundtrip_tests { Ok(()) } + #[tokio::test] + async fn roundtrip_logical_plan_aggregation() -> Result<(), DataFusionError> { + let ctx = SessionContext::new(); + + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Decimal128(15, 2), true), + ]); + + ctx.register_csv( + "t1", + "testdata/test.csv", + CsvReadOptions::default().schema(&schema), + ) + .await?; + + let query = + "SELECT a, SUM(b + 1) as b_sum FROM t1 GROUP BY a ORDER BY b_sum DESC"; + let plan = ctx.sql(query).await?.to_logical_plan()?; + + println!("{:?}", plan); + + let bytes = logical_plan_to_bytes(&plan)?; + let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); + + Ok(()) + } + #[tokio::test] async fn roundtrip_logical_plan_with_extension() -> Result<(), DataFusionError> { let ctx = SessionContext::new(); From 7f45d3d7c4c4b5badd18eb886a50a863482a45d1 Mon Sep 17 00:00:00 2001 From: Dan Harris Date: Thu, 29 Sep 2022 10:07:01 -0400 Subject: [PATCH 30/30] Fix clippy warnings --- .github/workflows/rust.yml | 102 ------------------ .../file_format/delimited_stream.rs | 2 +- .../core/src/physical_plan/hash_join.rs | 4 +- 3 files changed, 3 insertions(+), 105 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b9a3311a7fb0..4325a0a060d4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -165,108 +165,6 @@ jobs: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres - windows: - name: cargo test (win64) - runs-on: windows-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Install protobuf compiler - shell: bash - run: | - mkdir -p $HOME/d/protoc - cd $HOME/d/protoc - export PROTO_ZIP="protoc-21.4-win64.zip" - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP - unzip $PROTO_ZIP - export PATH=$PATH:$HOME/d/protoc/bin - protoc.exe --version - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. 
- - name: Setup Rust toolchain - run: | - rustup toolchain install stable - rustup default stable - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - export PATH=$PATH:$HOME/d/protoc/bin - cargo test - env: - # do not produce debug symbols to keep memory usage down - RUSTFLAGS: "-C debuginfo=0" - - macos: - name: cargo test (mac) - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Install protobuf compiler - shell: bash - run: | - mkdir -p $HOME/d/protoc - cd $HOME/d/protoc - export PROTO_ZIP="protoc-21.4-osx-x86_64.zip" - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP - unzip $PROTO_ZIP - echo "$HOME/d/protoc/bin" >> $GITHUB_PATH - export PATH=$PATH:$HOME/d/protoc/bin - protoc --version - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. - - name: Setup Rust toolchain - run: | - rustup toolchain install stable - rustup default stable - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - cargo test - env: - # do not produce debug symbols to keep memory usage down - RUSTFLAGS: "-C debuginfo=0" - - test-datafusion-pyarrow: - name: cargo test pyarrow (amd64) - needs: [linux-build-lib] - runs-on: ubuntu-latest - container: - image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - name: Install PyArrow - run: | - echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - python -m pip install pyarrow - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable - - name: Run tests - run: | - cd datafusion - cargo test --features=pyarrow - lint: name: Lint runs-on: ubuntu-latest diff --git a/datafusion/core/src/physical_plan/file_format/delimited_stream.rs b/datafusion/core/src/physical_plan/file_format/delimited_stream.rs index 1b6e30a9464e..68878da08670 100644 --- a/datafusion/core/src/physical_plan/file_format/delimited_stream.rs +++ b/datafusion/core/src/physical_plan/file_format/delimited_stream.rs @@ -68,7 +68,7 @@ impl LineDelimiter { } else if *is_quote { None } else { - (*v == NEWLINE).then(|| idx + 1) + (*v == NEWLINE).then_some(idx + 1) } }); diff --git a/datafusion/core/src/physical_plan/hash_join.rs b/datafusion/core/src/physical_plan/hash_join.rs index 8cb7445a24dd..a22bcbc13c7e 100644 --- a/datafusion/core/src/physical_plan/hash_join.rs +++ b/datafusion/core/src/physical_plan/hash_join.rs @@ -1240,12 +1240,12 @@ fn produce_from_matched( let indices = if unmatched { UInt64Array::from_iter_values( (0..visited_left_side.len()) - .filter_map(|v| (!visited_left_side.get_bit(v)).then(|| v as u64)), + .filter_map(|v| (!visited_left_side.get_bit(v)).then_some(v as u64)), ) } else { UInt64Array::from_iter_values( (0..visited_left_side.len()) - .filter_map(|v| (visited_left_side.get_bit(v)).then(|| v as u64)), + .filter_map(|v| (visited_left_side.get_bit(v)).then_some(v as u64)), ) };
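
A minimal usage sketch (not part of the patches above; `sort_exprs`, `input`, and
`task_ctx` are assumed to be set up as in the sort.rs tests) showing how the new
`fetch` parameter on `SortExec` composes:

    // sort `input` by `sort_exprs` but produce at most the first 10 rows,
    // mirroring what the limit push-down now plans for `ORDER BY ... LIMIT 10`
    let sort_exec = Arc::new(SortExec::try_new(sort_exprs, input, Some(10))?);
    let results = collect(sort_exec, task_ctx).await?;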