From 7424eb450a806df72ce0f00e0b1ece7901fc7812 Mon Sep 17 00:00:00 2001 From: suibianwanwan Date: Thu, 6 Mar 2025 23:28:35 +0800 Subject: [PATCH] Minor: Improve documentation of `need_handle_count_bug` --- datafusion/optimizer/src/decorrelate.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index b192f9740483..ff1056f2a818 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -56,10 +56,14 @@ pub struct PullUpCorrelatedExpr { /// Indicates if we encounter any correlated expression that can not be pulled up /// above a aggregation without changing the meaning of the query. can_pull_over_aggregation: bool, - /// Do we need to handle [the Count bug] during the pull up process. - /// TODO this parameter should be removed or renamed semantically + /// Do we need to handle the "count bug" during the pull up process. /// - /// [the Count bug]: https://github.com/apache/datafusion/issues/10553 + /// The "count bug" was described in [Optimization of Nested SQL + /// Queries Revisited](https://dl.acm.org/doi/pdf/10.1145/38714.38723). This bug is + /// not specific to the COUNT function, and it can occur with any aggregate function, + /// such as SUM, AVG, etc. The anomaly arises because aggregates fail to distinguish + /// between an empty set and null values when optimizing a correlated query as a join. + /// Here, we use "the count bug" to refer to all such cases. pub need_handle_count_bug: bool, /// mapping from the plan to its expressions' evaluation result on empty batch pub collected_count_expr_map: HashMap,