From 589ee683dbb2dc742e8b9f5a474a4e3c91f6e186 Mon Sep 17 00:00:00 2001
From: Paul Blankley <31221512+pblankley@users.noreply.github.com>
Date: Wed, 8 May 2024 18:48:49 -0600
Subject: [PATCH] Feature/fix count non additive metrics (#191)

* fix counts for non additive dims

* fix week datatype bigquery issue

* bump version to 0.12.12

---------

Co-authored-by: Paul Blankley
---
 metrics_layer/core/model/field.py             | 14 +++++---
 pyproject.toml                                |  2 +-
 .../config/metrics_layer_config/views/mrr.yml | 16 +++++++++
 tests/test_join_query.py                      | 35 +++++++++++++++++--
 tests/test_listing_functions.py               |  2 +-
 tests/test_non_additive_dimensions.py         | 20 +++++++++++
 tests/test_simple_query.py                    |  9 +++--
 7 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/metrics_layer/core/model/field.py b/metrics_layer/core/model/field.py
index d50843d..31892d1 100644
--- a/metrics_layer/core/model/field.py
+++ b/metrics_layer/core/model/field.py
@@ -285,7 +285,9 @@ def sql(self):
 
         else_0 = False
         if non_additive_dimension := self.non_additive_dimension:
-            else_0 = True
+            # We need to do else 0 if it's a numeric operation like sum, average, etc
+            # But we need to do else null if it is a non numeric op like count, count_distinct
+            else_0 = self.type not in {ZenlyticType.count, ZenlyticType.count_distinct}
             if isinstance(self.non_additive_dimension, dict):
                 filters_to_apply += [
                     {
@@ -558,8 +560,6 @@ def strict_replaced_query(self):
 
     def _needs_symmetric_aggregate(self, functional_pk: MetricsLayerBase):
         if functional_pk:
-            if functional_pk == Definitions.does_not_exist:
-                return True
             try:
                 field_pk_id = self.view.primary_key.id()
             except AttributeError:
@@ -569,7 +569,9 @@
                     "Define the primary key by adding primary_key: yes to the field "
                     "that is the primary key of the table."
                 )
-            different_functional_pk = field_pk_id != functional_pk.id()
+            different_functional_pk = (
+                functional_pk == Definitions.does_not_exist or field_pk_id != functional_pk.id()
+            )
         else:
             different_functional_pk = False
         return different_functional_pk
@@ -1103,7 +1105,9 @@ def apply_dimension_group_time_sql(self, sql: str, query_type: str):
                     f"CAST(DATETIME_TRUNC(CAST({s} AS DATETIME), HOUR) AS {self.datatype.upper()})"
                 ),
                 "date": lambda s, qt: f"CAST(DATE_TRUNC(CAST({s} AS DATE), DAY) AS {self.datatype.upper()})",
-                "week": self._week_dimension_group_time_sql,
+                "week": lambda s, qt: (
+                    f"CAST({self._week_dimension_group_time_sql(s, qt)} AS {self.datatype.upper()})"
+                ),
                 "month": lambda s, qt: (  # noqa
                     f"CAST(DATE_TRUNC(CAST({s} AS DATE), MONTH) AS {self.datatype.upper()})"
                 ),
diff --git a/pyproject.toml b/pyproject.toml
index 365b1cd..46461ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "metrics_layer"
-version = "0.12.11"
+version = "0.12.12"
 description = "The open source metrics layer."
 authors = ["Paul Blankley"]
 keywords = ["Metrics Layer", "Business Intelligence", "Analytics"]
diff --git a/tests/config/metrics_layer_config/views/mrr.yml b/tests/config/metrics_layer_config/views/mrr.yml
index 81106b2..0fc812c 100644
--- a/tests/config/metrics_layer_config/views/mrr.yml
+++ b/tests/config/metrics_layer_config/views/mrr.yml
@@ -72,6 +72,22 @@ fields:
     type: count
     sql: ${parent_account_id}
 
+  - name: accounts_beginning_of_month
+    field_type: measure
+    type: count
+    sql: ${parent_account_id}
+    non_additive_dimension:
+      name: record_raw
+      window_choice: min
+
+  - name: accounts_end_of_month
+    field_type: measure
+    type: count_distinct
+    sql: ${parent_account_id}
+    non_additive_dimension:
+      name: record_raw
+      window_choice: max
+
   - name: mrr_end_of_month
     field_type: measure
     type: sum
diff --git a/tests/test_join_query.py b/tests/test_join_query.py
index 4f2c380..1dbf98e 100644
--- a/tests/test_join_query.py
+++ b/tests/test_join_query.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import pytest
 
 from metrics_layer.core.exceptions import JoinError, QueryError
@@ -68,6 +70,36 @@ def test_query_no_join_average_distinct(connection):
     assert query == correct
 
 
+@pytest.mark.query
+@pytest.mark.parametrize("field", ["order_lines.order_week", "orders.order_week"])
+def test_query_bigquery_week_filter_type_conversion(connection, field):
+    query = connection.get_sql_query(
+        metrics=["total_item_revenue"],
+        dimensions=["channel"],
+        where=[
+            {
+                "field": field,
+                "expression": "greater_than",
+                "value": datetime(year=2021, month=8, day=4),
+            }
+        ],
+        query_type="BIGQUERY",
+    )
+
+    cast_as = "DATE" if "order_lines.order_week" == field else "TIMESTAMP"
+    sql_field = "order_lines.order_date" if "order_lines.order_week" == field else "orders.order_date"
+    join = ""
+    if "orders" in field:
+        join = "LEFT JOIN analytics.orders orders ON order_lines.order_unique_id=orders.id "
+    correct = (
+        "SELECT order_lines.sales_channel as order_lines_channel,SUM(order_lines.revenue) as"
+        f" order_lines_total_item_revenue FROM analytics.order_line_items order_lines {join}WHERE"
+        f" CAST(DATE_TRUNC(CAST({sql_field} AS DATE), WEEK) AS {cast_as})>{cast_as}('2021-08-04 00:00:00')"
+        " GROUP BY order_lines_channel;"
+    )
+    assert query == correct
+
+
 @pytest.mark.query
 def test_query_single_join(connection):
     query = connection.get_sql_query(metrics=["total_item_revenue"], dimensions=["channel", "new_vs_repeat"])
@@ -231,7 +263,6 @@ def test_query_single_join_metric_with_sub_field(connection):
     assert query == correct
 
 
-# TODO need one like this with order lines and rainfall
 @pytest.mark.query
 def test_query_single_join_with_forced_additional_join(connection):
     query = connection.get_sql_query(
@@ -249,7 +280,7 @@
         "(country_detail.rain) IS NOT NULL THEN country_detail.country ELSE NULL END), "
        "0)) as country_detail_avg_rainfall FROM analytics.discount_detail discount_detail "
         "LEFT JOIN analytics_live.discounts discounts ON discounts.discount_id=discount_detail.discount_id "
-        "AND DATE_TRUNC(CAST(discounts.order_date AS DATE), WEEK) is not null LEFT JOIN "
+        "AND CAST(DATE_TRUNC(CAST(discounts.order_date AS DATE), WEEK) AS TIMESTAMP) is not null LEFT JOIN "
         "(SELECT * FROM ANALYTICS.COUNTRY_DETAIL) as country_detail "
         "ON discounts.country=country_detail.country "
         "GROUP BY discount_detail_discount_promo_name;"
diff --git a/tests/test_listing_functions.py b/tests/test_listing_functions.py
index 8eb5f9f..f85e9c4 100644
--- a/tests/test_listing_functions.py
+++ b/tests/test_listing_functions.py
@@ -4,7 +4,7 @@
 @pytest.mark.project
 def test_list_metrics(connection):
     metrics = connection.list_metrics()
-    assert len(metrics) == 55
+    assert len(metrics) == 57
 
     metrics = connection.list_metrics(view_name="order_lines", names_only=True)
     assert len(metrics) == 11
diff --git a/tests/test_non_additive_dimensions.py b/tests/test_non_additive_dimensions.py
index b0cd772..5fcb0d2 100644
--- a/tests/test_non_additive_dimensions.py
+++ b/tests/test_non_additive_dimensions.py
@@ -19,6 +19,26 @@ def test_mrr_non_additive_dimension_no_group_by_max(connection, metric_suffix):
     assert query == correct
 
 
+@pytest.mark.query
+@pytest.mark.parametrize("metric_suffix", ["end_of_month", "beginning_of_month"])
+def test_mrr_non_additive_dimension_no_group_by_counts(connection, metric_suffix):
+    query = connection.get_sql_query(metrics=[f"accounts_{metric_suffix}"])
+
+    func = "MAX" if metric_suffix == "end_of_month" else "MIN"
+    agg = "COUNT(DISTINCT(" if metric_suffix == "end_of_month" else "COUNT("
+    close = "))" if metric_suffix == "end_of_month" else ")"
+    correct = (
+        f"WITH cte_accounts_{metric_suffix}_record_raw AS (SELECT {func}(mrr.record_date) as"
+        f" mrr_{func.lower()}_record_raw FROM analytics.mrr_by_customer mrr ORDER BY"
+        f" mrr_{func.lower()}_record_raw DESC) SELECT {agg}case when"
+        f" mrr.record_date=cte_accounts_{metric_suffix}_record_raw.mrr_{func.lower()}_record_raw then"
+        f" mrr.parent_account_id end{close} as mrr_accounts_{metric_suffix} FROM analytics.mrr_by_customer"
+        f" mrr JOIN cte_accounts_{metric_suffix}_record_raw ON 1=1 ORDER BY"
+        f" mrr_accounts_{metric_suffix} DESC;"
+    )
+    assert query == correct
+
+
 @pytest.mark.query
 def test_mrr_non_additive_dimension_no_group_by_multi_cte(connection):
     query = connection.get_sql_query(
diff --git a/tests/test_simple_query.py b/tests/test_simple_query.py
index 0cebaf4..23aec84 100644
--- a/tests/test_simple_query.py
+++ b/tests/test_simple_query.py
@@ -496,8 +496,8 @@ def test_simple_query_dimension_group_timezone(connections, field: str, group: s
             " AS TIMESTAMP) AS DATE), DAY) AS TIMESTAMP)"
         ),
         "week": (  # noqa
-            "CAST(DATE_TRUNC(CAST(CAST(DATETIME(CAST(simple.order_date AS TIMESTAMP), 'America/New_York')"
-            " AS TIMESTAMP) AS DATE) + 1, WEEK) - 1 AS TIMESTAMP)"
+            "CAST(CAST(DATE_TRUNC(CAST(CAST(DATETIME(CAST(simple.order_date AS TIMESTAMP),"
+            " 'America/New_York') AS TIMESTAMP) AS DATE) + 1, WEEK) - 1 AS TIMESTAMP) AS TIMESTAMP)"
         ),
     }
     where = (
@@ -849,7 +849,10 @@ def test_simple_query_dimension_group(connections, group: str, query_type: str):
         "minute": "CAST(DATETIME_TRUNC(CAST(simple.order_date AS DATETIME), MINUTE) AS TIMESTAMP)",
         "hour": "CAST(DATETIME_TRUNC(CAST(simple.order_date AS DATETIME), HOUR) AS TIMESTAMP)",
         "date": "CAST(DATE_TRUNC(CAST(simple.order_date AS DATE), DAY) AS TIMESTAMP)",
-        "week": "CAST(DATE_TRUNC(CAST(simple.order_date AS DATE) + 1, WEEK) - 1 AS TIMESTAMP)",
+        "week": (
+            "CAST(CAST(DATE_TRUNC(CAST(simple.order_date AS DATE) + 1, WEEK) - 1 AS TIMESTAMP) AS"
+            " TIMESTAMP)"
+        ),
         "month": "CAST(DATE_TRUNC(CAST(simple.order_date AS DATE), MONTH) AS TIMESTAMP)",
         "quarter": "CAST(DATE_TRUNC(CAST(simple.order_date AS DATE), QUARTER) AS TIMESTAMP)",
         "year": "CAST(DATE_TRUNC(CAST(simple.order_date AS DATE), YEAR) AS TIMESTAMP)",
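
Reviewer note on the non-additive count fix: with a non_additive_dimension the measure is wrapped in a CASE WHEN that keeps only rows matching the window (e.g. MAX or MIN of record_date). For sum/average measures the fallback stays ELSE 0, but for count and count_distinct the CASE must fall through to NULL (no ELSE at all), otherwise COUNT would count the literal 0 produced for every out-of-window row. The snippet below is a standalone sketch of that rule only; build_case_expression and NON_NUMERIC_AGGS are hypothetical names, not the metrics_layer API, and they simply mirror the new else_0 logic in field.py and the expected SQL in test_non_additive_dimensions.py.

# Standalone sketch (assumed helper names; not part of metrics_layer).
NON_NUMERIC_AGGS = {"count", "count_distinct"}  # mirrors ZenlyticType.count / count_distinct


def build_case_expression(measure_type: str, sql: str, window_condition: str) -> str:
    """Keep only rows inside the non-additive window; choose the fallback by measure type."""
    # Numeric aggregates (sum, average, ...) keep ELSE 0; counts must fall through to NULL,
    # since COUNT counts every non-NULL value, including a 0 literal.
    else_clause = "" if measure_type in NON_NUMERIC_AGGS else " else 0"
    return f"case when {window_condition} then {sql}{else_clause} end"


# Example shaped like the new accounts_end_of_month test:
expr = build_case_expression(
    "count_distinct",
    "mrr.parent_account_id",
    "mrr.record_date=cte_accounts_end_of_month_record_raw.mrr_max_record_raw",
)
print(f"COUNT(DISTINCT({expr}))")
# COUNT(DISTINCT(case when mrr.record_date=... then mrr.parent_account_id end))

The week datatype change is separate: apply_dimension_group_time_sql now wraps the _week_dimension_group_time_sql output in a CAST back to the field's datatype, so on BigQuery a week-truncated filter compares CAST(DATE_TRUNC(... ) AS TIMESTAMP) against TIMESTAMP('2021-08-04 00:00:00') (or DATE against DATE for a DATE-typed field), as exercised by test_query_bigquery_week_filter_type_conversion.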