From c3571266ce3dd980e2e81f5735337e14ff621b19 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 7 Dec 2023 16:50:33 -0800 Subject: [PATCH 01/35] wip --- dbt/adapters/databricks/relation_configs/base.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dbt/adapters/databricks/relation_configs/base.py diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py new file mode 100644 index 000000000..e69de29bb From 1bddd0b8097a510082b25ef35aab955680a48ce6 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 12 Dec 2023 12:18:05 -0800 Subject: [PATCH 02/35] wip --- .../databricks/relation_configs/base.py | 135 ++++++++++++++++++ .../databricks/relation_configs/comment.py | 46 ++++++ .../relation_configs/materialized_view.py | 19 +++ .../databricks/relation_configs/query.py | 22 +++ .../databricks/relation_configs/refresh.py | 86 +++++++++++ .../databricks/relation_configs/util.py | 7 + tests/unit/relation_configs/test_comment.py | 10 ++ .../unit/relation_configs/test_config_base.py | 129 +++++++++++++++++ tests/unit/relation_configs/test_refresh.py | 67 +++++++++ tests/unit/relation_configs/test_util.py | 16 +++ 10 files changed, 537 insertions(+) create mode 100644 dbt/adapters/databricks/relation_configs/comment.py create mode 100644 dbt/adapters/databricks/relation_configs/materialized_view.py create mode 100644 dbt/adapters/databricks/relation_configs/query.py create mode 100644 dbt/adapters/databricks/relation_configs/refresh.py create mode 100644 dbt/adapters/databricks/relation_configs/util.py create mode 100644 tests/unit/relation_configs/test_comment.py create mode 100644 tests/unit/relation_configs/test_config_base.py create mode 100644 tests/unit/relation_configs/test_refresh.py create mode 100644 tests/unit/relation_configs/test_util.py diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index e69de29bb..62768e230 100644 
--- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -0,0 +1,135 @@ +from abc import ABC +from dataclasses import InitVar, dataclass +import itertools +from typing import Any, ClassVar, Dict, Generic, List, Optional, TypeVar, Union +from dbt.adapters.relation_configs.config_base import RelationConfigBase +from dbt.adapters.relation_configs.config_change import RelationConfigChange +from dbt.contracts.graph.nodes import ModelNode + +from agate import Row + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksComponentConfig(ABC): + def to_sql_clause(self) -> str: + raise NotImplementedError("Must be implemented by subclass") + + +Component = TypeVar("Component", bound=DatabricksComponentConfig) + + +class DatabricksComponentProcessor(ABC, Generic[Component]): + @classmethod + def name(cls) -> str: + raise NotImplementedError("Must be implemented by subclass") + + @classmethod + def description_target(cls) -> str: + raise NotImplementedError("Must be implemented by subclass") + + @classmethod + def process_description_row(cls, row: Row) -> Component: + assert row[0] == cls.description_target() + return cls.process_description_row_impl(row) + + @classmethod + def process_description_row_impl(cls, row: Row) -> Component: + raise NotImplementedError("Must be implemented by subclass") + + @classmethod + def process_model_node(cls, model_node: ModelNode) -> Component: + raise NotImplementedError("Must be implemented by subclass") + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class PartitionedByConfig(DatabricksComponentConfig): + partition_by: Optional[List[str]] = None + + def to_sql_clause(self) -> str: + if self.partition_by: + return f"PARTITIONED BY ({', '.join(self.partition_by)})" + return "" + + +class PartitionedByProcessor: + @classmethod + def process_partition_rows(cls, results: List[Row]) -> PartitionedByConfig: + cols = [] + rows = itertools.takewhile( + lambda row: 
row[0], + itertools.dropwhile(lambda row: row[0] != "# Partition Information", results), + ) + for row in rows: + if not row[0].startswith("# "): + cols.append(row[0]) + + return PartitionedByConfig(cols) + + @classmethod + def process_model_node(cls, model_node: ModelNode) -> PartitionedByConfig: + partition_by = model_node.config.extra.get("partition_by") + if isinstance(partition_by, str): + return PartitionedByConfig([partition_by]) + return PartitionedByConfig(partition_by) + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class PartitionedByConfigChange(RelationConfigChange): + context: Optional[PartitionedByConfig] = None + + def requires_full_refresh(self) -> bool: + return True + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksRelationConfigBase(RelationConfigBase, ABC): + partitioned_by_processor: ClassVar[Optional[type[PartitionedByProcessor]]] = None + config_components: ClassVar[List[type[DatabricksComponentProcessor]]] = [] + _indexed_processors: ClassVar[Dict[str, DatabricksComponentProcessor]] = None + + @classmethod + def indexed_processors(cls) -> Dict[str, DatabricksComponentProcessor]: + if not cls._indexed_processors: + cls._indexed_processors = { + component.description_target(): component for component in cls.config_components + } + return cls._indexed_processors + + @classmethod + def from_model_node(cls, model_node: ModelNode) -> RelationConfigBase: + config_dict: Dict[str, DatabricksComponentConfig] = {} + if cls.partitioned_by_processor: + config_dict["partition_by"] = cls.partitioned_by_processor.process_model_node( + model_node + ) + for component in cls.config_components(): + config_dict[component.name] = component.from_model_node(model_node) + + config = cls.from_dict(config_dict) + return config + + @classmethod + def from_describe_extended(cls, results: List[Row]) -> RelationConfigBase: + relation_config = cls.parse_describe_extended(results) + relation = cls.from_dict(relation_config) + return relation 
+ + @classmethod + def parse_describe_extended(cls, results: List[Row]) -> dict[str, DatabricksComponentConfig]: + parsed: Dict[str, DatabricksComponentConfig] = { + component.name(): None for component in cls.config_components + } + + if cls.partitioned_by_processor: + parsed["partition_by"] = cls.partitioned_by_processor.process_partition_rows(results) + + # Skip to metadata + rows = itertools.dropwhile(lambda row: row[0] != "# Detailed Table Information", results) + indexed_processors = cls.indexed_processors() + + for row in rows: + if row[0] in indexed_processors: + processor = indexed_processors[row[0]] + parsed[processor.name()] = processor.process_description_row(row) + + return parsed diff --git a/dbt/adapters/databricks/relation_configs/comment.py b/dbt/adapters/databricks/relation_configs/comment.py new file mode 100644 index 000000000..b66f36e00 --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/comment.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass +from typing import Optional +from agate import Row +from dbt.contracts.graph.nodes import ModelNode + +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksComponentConfig, + DatabricksComponentProcessor, +) +from dbt.adapters.relation_configs.config_change import RelationConfigChange + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class CommentConfig(DatabricksComponentConfig): + comment: Optional[str] = None + + def to_sql_clause(self) -> str: + if self.comment: + return f"COMMENT '{self.comment}'" + return "" + + +class CommentProcessor(DatabricksComponentProcessor[CommentConfig]): + @classmethod + def name(cls) -> str: + return "comment" + + @classmethod + def description_target(cls) -> str: + return "Comment" + + @classmethod + def process_description_row_impl(cls, row: Row) -> CommentConfig: + return CommentConfig(row[1]) + + @classmethod + def process_model_node(cls, model_node: ModelNode) -> CommentConfig: + return CommentConfig(model_node.description) + 
+ +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class CommentConfigChange(RelationConfigChange): + context: Optional[CommentConfig] = None + + def requires_full_refresh(self) -> bool: + return False diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py new file mode 100644 index 000000000..80e5c68c2 --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from typing import List, Optional +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksRelationConfigBase, + PartitionedByConfig, + PartitionedByProcessor, +) +from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor +from dbt.adapters.databricks.relation_configs.refresh import RefreshConfig, RefreshProcessor + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class MaterializedViewConfig(DatabricksRelationConfigBase): + partitioned_by_processor = PartitionedByProcessor + config_components = [CommentProcessor, RefreshProcessor] + + comment: CommentConfig + refresh: RefreshConfig + partition_by: PartitionedByConfig diff --git a/dbt/adapters/databricks/relation_configs/query.py b/dbt/adapters/databricks/relation_configs/query.py new file mode 100644 index 000000000..b8f605d5b --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/query.py @@ -0,0 +1,22 @@ +import dataclasses +from typing import Optional + +from dbt.adapters.databricks.relation_configs.base import DatabricksComponentConfig + + +@dataclasses(frozen=True, eq=True, unsafe_hash=True) +class QueryConfig(DatabricksComponentConfig): + query: str + + def to_sql_clause(self) -> str: + return self.query + + +class QueryProcessor(DatabricksComponentProcessor[CommentConfig]): + @classmethod + def process_description_row_impl(cls, row: Row) -> CommentConfig: + return CommentConfig(row[1]) + + @classmethod + def 
process_model_node(cls, model_node: ModelNode) -> QueryConfig: + return CommentConfig(model_node.description) diff --git a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py new file mode 100644 index 000000000..dec0b0062 --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -0,0 +1,86 @@ +from abc import ABC +from dataclasses import dataclass +import re +from typing import Optional +from agate import Row +from dbt.exceptions import DbtRuntimeError +from dbt.adapters.relation_configs.config_change import RelationConfigChange +from dbt.contracts.graph.nodes import ModelNode + +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksComponentConfig, + DatabricksComponentProcessor, +) + +SCHEDULE_REGEX = re.compile(r"CRON '(.*)' AT TIME ZONE '(.*)'") + + +class RefreshConfig(DatabricksComponentConfig, ABC): + pass + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class ScheduledRefreshConfig(RefreshConfig): + cron: str + time_zone_value: Optional[str] + + def to_sql_clause(self) -> str: + schedule = f"SCHEDULE CRON '{self.cron}'" + + if self.time_zone_value: + schedule += f" AT TIME ZONE '{self.time_zone_value}'" + + return schedule + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class ManualRefreshConfig(RefreshConfig): + def to_sql_clause(self) -> str: + return "" + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class RefreshProcessor(DatabricksComponentProcessor[RefreshConfig]): + @classmethod + def name(cls) -> str: + return "refresh" + + @classmethod + def description_target(cls) -> str: + return "Refresh Schedule" + + @classmethod + def process_description_row_impl(cls, row: Row) -> RefreshConfig: + if row[1] == "MANUAL": + return ManualRefreshConfig() + + match = SCHEDULE_REGEX.match(row[1]) + + if match: + cron, time_zone_value = match.groups() + return ScheduledRefreshConfig(cron=cron, time_zone_value=time_zone_value) + else: + raise 
DbtRuntimeError( + f"Could not parse schedule from description: {row[1]}." + " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" + ) + + @classmethod + def process_model_node(cls, model_node: ModelNode) -> RefreshConfig: + schedule = model_node.config.extra.get("schedule") + if schedule: + if "cron" not in schedule: + raise DbtRuntimeError(f"Schedule config must contain a 'cron' key, got {schedule}.") + return ScheduledRefreshConfig( + cron=schedule["cron"], time_zone_value=schedule.get("time_zone_value") + ) + else: + return ManualRefreshConfig() + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class RefreshConfigChange(RelationConfigChange): + context: Optional[RefreshConfig] = None + + def requires_full_refresh(self) -> bool: + return False diff --git a/dbt/adapters/databricks/relation_configs/util.py b/dbt/adapters/databricks/relation_configs/util.py new file mode 100644 index 000000000..0756149d8 --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/util.py @@ -0,0 +1,7 @@ +from agate import Table, Row + + +def get_first_row(results: Table) -> Row: + if len(results.rows) == 0: + return Row(values=set()) + return results.rows[0] diff --git a/tests/unit/relation_configs/test_comment.py b/tests/unit/relation_configs/test_comment.py new file mode 100644 index 000000000..b0957e175 --- /dev/null +++ b/tests/unit/relation_configs/test_comment.py @@ -0,0 +1,10 @@ +import pytest +from dbt.adapters.databricks.relation_configs.comment import CommentConfig + + +class TestCommentConfig: + @pytest.mark.parametrize( + "input,expected", [(None, ""), ("", ""), ("comment", "COMMENT 'comment'")] + ) + def test_comment_config(self, input, expected): + assert CommentConfig(input).to_sql_clause() == expected diff --git a/tests/unit/relation_configs/test_config_base.py b/tests/unit/relation_configs/test_config_base.py new file mode 100644 index 000000000..d23d31f53 --- /dev/null +++ b/tests/unit/relation_configs/test_config_base.py @@ 
-0,0 +1,129 @@ +from dataclasses import dataclass +from agate import Row +from mock import Mock +import pytest + +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksRelationConfigBase, + PartitionedByConfig, + PartitionedByProcessor, +) +from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class TestDatabricksRelationConfig(DatabricksRelationConfigBase): + partitioned_by_processor = PartitionedByProcessor + config_components = [CommentProcessor] + comment: CommentConfig + partition_by: PartitionedByConfig + + +class TestFromDescribeExtended: + def test_from_describe_extended__simple_case(self): + results = [ + Row(["col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row([None, None, None]), + Row(["# Detailed Table Information", None, None]), + Row(["Catalog:", "default", None]), + Row(["Schema:", "default", None]), + Row(["Table:", "table_abc", None]), + Row(["Comment", "This is the table comment", None]), + ] + config = TestDatabricksRelationConfig.from_describe_extended(results) + assert config.comment == CommentConfig("This is the table comment") + + def test_from_describe_extended__with_partitions(self): + results = [ + Row(["col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row(["# Partition Information", None, None]), + Row(["# col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row([None, None, None]), + Row(["# Detailed Table Information", None, None]), + Row(["Catalog:", "default", None]), + Row(["Schema:", "default", None]), + Row(["Table:", "table_abc", None]), + Row(["Comment", "This is the table comment", None]), + ] + config = TestDatabricksRelationConfig.from_describe_extended(results) + assert config.comment == CommentConfig("This is the table comment") + assert config.partition_by == PartitionedByConfig(["col_a"]) + + +class 
TestPartitionedByConfig: + @pytest.mark.parametrize( + "input,expected", + [ + (None, ""), + ([], ""), + (["col_a"], "PARTITIONED BY (col_a)"), + (["col_a", "col_b"], "PARTITIONED BY (col_a, col_b)"), + ], + ) + def test_to_sql_clause__empty(self, input, expected): + config = PartitionedByConfig(input) + assert config.to_sql_clause() == expected + + +class TestPartitionedByProcessor: + def test_process_partition_rows__none(self): + results = [ + Row(["col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row([None, None, None]), + Row(["# Detailed Table Information", None, None]), + Row(["Catalog:", "default", None]), + ] + + spec = PartitionedByProcessor.process_partition_rows(results) + assert spec == PartitionedByConfig([]) + + def test_process_partition_rows__single(self): + results = [ + Row(["col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row(["# Partition Information", None, None]), + Row(["# col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row([None, None, None]), + Row(["# Detailed Table Information", None, None]), + Row(["Catalog:", "default", None]), + ] + spec = PartitionedByProcessor.process_partition_rows(results) + assert spec == PartitionedByConfig(["col_a"]) + + def test_process_partition_rows__multiple(self): + results = [ + Row(["col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row(["# Partition Information", None, None]), + Row(["# col_name", "data_type", "comment"]), + Row(["col_a", "int", "This is a comment"]), + Row(["col_b", "int", "This is a comment"]), + Row([None, None, None]), + Row(["# Detailed Table Information", None, None]), + Row(["Catalog:", "default", None]), + ] + spec = PartitionedByProcessor.process_partition_rows(results) + assert spec == PartitionedByConfig(["col_a", "col_b"]) + + def test_process_model_node__without_partition_by(self): + model = Mock() + 
model.config.extra.get.return_value = None + spec = PartitionedByProcessor.process_model_node(model) + assert spec == PartitionedByConfig(None) + + def test_process_model_node__single_column(self): + model = Mock() + model.config.extra.get.return_value = "col_a" + spec = PartitionedByProcessor.process_model_node(model) + assert spec == PartitionedByConfig(["col_a"]) + + def test_process_model_node__multiple_columns(self): + model = Mock() + model.config.extra.get.return_value = ["col_a", "col_b"] + spec = PartitionedByProcessor.process_model_node(model) + assert spec == PartitionedByConfig(["col_a", "col_b"]) diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py new file mode 100644 index 000000000..918949e79 --- /dev/null +++ b/tests/unit/relation_configs/test_refresh.py @@ -0,0 +1,67 @@ +from mock import Mock +import pytest +from agate import Row +from dbt.adapters.databricks.relation_configs.refresh import ( + RefreshProcessor, + ManualRefreshConfig, + ScheduledRefreshConfig, +) +from dbt.exceptions import DbtRuntimeError + + +class TestRefreshProcessor: + def test_process_description_row_impl__valid_schedule(self): + spec = RefreshProcessor.process_description_row_impl( + Row(["Refresh Schedule", "CRON '*/5 * * * *' AT TIME ZONE 'UTC'"]) + ) + assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + + def test_process_description_row_impl__manual(self): + spec = RefreshProcessor.process_description_row_impl(Row(["Refresh Schedule", "MANUAL"])) + assert spec == ManualRefreshConfig() + + def test_process_description_row_impl__invalid(self): + with pytest.raises( + DbtRuntimeError, + match="Could not parse schedule from description: invalid description.", + ): + RefreshProcessor.process_description_row_impl( + Row(["Refresh Schedule", "invalid description"]) + ) + + def test_process_model_node__without_schedule(self): + model = Mock() + model.config.extra.get.return_value = {} + spec = 
RefreshProcessor.process_model_node(model) + assert spec == ManualRefreshConfig() + + def test_process_model_node__without_cron(self): + model = Mock() + model.config.extra.get.return_value = {"time_zone_value": "UTC"} + with pytest.raises( + DbtRuntimeError, + match="Schedule config must contain a 'cron' key, got {'time_zone_value': 'UTC'}.", + ): + RefreshProcessor.process_model_node(model) + + def test_process_model_node__without_timezone(self): + model = Mock() + model.config.extra.get.return_value = {"cron": "*/5 * * * *"} + spec = RefreshProcessor.process_model_node(model) + assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) + + def test_process_model_node__both(self): + model = Mock() + model.config.extra.get.return_value = {"cron": "*/5 * * * *", "time_zone_value": "UTC"} + spec = RefreshProcessor.process_model_node(model) + assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + + +class TestScheduledRefreshConfig: + def test_to_schedule_clause__no_timezone(self): + spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) + assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *'" + + def test_to_schedule_clause__with_timezone(self): + spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'" diff --git a/tests/unit/relation_configs/test_util.py b/tests/unit/relation_configs/test_util.py new file mode 100644 index 000000000..c45e89216 --- /dev/null +++ b/tests/unit/relation_configs/test_util.py @@ -0,0 +1,16 @@ +from agate import Table, Row +import pytest +from dbt.adapters.databricks.relation_configs import util + + +class TestGetFirstRow: + @pytest.mark.parametrize( + "input,expected", + [ + (Table([]), Row(set())), + (Table([Row(["first", "row"]), Row(["second", "row"])]), Row(["first", "row"])), + ], + ) + def test_get_first_row(self, input, expected): + first_row = 
util.get_first_row(input) + assert first_row == expected From c6c421dd0dbd7f23d2103861ac7a8d762f1f694f Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 12 Dec 2023 14:42:52 -0800 Subject: [PATCH 03/35] wip --- .../databricks/relation_configs/base.py | 124 ++++------------- .../databricks/relation_configs/comment.py | 23 ++-- .../relation_configs/materialized_view.py | 27 +++- .../relation_configs/partitioning.py | 55 ++++++++ .../databricks/relation_configs/query.py | 39 ++++-- .../databricks/relation_configs/refresh.py | 46 ++++--- tests/unit/relation_configs/test_comment.py | 40 +++++- .../unit/relation_configs/test_config_base.py | 129 ------------------ .../relation_configs/test_partitioning.py | 97 +++++++++++++ tests/unit/relation_configs/test_refresh.py | 55 +++++--- 10 files changed, 339 insertions(+), 296 deletions(-) create mode 100644 dbt/adapters/databricks/relation_configs/partitioning.py delete mode 100644 tests/unit/relation_configs/test_config_base.py create mode 100644 tests/unit/relation_configs/test_partitioning.py diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index 62768e230..8c9dc52bd 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -1,16 +1,16 @@ -from abc import ABC -from dataclasses import InitVar, dataclass -import itertools -from typing import Any, ClassVar, Dict, Generic, List, Optional, TypeVar, Union -from dbt.adapters.relation_configs.config_base import RelationConfigBase -from dbt.adapters.relation_configs.config_change import RelationConfigChange +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import ClassVar, Dict, Generic, List, TypeVar + +from dbt.adapters.relation_configs.config_base import RelationConfigBase, RelationResults from dbt.contracts.graph.nodes import ModelNode -from agate import Row +from dbt.adapters.relation_configs.config_change import 
RelationConfigChange @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksComponentConfig(ABC): + @abstractmethod def to_sql_clause(self) -> str: raise NotImplementedError("Must be implemented by subclass") @@ -18,118 +18,42 @@ def to_sql_clause(self) -> str: Component = TypeVar("Component", bound=DatabricksComponentConfig) +@dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksComponentProcessor(ABC, Generic[Component]): - @classmethod - def name(cls) -> str: - raise NotImplementedError("Must be implemented by subclass") - - @classmethod - def description_target(cls) -> str: - raise NotImplementedError("Must be implemented by subclass") - - @classmethod - def process_description_row(cls, row: Row) -> Component: - assert row[0] == cls.description_target() - return cls.process_description_row_impl(row) + name: ClassVar[str] @classmethod - def process_description_row_impl(cls, row: Row) -> Component: + @abstractmethod + def from_results(cls, row: RelationResults) -> Component: raise NotImplementedError("Must be implemented by subclass") @classmethod - def process_model_node(cls, model_node: ModelNode) -> Component: + @abstractmethod + def from_model_node(cls, model_node: ModelNode) -> Component: raise NotImplementedError("Must be implemented by subclass") -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class PartitionedByConfig(DatabricksComponentConfig): - partition_by: Optional[List[str]] = None - - def to_sql_clause(self) -> str: - if self.partition_by: - return f"PARTITIONED BY ({', '.join(self.partition_by)})" - return "" - - -class PartitionedByProcessor: - @classmethod - def process_partition_rows(cls, results: List[Row]) -> PartitionedByConfig: - cols = [] - rows = itertools.takewhile( - lambda row: row[0], - itertools.dropwhile(lambda row: row[0] != "# Partition Information", results), - ) - for row in rows: - if not row[0].startswith("# "): - cols.append(row[0]) - - return PartitionedByConfig(cols) - - @classmethod - def 
process_model_node(cls, model_node: ModelNode) -> PartitionedByConfig: - partition_by = model_node.config.extra.get("partition_by") - if isinstance(partition_by, str): - return PartitionedByConfig([partition_by]) - return PartitionedByConfig(partition_by) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class PartitionedByConfigChange(RelationConfigChange): - context: Optional[PartitionedByConfig] = None - - def requires_full_refresh(self) -> bool: - return True - - @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksRelationConfigBase(RelationConfigBase, ABC): - partitioned_by_processor: ClassVar[Optional[type[PartitionedByProcessor]]] = None - config_components: ClassVar[List[type[DatabricksComponentProcessor]]] = [] - _indexed_processors: ClassVar[Dict[str, DatabricksComponentProcessor]] = None - - @classmethod - def indexed_processors(cls) -> Dict[str, DatabricksComponentProcessor]: - if not cls._indexed_processors: - cls._indexed_processors = { - component.description_target(): component for component in cls.config_components - } - return cls._indexed_processors + config_components: ClassVar[List[type[DatabricksComponentProcessor]]] @classmethod def from_model_node(cls, model_node: ModelNode) -> RelationConfigBase: config_dict: Dict[str, DatabricksComponentConfig] = {} - if cls.partitioned_by_processor: - config_dict["partition_by"] = cls.partitioned_by_processor.process_model_node( - model_node - ) - for component in cls.config_components(): + for component in cls.config_components: config_dict[component.name] = component.from_model_node(model_node) - config = cls.from_dict(config_dict) - return config - - @classmethod - def from_describe_extended(cls, results: List[Row]) -> RelationConfigBase: - relation_config = cls.parse_describe_extended(results) - relation = cls.from_dict(relation_config) - return relation + return cls.from_dict(config_dict) @classmethod - def parse_describe_extended(cls, results: List[Row]) -> dict[str, 
DatabricksComponentConfig]: - parsed: Dict[str, DatabricksComponentConfig] = { - component.name(): None for component in cls.config_components - } - - if cls.partitioned_by_processor: - parsed["partition_by"] = cls.partitioned_by_processor.process_partition_rows(results) + def from_results(cls, results: RelationResults) -> RelationConfigBase: + config_dict: Dict[str, DatabricksComponentConfig] = {} + for component in cls.config_components: + config_dict[component.name] = component.from_results(results) - # Skip to metadata - rows = itertools.dropwhile(lambda row: row[0] != "# Detailed Table Information", results) - indexed_processors = cls.indexed_processors() + return cls.from_dict(config_dict) - for row in rows: - if row[0] in indexed_processors: - processor = indexed_processors[row[0]] - parsed[processor.name()] = processor.process_description_row(row) - return parsed +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksRelationChangeSet: + changes: List[RelationConfigChange] diff --git a/dbt/adapters/databricks/relation_configs/comment.py b/dbt/adapters/databricks/relation_configs/comment.py index b66f36e00..fd7b2688f 100644 --- a/dbt/adapters/databricks/relation_configs/comment.py +++ b/dbt/adapters/databricks/relation_configs/comment.py @@ -1,12 +1,12 @@ from dataclasses import dataclass -from typing import Optional -from agate import Row +from typing import Optional, ClassVar from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( DatabricksComponentConfig, DatabricksComponentProcessor, ) +from dbt.adapters.relation_configs.config_base import RelationResults from dbt.adapters.relation_configs.config_change import RelationConfigChange @@ -21,20 +21,18 @@ def to_sql_clause(self) -> str: class CommentProcessor(DatabricksComponentProcessor[CommentConfig]): - @classmethod - def name(cls) -> str: - return "comment" - - @classmethod - def description_target(cls) -> str: - return "Comment" + name: 
ClassVar[str] = "comment" @classmethod - def process_description_row_impl(cls, row: Row) -> CommentConfig: - return CommentConfig(row[1]) + def from_results(cls, results: RelationResults) -> CommentConfig: + table = results["describe_extended"] + for row in table.rows: + if row[0] == "Comment": + return CommentConfig(row[1]) + return CommentConfig() @classmethod - def process_model_node(cls, model_node: ModelNode) -> CommentConfig: + def from_model_node(cls, model_node: ModelNode) -> CommentConfig: return CommentConfig(model_node.description) @@ -42,5 +40,6 @@ def process_model_node(cls, model_node: ModelNode) -> CommentConfig: class CommentConfigChange(RelationConfigChange): context: Optional[CommentConfig] = None + @property def requires_full_refresh(self) -> bool: return False diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py index 80e5c68c2..79039d1ac 100644 --- a/dbt/adapters/databricks/relation_configs/materialized_view.py +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -1,19 +1,34 @@ from dataclasses import dataclass -from typing import List, Optional from dbt.adapters.databricks.relation_configs.base import ( DatabricksRelationConfigBase, +) +from dbt.adapters.databricks.relation_configs.comment import ( + CommentConfig, + CommentConfigChange, + CommentProcessor, +) +from dbt.adapters.databricks.relation_configs.partitioning import ( PartitionedByConfig, + PartitionedByConfigChange, PartitionedByProcessor, ) -from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor -from dbt.adapters.databricks.relation_configs.refresh import RefreshConfig, RefreshProcessor +from dbt.adapters.databricks.relation_configs.query import ( + QueryConfig, + QueryConfigChange, + QueryProcessor, +) +from dbt.adapters.databricks.relation_configs.refresh import ( + RefreshConfig, + RefreshConfigChange, + RefreshProcessor, +) 
@dataclass(frozen=True, eq=True, unsafe_hash=True) class MaterializedViewConfig(DatabricksRelationConfigBase): - partitioned_by_processor = PartitionedByProcessor - config_components = [CommentProcessor, RefreshProcessor] + config_components = [PartitionedByProcessor, CommentProcessor, RefreshProcessor, QueryProcessor] + partition_by: PartitionedByConfig comment: CommentConfig refresh: RefreshConfig - partition_by: PartitionedByConfig + query: QueryConfig diff --git a/dbt/adapters/databricks/relation_configs/partitioning.py b/dbt/adapters/databricks/relation_configs/partitioning.py new file mode 100644 index 000000000..3496942b7 --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/partitioning.py @@ -0,0 +1,55 @@ +from dataclasses import dataclass +import itertools +from typing import ClassVar, List, Optional + +from dbt.adapters.relation_configs.config_base import RelationResults +from dbt.adapters.relation_configs.config_change import RelationConfigChange +from dbt.contracts.graph.nodes import ModelNode +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksComponentConfig, + DatabricksComponentProcessor, +) + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class PartitionedByConfig(DatabricksComponentConfig): + partition_by: Optional[List[str]] = None + + def to_sql_clause(self) -> str: + if self.partition_by: + return f"PARTITIONED BY ({', '.join(self.partition_by)})" + return "" + + +class PartitionedByProcessor(DatabricksComponentProcessor): + name: ClassVar[str] = "partitioned_by" + + @classmethod + def from_results(cls, results: RelationResults) -> PartitionedByConfig: + table = results["describe_extended"] + cols = [] + rows = itertools.takewhile( + lambda row: row[0], + itertools.dropwhile(lambda row: row[0] != "# Partition Information", table.rows), + ) + for row in rows: + if not row[0].startswith("# "): + cols.append(row[0]) + + return PartitionedByConfig(cols) + + @classmethod + def from_model_node(cls, model_node: 
ModelNode) -> PartitionedByConfig: + partition_by = model_node.config.extra.get("partition_by") + if isinstance(partition_by, str): + return PartitionedByConfig([partition_by]) + return PartitionedByConfig(partition_by) + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class PartitionedByConfigChange(RelationConfigChange): + context: Optional[PartitionedByConfig] = None + + @property + def requires_full_refresh(self) -> bool: + return True diff --git a/dbt/adapters/databricks/relation_configs/query.py b/dbt/adapters/databricks/relation_configs/query.py index b8f605d5b..4cbdf87a2 100644 --- a/dbt/adapters/databricks/relation_configs/query.py +++ b/dbt/adapters/databricks/relation_configs/query.py @@ -1,10 +1,16 @@ -import dataclasses +from dataclasses import dataclass from typing import Optional +from dbt.adapters.relation_configs.config_base import RelationResults +from dbt.contracts.graph.nodes import ModelNode +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksComponentConfig, + DatabricksComponentProcessor, +) +from dbt.adapters.relation_configs.config_change import RelationConfigChange +from dbt.exceptions import DbtRuntimeError -from dbt.adapters.databricks.relation_configs.base import DatabricksComponentConfig - -@dataclasses(frozen=True, eq=True, unsafe_hash=True) +@dataclass(frozen=True, eq=True, unsafe_hash=True) class QueryConfig(DatabricksComponentConfig): query: str @@ -12,11 +18,26 @@ def to_sql_clause(self) -> str: return self.query -class QueryProcessor(DatabricksComponentProcessor[CommentConfig]): +class QueryProcessor(DatabricksComponentProcessor[QueryConfig]): @classmethod - def process_description_row_impl(cls, row: Row) -> CommentConfig: - return CommentConfig(row[1]) + def from_results(cls, result: RelationResults) -> QueryConfig: + row = result["information_schema.views"] + return QueryConfig(row["view_definition"]) @classmethod - def process_model_node(cls, model_node: ModelNode) -> QueryConfig: - return 
CommentConfig(model_node.description) + def from_model_node(cls, model_node: ModelNode) -> QueryConfig: + query = model_node.compiled_code + + if query: + return QueryConfig(query.strip()) + else: + raise DbtRuntimeError(f"Cannot compile model {model_node.unique_id} with no SQL query") + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class QueryConfigChange(RelationConfigChange): + context: Optional[QueryConfig] = None + + @property + def requires_full_refresh(self) -> bool: + return True diff --git a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py index dec0b0062..8fbf40215 100644 --- a/dbt/adapters/databricks/relation_configs/refresh.py +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -1,8 +1,9 @@ from abc import ABC from dataclasses import dataclass import re -from typing import Optional -from agate import Row +from typing import ClassVar, Optional + +from dbt.adapters.relation_configs.config_base import RelationResults from dbt.exceptions import DbtRuntimeError from dbt.adapters.relation_configs.config_change import RelationConfigChange from dbt.contracts.graph.nodes import ModelNode @@ -41,32 +42,34 @@ def to_sql_clause(self) -> str: @dataclass(frozen=True, eq=True, unsafe_hash=True) class RefreshProcessor(DatabricksComponentProcessor[RefreshConfig]): - @classmethod - def name(cls) -> str: - return "refresh" + name: ClassVar[str] = "refresh" @classmethod - def description_target(cls) -> str: - return "Refresh Schedule" + def from_results(cls, results: RelationResults) -> RefreshConfig: + table = results["describe_extended"] + for row in table.rows: + if row[0] == "Refresh Schedule": + if row[1] == "MANUAL": + return ManualRefreshConfig() - @classmethod - def process_description_row_impl(cls, row: Row) -> RefreshConfig: - if row[1] == "MANUAL": - return ManualRefreshConfig() + match = SCHEDULE_REGEX.match(row[1]) - match = SCHEDULE_REGEX.match(row[1]) + if match: + cron, 
time_zone_value = match.groups() + return ScheduledRefreshConfig(cron=cron, time_zone_value=time_zone_value) - if match: - cron, time_zone_value = match.groups() - return ScheduledRefreshConfig(cron=cron, time_zone_value=time_zone_value) - else: - raise DbtRuntimeError( - f"Could not parse schedule from description: {row[1]}." - " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" - ) + raise DbtRuntimeError( + f"Could not parse schedule from description: {row[1]}." + " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" + ) + + raise DbtRuntimeError( + f"Could not parse schedule for table." + " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" + ) @classmethod - def process_model_node(cls, model_node: ModelNode) -> RefreshConfig: + def from_model_node(cls, model_node: ModelNode) -> RefreshConfig: schedule = model_node.config.extra.get("schedule") if schedule: if "cron" not in schedule: @@ -82,5 +85,6 @@ def process_model_node(cls, model_node: ModelNode) -> RefreshConfig: class RefreshConfigChange(RelationConfigChange): context: Optional[RefreshConfig] = None + @property def requires_full_refresh(self) -> bool: return False diff --git a/tests/unit/relation_configs/test_comment.py b/tests/unit/relation_configs/test_comment.py index b0957e175..0a45e731c 100644 --- a/tests/unit/relation_configs/test_comment.py +++ b/tests/unit/relation_configs/test_comment.py @@ -1,5 +1,6 @@ import pytest -from dbt.adapters.databricks.relation_configs.comment import CommentConfig +from agate import Table, Row +from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor class TestCommentConfig: @@ -8,3 +9,40 @@ class TestCommentConfig: ) def test_comment_config(self, input, expected): assert CommentConfig(input).to_sql_clause() == expected + + +class TestCommentProcessor: + def test_from_results__no_comment(self): + results = { + "describe_extended": Table( + 
rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ["Schema:", "default", None], + ["Table:", "table_abc", None], + ] + ) + } + config = CommentProcessor.from_results(results) + assert config == CommentConfig() + + def test_from_results__with_comment(self): + results = { + "describe_extended": Table( + rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ["Schema:", "default", None], + ["Table:", "table_abc", None], + ["Comment", "This is the table comment", None], + ] + ) + } + config = CommentProcessor.from_results(results) + assert config == CommentConfig("This is the table comment") diff --git a/tests/unit/relation_configs/test_config_base.py b/tests/unit/relation_configs/test_config_base.py deleted file mode 100644 index d23d31f53..000000000 --- a/tests/unit/relation_configs/test_config_base.py +++ /dev/null @@ -1,129 +0,0 @@ -from dataclasses import dataclass -from agate import Row -from mock import Mock -import pytest - -from dbt.adapters.databricks.relation_configs.base import ( - DatabricksRelationConfigBase, - PartitionedByConfig, - PartitionedByProcessor, -) -from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class TestDatabricksRelationConfig(DatabricksRelationConfigBase): - partitioned_by_processor = PartitionedByProcessor - config_components = [CommentProcessor] - comment: CommentConfig - partition_by: PartitionedByConfig - - -class TestFromDescribeExtended: - def test_from_describe_extended__simple_case(self): - results = [ - Row(["col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row([None, None, None]), - Row(["# Detailed Table 
Information", None, None]), - Row(["Catalog:", "default", None]), - Row(["Schema:", "default", None]), - Row(["Table:", "table_abc", None]), - Row(["Comment", "This is the table comment", None]), - ] - config = TestDatabricksRelationConfig.from_describe_extended(results) - assert config.comment == CommentConfig("This is the table comment") - - def test_from_describe_extended__with_partitions(self): - results = [ - Row(["col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row(["# Partition Information", None, None]), - Row(["# col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row([None, None, None]), - Row(["# Detailed Table Information", None, None]), - Row(["Catalog:", "default", None]), - Row(["Schema:", "default", None]), - Row(["Table:", "table_abc", None]), - Row(["Comment", "This is the table comment", None]), - ] - config = TestDatabricksRelationConfig.from_describe_extended(results) - assert config.comment == CommentConfig("This is the table comment") - assert config.partition_by == PartitionedByConfig(["col_a"]) - - -class TestPartitionedByConfig: - @pytest.mark.parametrize( - "input,expected", - [ - (None, ""), - ([], ""), - (["col_a"], "PARTITIONED BY (col_a)"), - (["col_a", "col_b"], "PARTITIONED BY (col_a, col_b)"), - ], - ) - def test_to_sql_clause__empty(self, input, expected): - config = PartitionedByConfig(input) - assert config.to_sql_clause() == expected - - -class TestPartitionedByProcessor: - def test_process_partition_rows__none(self): - results = [ - Row(["col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row([None, None, None]), - Row(["# Detailed Table Information", None, None]), - Row(["Catalog:", "default", None]), - ] - - spec = PartitionedByProcessor.process_partition_rows(results) - assert spec == PartitionedByConfig([]) - - def test_process_partition_rows__single(self): - results = [ - Row(["col_name", "data_type", "comment"]), - 
Row(["col_a", "int", "This is a comment"]), - Row(["# Partition Information", None, None]), - Row(["# col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row([None, None, None]), - Row(["# Detailed Table Information", None, None]), - Row(["Catalog:", "default", None]), - ] - spec = PartitionedByProcessor.process_partition_rows(results) - assert spec == PartitionedByConfig(["col_a"]) - - def test_process_partition_rows__multiple(self): - results = [ - Row(["col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row(["# Partition Information", None, None]), - Row(["# col_name", "data_type", "comment"]), - Row(["col_a", "int", "This is a comment"]), - Row(["col_b", "int", "This is a comment"]), - Row([None, None, None]), - Row(["# Detailed Table Information", None, None]), - Row(["Catalog:", "default", None]), - ] - spec = PartitionedByProcessor.process_partition_rows(results) - assert spec == PartitionedByConfig(["col_a", "col_b"]) - - def test_process_model_node__without_partition_by(self): - model = Mock() - model.config.extra.get.return_value = None - spec = PartitionedByProcessor.process_model_node(model) - assert spec == PartitionedByConfig(None) - - def test_process_model_node__single_column(self): - model = Mock() - model.config.extra.get.return_value = "col_a" - spec = PartitionedByProcessor.process_model_node(model) - assert spec == PartitionedByConfig(["col_a"]) - - def test_process_model_node__multiple_columns(self): - model = Mock() - model.config.extra.get.return_value = ["col_a", "col_b"] - spec = PartitionedByProcessor.process_model_node(model) - assert spec == PartitionedByConfig(["col_a", "col_b"]) diff --git a/tests/unit/relation_configs/test_partitioning.py b/tests/unit/relation_configs/test_partitioning.py new file mode 100644 index 000000000..24242f6c6 --- /dev/null +++ b/tests/unit/relation_configs/test_partitioning.py @@ -0,0 +1,97 @@ +from mock import Mock +import pytest +from 
agate import Table + +from dbt.adapters.databricks.relation_configs.partitioning import ( + PartitionedByConfig, + PartitionedByProcessor, +) + + +class TestPartitionedByConfig: + @pytest.mark.parametrize( + "input,expected", + [ + (None, ""), + ([], ""), + (["col_a"], "PARTITIONED BY (col_a)"), + (["col_a", "col_b"], "PARTITIONED BY (col_a, col_b)"), + ], + ) + def test_to_sql_clause(self, input, expected): + config = PartitionedByConfig(input) + assert config.to_sql_clause() == expected + + +class TestPartitionedByProcessor: + def test_from_results__none(self): + results = { + "describe_extended": Table( + rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ] + ) + } + + spec = PartitionedByProcessor.from_results(results) + assert spec == PartitionedByConfig([]) + + def test_from_results__single(self): + results = { + "describe_extended": Table( + rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + ["# Partition Information", None, None], + ["# col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ] + ) + } + + spec = PartitionedByProcessor.from_results(results) + assert spec == PartitionedByConfig(["col_a"]) + + def test_from_results__multiple(self): + results = { + "describe_extended": Table( + rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + ["# Partition Information", None, None], + ["# col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + ["col_b", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ] + ) + } + spec = PartitionedByProcessor.from_results(results) + assert spec == PartitionedByConfig(["col_a", 
"col_b"]) + + def test_from_model_node__without_partition_by(self): + model = Mock() + model.config.extra.get.return_value = None + spec = PartitionedByProcessor.from_model_node(model) + assert spec == PartitionedByConfig(None) + + def test_from_model_node__single_column(self): + model = Mock() + model.config.extra.get.return_value = "col_a" + spec = PartitionedByProcessor.from_model_node(model) + assert spec == PartitionedByConfig(["col_a"]) + + def test_from_model_node__multiple_columns(self): + model = Mock() + model.config.extra.get.return_value = ["col_a", "col_b"] + spec = PartitionedByProcessor.from_model_node(model) + assert spec == PartitionedByConfig(["col_a", "col_b"]) diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index 918949e79..7189e8125 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -1,3 +1,4 @@ +from typing import Any, List from mock import Mock import pytest from agate import Row @@ -7,53 +8,71 @@ ScheduledRefreshConfig, ) from dbt.exceptions import DbtRuntimeError +from agate import Table class TestRefreshProcessor: - def test_process_description_row_impl__valid_schedule(self): - spec = RefreshProcessor.process_description_row_impl( - Row(["Refresh Schedule", "CRON '*/5 * * * *' AT TIME ZONE 'UTC'"]) - ) + @pytest.fixture + def rows(self) -> List[List[str | Any]]: + return [ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ["Schema:", "default", None], + ["Table:", "table_abc", None], + ] + + def test_from_results__valid_schedule(self, rows): + results = { + "describe_extended": Table( + rows=rows + [["Refresh Schedule", "CRON '*/5 * * * *' AT TIME ZONE 'UTC'"]] + ) + } + spec = RefreshProcessor.from_results(results) assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", 
time_zone_value="UTC") - def test_process_description_row_impl__manual(self): - spec = RefreshProcessor.process_description_row_impl(Row(["Refresh Schedule", "MANUAL"])) + def test_from_results__manual(self, rows): + results = {"describe_extended": Table(rows=rows + [["Refresh Schedule", "MANUAL"]])} + spec = RefreshProcessor.from_results(results) assert spec == ManualRefreshConfig() - def test_process_description_row_impl__invalid(self): + def test_from_results__invalid(self, rows): + results = { + "describe_extended": Table(rows=rows + [["Refresh Schedule", "invalid description"]]) + } with pytest.raises( DbtRuntimeError, - match="Could not parse schedule from description: invalid description.", + match="Could not parse schedule from description: invalid description", ): - RefreshProcessor.process_description_row_impl( - Row(["Refresh Schedule", "invalid description"]) - ) + RefreshProcessor.from_results(results) - def test_process_model_node__without_schedule(self): + def test_from_model_node__without_schedule(self): model = Mock() model.config.extra.get.return_value = {} - spec = RefreshProcessor.process_model_node(model) + spec = RefreshProcessor.from_model_node(model) assert spec == ManualRefreshConfig() - def test_process_model_node__without_cron(self): + def test_from_model_node__without_cron(self): model = Mock() model.config.extra.get.return_value = {"time_zone_value": "UTC"} with pytest.raises( DbtRuntimeError, match="Schedule config must contain a 'cron' key, got {'time_zone_value': 'UTC'}.", ): - RefreshProcessor.process_model_node(model) + RefreshProcessor.from_model_node(model) - def test_process_model_node__without_timezone(self): + def test_from_model_node__without_timezone(self): model = Mock() model.config.extra.get.return_value = {"cron": "*/5 * * * *"} - spec = RefreshProcessor.process_model_node(model) + spec = RefreshProcessor.from_model_node(model) assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) def 
test_process_model_node__both(self): model = Mock() model.config.extra.get.return_value = {"cron": "*/5 * * * *", "time_zone_value": "UTC"} - spec = RefreshProcessor.process_model_node(model) + spec = RefreshProcessor.from_model_node(model) assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") From 5f12412047725ef4a58cb59c9cc4ffca38b02c48 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 14 Dec 2023 08:59:58 -0800 Subject: [PATCH 04/35] wip --- dbt/adapters/databricks/impl.py | 7 ++++++- dbt/adapters/databricks/relation_configs/base.py | 10 +++++++++- .../databricks/relation_configs/materialized_view.py | 4 ---- .../macros/relations/materialized_view/alter.sql | 0 .../macros/relations/materialized_view/create.sql | 6 ++++++ 5 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 dbt/include/databricks/macros/relations/materialized_view/alter.sql diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 2eaa6a87f..16853ad2d 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -41,7 +41,7 @@ from dbt.clients.agate_helper import DEFAULT_TYPE_TESTER, empty_table from dbt.contracts.connection import AdapterResponse, Connection from dbt.contracts.graph.manifest import Manifest -from dbt.contracts.graph.nodes import ResultNode +from dbt.contracts.graph.nodes import ResultNode, ModelNode from dbt.contracts.relation import RelationType import dbt.exceptions from dbt.events import AdapterLogger @@ -58,6 +58,7 @@ DatabricksRelation, DatabricksRelationType, ) +from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.utils import redact_credentials, undefined_proof @@ -734,3 +735,7 @@ def get_persist_doc_columns( return_columns[name] = columns[name] return return_columns + + @available.parse_none + def materialized_view_from_model(self, model: ModelNode) -> MaterializedViewConfig: + return 
MaterializedViewConfig.from_model_node(model) # type: ignore diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index 8c9dc52bd..80c066263 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -34,7 +34,7 @@ def from_model_node(cls, model_node: ModelNode) -> Component: @dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksRelationConfigBase(RelationConfigBase, ABC): +class DatabricksRelationConfigBase(RelationConfigBase): config_components: ClassVar[List[type[DatabricksComponentProcessor]]] @classmethod @@ -57,3 +57,11 @@ def from_results(cls, results: RelationResults) -> RelationConfigBase: @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksRelationChangeSet: changes: List[RelationConfigChange] + + @property + def requires_full_refresh(self) -> bool: + return any(change.requires_full_refresh for change in self.changes) + + @property + def has_changes(self) -> bool: + return len(self.changes) > 0 diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py index 79039d1ac..60507e634 100644 --- a/dbt/adapters/databricks/relation_configs/materialized_view.py +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -4,22 +4,18 @@ ) from dbt.adapters.databricks.relation_configs.comment import ( CommentConfig, - CommentConfigChange, CommentProcessor, ) from dbt.adapters.databricks.relation_configs.partitioning import ( PartitionedByConfig, - PartitionedByConfigChange, PartitionedByProcessor, ) from dbt.adapters.databricks.relation_configs.query import ( QueryConfig, - QueryConfigChange, QueryProcessor, ) from dbt.adapters.databricks.relation_configs.refresh import ( RefreshConfig, - RefreshConfigChange, RefreshProcessor, ) diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql 
b/dbt/include/databricks/macros/relations/materialized_view/alter.sql new file mode 100644 index 000000000..e69de29bb diff --git a/dbt/include/databricks/macros/relations/materialized_view/create.sql b/dbt/include/databricks/macros/relations/materialized_view/create.sql index 80043a284..3376d8761 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/create.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/create.sql @@ -1,5 +1,11 @@ {% macro databricks__get_create_materialized_view_as_sql(relation, sql) -%} + + {%- set materialized_view = adapter.materialized_view_from_model(config.model) -%} + create materialized view {{ relation }} + {% materialized_view.partition_by.to_sql_clause() -%} + {% materialized_view.comment.to_sql_clause() -%} + {% materialized_view.refresh.to_sql_clause() -%} as {{ sql }} {% endmacro %} \ No newline at end of file From 522bb9f70a929c2ca6faeef3439802cdb24d54c5 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 15:37:38 -0800 Subject: [PATCH 05/35] mostly fleshed out --- dbt/adapters/databricks/impl.py | 27 ++- dbt/adapters/databricks/relation.py | 16 ++ .../databricks/relation_configs/base.py | 77 ++++++-- .../databricks/relation_configs/comment.py | 10 - .../relation_configs/materialized_view.py | 37 +++- .../relation_configs/partitioning.py | 12 +- .../databricks/relation_configs/query.py | 14 +- .../databricks/relation_configs/refresh.py | 46 +++-- .../relation_configs/tblproperties.py | 123 ++++++++++++ .../databricks/relation_configs/util.py | 7 - .../materializations/materialized_view.sql | 91 +++++++++ .../materialized_view/materialized_view.sql | 43 ---- .../relations/materialized_view/alter.sql | 31 +++ .../relations/materialized_view/create.sql | 1 + .../relations/materialized_view/refresh.sql | 6 +- .../unit/relation_configs/test_config_base.py | 119 +++++++++++ .../test_materialized_view_config.py | 68 +++++++ .../relation_configs/test_partitioning.py | 6 +- 
tests/unit/relation_configs/test_query.py | 30 +++ tests/unit/relation_configs/test_refresh.py | 72 ++++++- .../relation_configs/test_tblproperties.py | 185 ++++++++++++++++++ tests/unit/relation_configs/test_util.py | 16 -- 22 files changed, 896 insertions(+), 141 deletions(-) create mode 100644 dbt/adapters/databricks/relation_configs/tblproperties.py delete mode 100644 dbt/adapters/databricks/relation_configs/util.py create mode 100644 dbt/include/databricks/macros/materializations/materialized_view.sql delete mode 100644 dbt/include/databricks/macros/materializations/materialized_view/materialized_view.sql create mode 100644 tests/unit/relation_configs/test_config_base.py create mode 100644 tests/unit/relation_configs/test_materialized_view_config.py create mode 100644 tests/unit/relation_configs/test_query.py create mode 100644 tests/unit/relation_configs/test_tblproperties.py delete mode 100644 tests/unit/relation_configs/test_util.py diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 16853ad2d..ff2b9435b 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -31,6 +31,7 @@ ) from dbt.adapters.spark.impl import ( SparkAdapter, + DESCRIBE_TABLE_EXTENDED_MACRO_NAME, GET_COLUMNS_IN_RELATION_RAW_MACRO_NAME, KEY_TABLE_OWNER, KEY_TABLE_STATISTICS, @@ -60,6 +61,7 @@ ) from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.utils import redact_credentials, undefined_proof +from dbt.adapters.relation_configs.config_base import RelationResults logger = AdapterLogger("Databricks") @@ -737,5 +739,28 @@ def get_persist_doc_columns( return return_columns @available.parse_none - def materialized_view_from_model(self, model: ModelNode) -> MaterializedViewConfig: + def materialized_view_config_from_model(self, model: ModelNode) -> MaterializedViewConfig: return MaterializedViewConfig.from_model_node(model) # type: ignore + + @available.parse_none 
+ def get_relation_config(self, relation: DatabricksRelation) -> MaterializedViewConfig: + if relation.type == RelationType.MaterializedView: + results = self.describe_materialized_view(relation) + return MaterializedViewConfig.from_results(results) + else: + raise dbt.exceptions.DbtRuntimeError( + f"The method `DatabricksAdapter.get_relation_config` is not implemented " + f"for the relation type: {relation.type}" + ) + + def describe_materialized_view(self, relation: DatabricksRelation) -> RelationResults: + kwargs = {"relation": relation} + results: RelationResults = dict() + results["describe_extended"] = self.execute_macro( + DESCRIBE_TABLE_EXTENDED_MACRO_NAME, kwargs=kwargs + ) + results["information_schema.views"] = self.execute_macro( + "get_view_description", kwargs=kwargs + ) + results["show_tblproperties"] = self.execute_macro("fetch_tbl_properties", kwargs=kwargs) + return results diff --git a/dbt/adapters/databricks/relation.py b/dbt/adapters/databricks/relation.py index 07046f98b..38b0a7602 100644 --- a/dbt/adapters/databricks/relation.py +++ b/dbt/adapters/databricks/relation.py @@ -10,6 +10,7 @@ from dbt.adapters.databricks.utils import remove_undefined from dbt.utils import filter_null_values, classproperty from dbt.exceptions import DbtRuntimeError +from dbt.adapters.base.relation import RelationType KEY_TABLE_PROVIDER = "Provider" @@ -65,6 +66,21 @@ def __pre_deserialize__(cls, data: Dict[Any, Any]) -> Dict[Any, Any]: data["path"]["database"] = remove_undefined(data["path"]["database"]) return data + renameable_relations = frozenset( + { + RelationType.View, + RelationType.Table, + } + ) + + # list relations that can be atomically replaced (e.g. 
`CREATE OR REPLACE my_relation..` versus `DROP` and `CREATE`) + replaceable_relations = frozenset( + { + RelationType.View, + RelationType.Table, + } + ) + def has_information(self) -> bool: return self.metadata is not None diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index 80c066263..c37830222 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -1,11 +1,16 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import ClassVar, Dict, Generic, List, TypeVar +from typing import ClassVar, Dict, Generic, List, Optional, TypeVar +from typing_extensions import Self from dbt.adapters.relation_configs.config_base import RelationConfigBase, RelationResults from dbt.contracts.graph.nodes import ModelNode +from dbt.exceptions import DbtRuntimeError -from dbt.adapters.relation_configs.config_change import RelationConfigChange +from dbt.adapters.relation_configs.config_change import ( + RelationConfigChange, + RelationConfigChangeAction, +) @dataclass(frozen=True, eq=True, unsafe_hash=True) @@ -14,10 +19,60 @@ class DatabricksComponentConfig(ABC): def to_sql_clause(self) -> str: raise NotImplementedError("Must be implemented by subclass") + def get_diff(self, other: "DatabricksComponentConfig") -> Self: + if not isinstance(other, self.__class__): + raise DbtRuntimeError( + f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" + ) + return self + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksAlterableComponentConfig(DatabricksComponentConfig, ABC): + @abstractmethod + def to_alter_sql_clauses(self) -> List[str]: + raise NotImplementedError("Must be implemented by subclass") + Component = TypeVar("Component", bound=DatabricksComponentConfig) +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksConfigChange(RelationConfigChange, Generic[Component]): + 
context: Component + + @property + def requires_full_refresh(self) -> bool: + return not isinstance(self.context, DatabricksAlterableComponentConfig) + + @classmethod + def get_change(cls, new: Component, existing: Component) -> Optional[Self]: + if new != existing: + return cls(RelationConfigChangeAction.alter, new.get_diff(existing)) + return None + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class DatabricksRelationChangeSet: + changes: List[DatabricksConfigChange] + + @property + def requires_full_refresh(self) -> bool: + return any(change.requires_full_refresh for change in self.changes) + + @property + def has_changes(self) -> bool: + return len(self.changes) > 0 + + def get_alter_sql_clauses(self) -> List[str]: + assert ( + not self.requires_full_refresh + ), "Cannot alter a relation when changes requires a full refresh" + return [ + clause for change in self.changes for clause in change.context.to_alter_sql_clauses() + ] + + @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksComponentProcessor(ABC, Generic[Component]): name: ClassVar[str] @@ -33,6 +88,9 @@ def from_model_node(cls, model_node: ModelNode) -> Component: raise NotImplementedError("Must be implemented by subclass") +T = TypeVar("T", bound=DatabricksComponentConfig) + + @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksRelationConfigBase(RelationConfigBase): config_components: ClassVar[List[type[DatabricksComponentProcessor]]] @@ -53,15 +111,6 @@ def from_results(cls, results: RelationResults) -> RelationConfigBase: return cls.from_dict(config_dict) - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksRelationChangeSet: - changes: List[RelationConfigChange] - - @property - def requires_full_refresh(self) -> bool: - return any(change.requires_full_refresh for change in self.changes) - - @property - def has_changes(self) -> bool: - return len(self.changes) > 0 + @abstractmethod + def get_changeset(self, existing: Self) -> 
Optional[DatabricksRelationChangeSet]: + raise NotImplementedError("Must be implemented by subclass") diff --git a/dbt/adapters/databricks/relation_configs/comment.py b/dbt/adapters/databricks/relation_configs/comment.py index fd7b2688f..40d6eb426 100644 --- a/dbt/adapters/databricks/relation_configs/comment.py +++ b/dbt/adapters/databricks/relation_configs/comment.py @@ -7,7 +7,6 @@ DatabricksComponentProcessor, ) from dbt.adapters.relation_configs.config_base import RelationResults -from dbt.adapters.relation_configs.config_change import RelationConfigChange @dataclass(frozen=True, eq=True, unsafe_hash=True) @@ -34,12 +33,3 @@ def from_results(cls, results: RelationResults) -> CommentConfig: @classmethod def from_model_node(cls, model_node: ModelNode) -> CommentConfig: return CommentConfig(model_node.description) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class CommentConfigChange(RelationConfigChange): - context: Optional[CommentConfig] = None - - @property - def requires_full_refresh(self) -> bool: - return False diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py index 60507e634..6f0c888e4 100644 --- a/dbt/adapters/databricks/relation_configs/materialized_view.py +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -1,5 +1,10 @@ from dataclasses import dataclass +from typing import List, Optional + +from dbt.exceptions import DbtRuntimeError from dbt.adapters.databricks.relation_configs.base import ( + DatabricksConfigChange, + DatabricksRelationChangeSet, DatabricksRelationConfigBase, ) from dbt.adapters.databricks.relation_configs.comment import ( @@ -18,13 +23,43 @@ RefreshConfig, RefreshProcessor, ) +from dbt.adapters.databricks.relation_configs.tblproperties import ( + TblPropertiesConfig, + TblPropertiesProcessor, +) @dataclass(frozen=True, eq=True, unsafe_hash=True) class MaterializedViewConfig(DatabricksRelationConfigBase): - 
config_components = [PartitionedByProcessor, CommentProcessor, RefreshProcessor, QueryProcessor] + config_components = [ + PartitionedByProcessor, + CommentProcessor, + TblPropertiesProcessor, + RefreshProcessor, + QueryProcessor, + ] partition_by: PartitionedByConfig comment: CommentConfig + tblproperties: TblPropertiesConfig refresh: RefreshConfig query: QueryConfig + + def get_changeset( + self, existing: DatabricksRelationConfigBase + ) -> Optional[DatabricksRelationChangeSet]: + if not isinstance(existing, MaterializedViewConfig): + raise DbtRuntimeError( + f"Invalid comparison between MaterializedViewConfig and {type(existing)}" + ) + + changes: List[Optional[DatabricksConfigChange]] = [] + changes.append(DatabricksConfigChange.get_change(existing.partition_by, self.partition_by)) + changes.append(DatabricksConfigChange.get_change(existing.comment, self.comment)) + changes.append(DatabricksConfigChange.get_change(existing.refresh, self.refresh)) + changes.append(DatabricksConfigChange.get_change(existing.query, self.query)) + + trimmed_changes = [change for change in changes if change] + if trimmed_changes: + return DatabricksRelationChangeSet(trimmed_changes) + return None diff --git a/dbt/adapters/databricks/relation_configs/partitioning.py b/dbt/adapters/databricks/relation_configs/partitioning.py index 3496942b7..4d7e18493 100644 --- a/dbt/adapters/databricks/relation_configs/partitioning.py +++ b/dbt/adapters/databricks/relation_configs/partitioning.py @@ -3,7 +3,6 @@ from typing import ClassVar, List, Optional from dbt.adapters.relation_configs.config_base import RelationResults -from dbt.adapters.relation_configs.config_change import RelationConfigChange from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( DatabricksComponentConfig, @@ -22,7 +21,7 @@ def to_sql_clause(self) -> str: class PartitionedByProcessor(DatabricksComponentProcessor): - name: ClassVar[str] = "partitioned_by" + name: 
ClassVar[str] = "partition_by" @classmethod def from_results(cls, results: RelationResults) -> PartitionedByConfig: @@ -44,12 +43,3 @@ def from_model_node(cls, model_node: ModelNode) -> PartitionedByConfig: if isinstance(partition_by, str): return PartitionedByConfig([partition_by]) return PartitionedByConfig(partition_by) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class PartitionedByConfigChange(RelationConfigChange): - context: Optional[PartitionedByConfig] = None - - @property - def requires_full_refresh(self) -> bool: - return True diff --git a/dbt/adapters/databricks/relation_configs/query.py b/dbt/adapters/databricks/relation_configs/query.py index 4cbdf87a2..c99c9337d 100644 --- a/dbt/adapters/databricks/relation_configs/query.py +++ b/dbt/adapters/databricks/relation_configs/query.py @@ -1,12 +1,11 @@ from dataclasses import dataclass -from typing import Optional +from typing import ClassVar from dbt.adapters.relation_configs.config_base import RelationResults from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( DatabricksComponentConfig, DatabricksComponentProcessor, ) -from dbt.adapters.relation_configs.config_change import RelationConfigChange from dbt.exceptions import DbtRuntimeError @@ -19,6 +18,8 @@ def to_sql_clause(self) -> str: class QueryProcessor(DatabricksComponentProcessor[QueryConfig]): + name: ClassVar[str] = "query" + @classmethod def from_results(cls, result: RelationResults) -> QueryConfig: row = result["information_schema.views"] @@ -32,12 +33,3 @@ def from_model_node(cls, model_node: ModelNode) -> QueryConfig: return QueryConfig(query.strip()) else: raise DbtRuntimeError(f"Cannot compile model {model_node.unique_id} with no SQL query") - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class QueryConfigChange(RelationConfigChange): - context: Optional[QueryConfig] = None - - @property - def requires_full_refresh(self) -> bool: - return True diff --git 
a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py index 8fbf40215..2b5e9a777 100644 --- a/dbt/adapters/databricks/relation_configs/refresh.py +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -1,14 +1,14 @@ from abc import ABC from dataclasses import dataclass import re -from typing import ClassVar, Optional +from typing import ClassVar, List, Optional from dbt.adapters.relation_configs.config_base import RelationResults from dbt.exceptions import DbtRuntimeError -from dbt.adapters.relation_configs.config_change import RelationConfigChange from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( + DatabricksAlterableComponentConfig, DatabricksComponentConfig, DatabricksComponentProcessor, ) @@ -16,14 +16,15 @@ SCHEDULE_REGEX = re.compile(r"CRON '(.*)' AT TIME ZONE '(.*)'") -class RefreshConfig(DatabricksComponentConfig, ABC): +class RefreshConfig(DatabricksAlterableComponentConfig, ABC): pass @dataclass(frozen=True, eq=True, unsafe_hash=True) class ScheduledRefreshConfig(RefreshConfig): cron: str - time_zone_value: Optional[str] + time_zone_value: Optional[str] = None + alter: bool = False def to_sql_clause(self) -> str: schedule = f"SCHEDULE CRON '{self.cron}'" @@ -33,12 +34,40 @@ def to_sql_clause(self) -> str: return schedule + def to_alter_sql_clauses(self) -> List[str]: + prefix = "ALTER " if self.alter else "ADD " + + return [prefix + self.to_sql_clause()] + + def get_diff(self, other: DatabricksComponentConfig) -> "ScheduledRefreshConfig": + if not isinstance(other, RefreshConfig): + raise DbtRuntimeError( + f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" + ) + + return ScheduledRefreshConfig( + cron=self.cron, + time_zone_value=self.time_zone_value, + alter=isinstance(other, ScheduledRefreshConfig), + ) + @dataclass(frozen=True, eq=True, unsafe_hash=True) class ManualRefreshConfig(RefreshConfig): def 
to_sql_clause(self) -> str: return "" + def to_alter_sql_clauses(self) -> List[str]: + return ["DROP SCHEDULE"] + + def get_diff(self, other: DatabricksComponentConfig) -> "ManualRefreshConfig": + if not isinstance(other, RefreshConfig): + raise DbtRuntimeError( + f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" + ) + + return ManualRefreshConfig() + @dataclass(frozen=True, eq=True, unsafe_hash=True) class RefreshProcessor(DatabricksComponentProcessor[RefreshConfig]): @@ -79,12 +108,3 @@ def from_model_node(cls, model_node: ModelNode) -> RefreshConfig: ) else: return ManualRefreshConfig() - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class RefreshConfigChange(RelationConfigChange): - context: Optional[RefreshConfig] = None - - @property - def requires_full_refresh(self) -> bool: - return False diff --git a/dbt/adapters/databricks/relation_configs/tblproperties.py b/dbt/adapters/databricks/relation_configs/tblproperties.py new file mode 100644 index 000000000..62ee1453e --- /dev/null +++ b/dbt/adapters/databricks/relation_configs/tblproperties.py @@ -0,0 +1,123 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List, ClassVar +from dbt.contracts.graph.nodes import ModelNode + +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksAlterableComponentConfig, + DatabricksComponentConfig, + DatabricksComponentProcessor, +) +from dbt.adapters.relation_configs.config_base import RelationResults +from dbt.exceptions import DbtRuntimeError + + +@dataclass(frozen=True) +class TblPropertiesConfig(DatabricksAlterableComponentConfig): + tblproperties: Dict[str, str] = field(default_factory=dict) + ignore_list: List[str] = field(default_factory=list) + to_unset: List[str] = field(default_factory=list) + + def __post_init__(self) -> None: + if any([key in self.tblproperties for key in self.to_unset]): + raise DbtRuntimeError("Cannot set and unset the same tblproperty in the same model") + + def 
_format_tblproperties(self, to_format: Dict[str, str]) -> str: + return ", ".join(f"'{k}' = '{v}'" for k, v in to_format.items()) + + def to_sql_clause(self) -> str: + without_ignore_list = self._without_ignore_list(self.tblproperties) + if without_ignore_list: + return f"TBLPROPERTIES ({self._format_tblproperties(without_ignore_list)})" + return "" + + def to_alter_sql_clauses(self) -> List[str]: + clauses = [] + without_ignore_list = self._without_ignore_list(self.tblproperties) + if without_ignore_list: + clauses.append(f"SET TBLPROPERTIES ({self._format_tblproperties(without_ignore_list)})") + if self.to_unset: + to_unset = ", ".join(f"'{k}'" for k in self.to_unset) + clauses.append(f"UNSET TBLPROPERTIES IF EXISTS ({to_unset})") + return clauses + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, TblPropertiesConfig): + return False + return ( + self._without_ignore_list(self.tblproperties) + == self._without_ignore_list(other.tblproperties) + and self.to_unset == other.to_unset + ) + + def _without_ignore_list(self, tblproperties: Dict[str, str]) -> Dict[str, str]: + if tblproperties: + ignore_list = self.ignore_list or ["pipelines.pipelineId"] + return {k: v for k, v in tblproperties.items() if k not in ignore_list} + return tblproperties + + def get_diff(self, other: DatabricksComponentConfig) -> "TblPropertiesConfig": + if not isinstance(other, TblPropertiesConfig): + raise DbtRuntimeError( + f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" + ) + + tblproperties = self._without_ignore_list(self.tblproperties) + other_tblproperties = self._without_ignore_list(other.tblproperties) + + tblproperties = { + k: v + for k, v in tblproperties.items() + if k not in other_tblproperties or v != other_tblproperties[k] + } + + to_unset = [] + for k in other_tblproperties.keys(): + if k not in self.tblproperties: + to_unset.append(k) + + return TblPropertiesConfig( + tblproperties=tblproperties, + ignore_list=self.ignore_list, + 
to_unset=to_unset, + ) + + +class TblPropertiesProcessor(DatabricksComponentProcessor[TblPropertiesConfig]): + name: ClassVar[str] = "tblproperties" + + @classmethod + def from_results(cls, results: RelationResults) -> TblPropertiesConfig: + table = results.get("show_tblproperties") + tblproperties = dict() + + if table: + for row in table.rows: + tblproperties[str(row[0])] = str(row[1]) + + return TblPropertiesConfig(tblproperties) + + @classmethod + def from_model_node(cls, model_node: ModelNode) -> TblPropertiesConfig: + tblproperties_config = model_node.config.extra.get("tblproperties_config") + if tblproperties_config: + ignore_list = tblproperties_config.get("ignore_list") + tblproperties = tblproperties_config.get("tblproperties") + if isinstance(tblproperties, Dict): + tblproperties = {str(k): str(v) for k, v in tblproperties.items()} + elif tblproperties is None: + tblproperties = dict() + else: + raise DbtRuntimeError( + "If tblproperties_config is set, tblproperties must be a dictionary" + ) + + if ignore_list is None: + return TblPropertiesConfig(tblproperties) + + if isinstance(ignore_list, List): + ignore_list = [str(i) for i in ignore_list] + return TblPropertiesConfig(tblproperties, ignore_list) + else: + raise DbtRuntimeError("ignore_list must be a list") + + return TblPropertiesConfig() diff --git a/dbt/adapters/databricks/relation_configs/util.py b/dbt/adapters/databricks/relation_configs/util.py deleted file mode 100644 index 0756149d8..000000000 --- a/dbt/adapters/databricks/relation_configs/util.py +++ /dev/null @@ -1,7 +0,0 @@ -from agate import Table, Row - - -def get_first_row(results: Table) -> Row: - if len(results.rows) == 0: - return Row(values=set()) - return results.rows[0] diff --git a/dbt/include/databricks/macros/materializations/materialized_view.sql b/dbt/include/databricks/macros/materializations/materialized_view.sql new file mode 100644 index 000000000..c3c7f3d18 --- /dev/null +++ 
b/dbt/include/databricks/macros/materializations/materialized_view.sql @@ -0,0 +1,91 @@ +{% materialization materialized_view, databricks %} + {% set existing_relation = load_cached_relation(this) %} + {% set target_relation = this.incorporate(type=this.MaterializedView) %} + + {{ run_hooks(pre_hooks, inside_transaction=False) }} + + {% set build_sql = materialized_view_get_build_sql(existing_relation, target_relation) %} + + {% if build_sql == '' %} + {{ materialized_view_execute_no_op(target_relation) }} + {% else %} + {{ materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) }} + {% endif %} + + {{ run_hooks(post_hooks, inside_transaction=False) }} + + {{ return({'relations': [target_relation]}) }} + +{% endmaterialization %} + + +{% macro materialized_view_get_build_sql(existing_relation, target_relation) %} + + {% set full_refresh_mode = should_full_refresh() %} + + -- determine the scenario we're in: create, full_refresh, alter, refresh data + {% if existing_relation is none %} + {% set build_sql = get_create_materialized_view_as_sql(target_relation, sql) %} + {% elif full_refresh_mode or not existing_relation.is_materialized_view %} + {% set build_sql = get_replace_sql(existing_relation, target_relation, sql) %} + {% else %} + + -- get config options + {% set on_configuration_change = config.get('on_configuration_change') %} + {% set configuration_changes = get_materialized_view_configuration_changes(existing_relation, config) %} + + {% if configuration_changes is none %} + {% set build_sql = [refresh_materialized_view(target_relation)] %} + + {% elif on_configuration_change == 'apply' %} + {% set build_sql = get_alter_materialized_view_as_sql(target_relation, configuration_changes, sql, existing_relation, None, None) %} + {% elif on_configuration_change == 'continue' %} + {% set build_sql = [] %} + {{ exceptions.warn("Configuration changes were identified and `on_configuration_change` was set to `continue` for `" ~ 
target_relation ~ "`") }} + {% elif on_configuration_change == 'fail' %} + {{ exceptions.raise_fail_fast_error("Configuration changes were identified and `on_configuration_change` was set to `fail` for `" ~ target_relation ~ "`") }} + + {% else %} + -- this only happens if the user provides a value other than `apply`, 'skip', 'fail' + {{ exceptions.raise_compiler_error("Unexpected configuration scenario") }} + + {% endif %} + + {% endif %} + + {% do return(build_sql) %} + +{% endmacro %} + + +{% macro materialized_view_execute_no_op(target_relation) %} + {% do store_raw_result( + name="main", + message="skip " ~ target_relation, + code="skip", + rows_affected="-1" + ) %} +{% endmacro %} + + +{% macro materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) %} + + -- `BEGIN` happens here: + {{ run_hooks(pre_hooks, inside_transaction=True) }} + + {% set grant_config = config.get('grants') %} + + {%- for sql in build_sql %} + {% call statement(name="main") %} + {{ build_sql }} + {% endcall %} + {% endfor % + + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {{ run_hooks(post_hooks, inside_transaction=True) }} + +{% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/materializations/materialized_view/materialized_view.sql b/dbt/include/databricks/macros/materializations/materialized_view/materialized_view.sql deleted file mode 100644 index 6aef0539d..000000000 --- a/dbt/include/databricks/macros/materializations/materialized_view/materialized_view.sql +++ /dev/null @@ -1,43 +0,0 @@ -{% materialization materialized_view, adapter='databricks' %} - - {%- set identifier = model['alias'] -%} - - {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%} - {%- set exists_as_materialized_view 
= (old_relation is not none and old_relation.is_materialized_view) -%} - - {%- set target_relation = api.Relation.create( - identifier=identifier, schema=schema, database=database, - type='materializedview') -%} - {% set grant_config = config.get('grants') %} - - {{ run_hooks(pre_hooks) }} - - {%- set full_refresh_mode = should_full_refresh() -%} - - {%- if exists_as_materialized_view and not full_refresh_mode -%} - -- refresh materialized view - {% call statement('main') -%} - {{ get_refresh_materialized_view_sql(target_relation) }} - {%- endcall %} - {%- else -%} - -- If there's a table with the same name and we weren't told to full refresh, - -- that's an error. If we were told to full refresh, drop it. This behavior differs - -- for Snowflake and BigQuery, so multiple dispatch is used. - {%- if old_relation is not none and (not exists_as_materialized_view or full_refresh_mode) -%} - {{ handle_existing_table(full_refresh_mode, old_relation) }} - {%- endif -%} - - -- build model - {% call statement('main') -%} - {{ get_create_materialized_view_as_sql(target_relation, sql) }} - {%- endcall %} - {%- endif -%} - - {% set should_revoke = should_revoke(exists_as_materialized_view, full_refresh_mode) %} - {% do apply_grants(target_relation, grant_config, should_revoke) %} - - {{ run_hooks(post_hooks) }} - - {{ return({'relations': [target_relation]}) }} - -{% endmaterialization %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql b/dbt/include/databricks/macros/relations/materialized_view/alter.sql index e69de29bb..038c6cd08 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/alter.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/alter.sql @@ -0,0 +1,31 @@ +{% macro databricks__get_materialized_view_configuration_changes(existing_relation, new_config) -%} + {% set _existing_materialized_view = adapter.get_relation_config(existing_relation) %} + {%- set materialized_view = 
adapter.materialized_view_config_from_model(config.model) -%} + {%- set _configuration_changes = materialized_view.get_changeset(_existing_materialized_view) -%} + {% do return(_configuration_changes) %} +{% endmacro -%} + +{% macro databricks__get_alter_materialized_view_as_sql( + relation, + configuration_changes, + sql, + existing_relation, + backup_relation, + intermediate_relation +) %} + -- apply a full refresh immediately if needed + {% if configuration_changes.requires_full_refresh %} + + {{ get_replace_sql(existing_relation, relation, sql) }} + + -- otherwise apply individual changes as needed + {% else %} + + {%- set autorefresh = configuration_changes.autorefresh -%} + {%- if autorefresh -%}{{- log('Applying UPDATE AUTOREFRESH to: ' ~ relation) -}}{%- endif -%} + + alter materialized view {{ relation }} + auto refresh {% if autorefresh.context %}yes{% else %}no{% endif %} + + {%- endif -%} +{% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/materialized_view/create.sql b/dbt/include/databricks/macros/relations/materialized_view/create.sql index 3376d8761..b2d500939 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/create.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/create.sql @@ -5,6 +5,7 @@ create materialized view {{ relation }} {% materialized_view.partition_by.to_sql_clause() -%} {% materialized_view.comment.to_sql_clause() -%} + {% materialized_view.tblproperties.to_sql_clause() -%} {% materialized_view.refresh.to_sql_clause() -%} as {{ sql }} diff --git a/dbt/include/databricks/macros/relations/materialized_view/refresh.sql b/dbt/include/databricks/macros/relations/materialized_view/refresh.sql index 47a99d243..651b3082d 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/refresh.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/refresh.sql @@ -1,7 +1,3 @@ -{% macro get_refresh_materialized_view_sql(relation) -%} - {{ 
adapter.dispatch('get_refresh_materialized_view_sql', 'dbt')(relation) }} -{%- endmacro %} - -{% macro databricks__get_refresh_materialized_view_sql(relation) -%} +{% macro databricks__refresh_materialized_view(relation) -%} refresh materialized view {{ relation }} {% endmacro %} \ No newline at end of file diff --git a/tests/unit/relation_configs/test_config_base.py b/tests/unit/relation_configs/test_config_base.py new file mode 100644 index 000000000..2e7286376 --- /dev/null +++ b/tests/unit/relation_configs/test_config_base.py @@ -0,0 +1,119 @@ +from dataclasses import dataclass +import pytest +from dbt.adapters.databricks.relation_configs.base import ( + DatabricksComponentConfig, + DatabricksConfigChange, + DatabricksRelationChangeSet, +) +from dbt.exceptions import DbtRuntimeError + +from dbt.adapters.databricks.relation_configs.comment import CommentConfig +from dbt.adapters.databricks.relation_configs.base import RelationConfigChangeAction +from dbt.adapters.databricks.relation_configs.refresh import ( + ManualRefreshConfig, + ScheduledRefreshConfig, +) +from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class MockComponentConfig(DatabricksComponentConfig): + data: int = 1 + + def to_sql_clause(self) -> str: + return "" + + +class TestDatabricksComponentConfig: + def test_get_diff__invalid_type(self): + config = MockComponentConfig() + other = CommentConfig() + + with pytest.raises(DbtRuntimeError, match="Cannot diff"): + config.get_diff(other) + + def test_get_diff__valid_type(self): + config = MockComponentConfig() + other = MockComponentConfig(data=2) + + assert config.get_diff(other) == config + + +class TestDatabricksConfigChange: + def test_requires_full_refresh__unalterable(self): + change = DatabricksConfigChange(RelationConfigChangeAction.alter, MockComponentConfig()) + assert change.requires_full_refresh + + def test_requires_full_refresh__alterable(self): 
+ change = DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()) + assert not change.requires_full_refresh + + def test_get_change__no_change(self): + change = DatabricksConfigChange.get_change(ManualRefreshConfig(), ManualRefreshConfig()) + assert change is None + + def test_get_change__with_diff(self): + change = DatabricksConfigChange.get_change( + ManualRefreshConfig(), ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + ) + assert change == DatabricksConfigChange( + RelationConfigChangeAction.alter, ManualRefreshConfig() + ) + + +class TestDatabricksRelationChangeSet: + def test_requires_full_refresh__no_changes(self): + changeset = DatabricksRelationChangeSet([]) + assert not changeset.requires_full_refresh + + def test_requires_full_refresh__has_only_alterable_changes(self): + changeset = DatabricksRelationChangeSet( + [DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig())] + ) + assert not changeset.requires_full_refresh + + def test_requires_full_refresh__has_an_inalterable_change(self): + changeset = DatabricksRelationChangeSet( + [ + DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()), + DatabricksConfigChange(RelationConfigChangeAction.alter, CommentConfig()), + ] + ) + assert changeset.requires_full_refresh + + def test_has_changes__no_changes(self): + changeset = DatabricksRelationChangeSet([]) + assert not changeset.has_changes + + def test_has_changes__has_changes(self): + changeset = DatabricksRelationChangeSet( + [DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig())] + ) + assert changeset.has_changes + + def test_get_alter_sql_clauses__no_changes(self): + changeset = DatabricksRelationChangeSet([]) + assert changeset.get_alter_sql_clauses() == [] + + def test_get_alter_sql_clauses__with_inalterable_change(self): + changeset = DatabricksRelationChangeSet( + [DatabricksConfigChange(RelationConfigChangeAction.alter, 
CommentConfig())] + ) + with pytest.raises(AssertionError): + changeset.get_alter_sql_clauses() + + def test_get_alter_sql_clauses__with_alterable_change(self): + changeset = DatabricksRelationChangeSet( + [ + DatabricksConfigChange( + RelationConfigChangeAction.alter, + TblPropertiesConfig(tblproperties={"key": "value"}, to_unset=["other"]), + ), + DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()), + ] + ) + assert changeset.get_alter_sql_clauses() == [ + "SET TBLPROPERTIES ('key' = 'value')", + "UNSET TBLPROPERTIES IF EXISTS ('other')", + "DROP SCHEDULE", + ] diff --git a/tests/unit/relation_configs/test_materialized_view_config.py b/tests/unit/relation_configs/test_materialized_view_config.py new file mode 100644 index 000000000..42814b75c --- /dev/null +++ b/tests/unit/relation_configs/test_materialized_view_config.py @@ -0,0 +1,68 @@ +from agate import Table, Row +from mock import Mock +from dbt.adapters.databricks.relation_configs.comment import CommentConfig +from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig +from dbt.adapters.databricks.relation_configs.partitioning import PartitionedByConfig +from dbt.adapters.databricks.relation_configs.query import QueryConfig +from dbt.adapters.databricks.relation_configs.refresh import ManualRefreshConfig +from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig + + +class TestMaterializedViewConfig: + def test_from_results(self): + results = { + "describe_extended": Table( + rows=[ + ["col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + ["# Partition Information", None, None], + ["# col_name", "data_type", "comment"], + ["col_a", "int", "This is a comment"], + ["col_b", "int", "This is a comment"], + [None, None, None], + ["# Detailed Table Information", None, None], + ["Catalog:", "default", None], + ["Comment", "This is the table comment", None], + ["Refresh Schedule", "MANUAL", None], + 
] + ), + "information_schema.views": Row( + ["select * from foo", "other"], ["view_definition", "comment"] + ), + "show_tblproperties": Table(rows=[["prop", "1"], ["other", "other"]]), + } + + config = MaterializedViewConfig.from_results(results) + + assert config == MaterializedViewConfig( + PartitionedByConfig(["col_a", "col_b"]), + CommentConfig("This is the table comment"), + TblPropertiesConfig({"prop": "1", "other": "other"}), + ManualRefreshConfig(), + QueryConfig("select * from foo"), + ) + + def test_from_model_node(self): + model = Mock() + model.compiled_code = "select * from foo" + model.config.extra = { + "partition_by": ["col_a", "col_b"], + "tblproperties_config": { + "tblproperties": { + "prop": "1", + "other": "other", + } + }, + "refresh": "manual", + } + model.description = "This is the table comment" + + config = MaterializedViewConfig.from_model_node(model) + + assert config == MaterializedViewConfig( + PartitionedByConfig(["col_a", "col_b"]), + CommentConfig("This is the table comment"), + TblPropertiesConfig({"prop": "1", "other": "other"}), + ManualRefreshConfig(), + QueryConfig("select * from foo"), + ) diff --git a/tests/unit/relation_configs/test_partitioning.py b/tests/unit/relation_configs/test_partitioning.py index 24242f6c6..56ce49fd3 100644 --- a/tests/unit/relation_configs/test_partitioning.py +++ b/tests/unit/relation_configs/test_partitioning.py @@ -80,18 +80,18 @@ def test_from_results__multiple(self): def test_from_model_node__without_partition_by(self): model = Mock() - model.config.extra.get.return_value = None + model.config.extra = {} spec = PartitionedByProcessor.from_model_node(model) assert spec == PartitionedByConfig(None) def test_from_model_node__single_column(self): model = Mock() - model.config.extra.get.return_value = "col_a" + model.config.extra = {"partition_by": "col_a"} spec = PartitionedByProcessor.from_model_node(model) assert spec == PartitionedByConfig(["col_a"]) def 
test_from_model_node__multiple_columns(self): model = Mock() - model.config.extra.get.return_value = ["col_a", "col_b"] + model.config.extra = {"partition_by": ["col_a", "col_b"]} spec = PartitionedByProcessor.from_model_node(model) assert spec == PartitionedByConfig(["col_a", "col_b"]) diff --git a/tests/unit/relation_configs/test_query.py b/tests/unit/relation_configs/test_query.py new file mode 100644 index 000000000..8fa4dbec3 --- /dev/null +++ b/tests/unit/relation_configs/test_query.py @@ -0,0 +1,30 @@ +from agate import Row +from mock import Mock +import pytest +from dbt.exceptions import DbtRuntimeError +from dbt.adapters.databricks.relation_configs.query import QueryConfig, QueryProcessor + +sql = "select * from foo" + + +class TestQueryProcessor: + def test_from_results(self): + results = {"information_schema.views": Row([sql, "other"], ["view_definition", "comment"])} + spec = QueryProcessor.from_results(results) + assert spec == QueryConfig(sql) + + def test_from_model_node__with_query(self): + model = Mock() + model.compiled_code = sql + spec = QueryProcessor.from_model_node(model) + assert spec == QueryConfig(sql) + + def test_from_model_node__without_query(self): + model = Mock() + model.compiled_code = None + model.unique_id = "1" + with pytest.raises( + DbtRuntimeError, + match="Cannot compile model 1 with no SQL query", + ): + spec = QueryProcessor.from_model_node(model) diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index 7189e8125..ee8395c18 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -50,13 +50,13 @@ def test_from_results__invalid(self, rows): def test_from_model_node__without_schedule(self): model = Mock() - model.config.extra.get.return_value = {} + model.config.extra = {} spec = RefreshProcessor.from_model_node(model) assert spec == ManualRefreshConfig() def test_from_model_node__without_cron(self): model = Mock() - 
model.config.extra.get.return_value = {"time_zone_value": "UTC"} + model.config.extra = {"schedule": {"time_zone_value": "UTC"}} with pytest.raises( DbtRuntimeError, match="Schedule config must contain a 'cron' key, got {'time_zone_value': 'UTC'}.", @@ -65,22 +65,82 @@ def test_from_model_node__without_cron(self): def test_from_model_node__without_timezone(self): model = Mock() - model.config.extra.get.return_value = {"cron": "*/5 * * * *"} + model.config.extra = {"schedule": {"cron": "*/5 * * * *"}} spec = RefreshProcessor.from_model_node(model) assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) def test_process_model_node__both(self): model = Mock() - model.config.extra.get.return_value = {"cron": "*/5 * * * *", "time_zone_value": "UTC"} + model.config.extra = {"schedule": {"cron": "*/5 * * * *", "time_zone_value": "UTC"}} spec = RefreshProcessor.from_model_node(model) assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") class TestScheduledRefreshConfig: - def test_to_schedule_clause__no_timezone(self): + def test_to_sql_clause__no_timezone(self): spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *'" - def test_to_schedule_clause__with_timezone(self): + def test_to_sql_clause__with_timezone(self): spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'" + + def test_to_alter_sql_clauses__alter_false(self): + spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC", alter=False) + assert spec.to_alter_sql_clauses() == [ + "ADD SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'", + ] + + def test_to_alter_sql_clauses__alter_true(self): + spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC", alter=True) + assert spec.to_alter_sql_clauses() == [ + "ALTER SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'", + ] + + 
def test_get_diff__other_not_refresh(self): + config = ScheduledRefreshConfig(cron="*/5 * * * *") + other = Mock() + with pytest.raises( + DbtRuntimeError, + match="Cannot diff ScheduledRefreshConfig with Mock", + ): + config.get_diff(other) + + def test_get_diff__other_manual_refresh(self): + config = ScheduledRefreshConfig(cron="*/5 * * * *") + other = ManualRefreshConfig() + diff = config.get_diff(other) + assert diff == ScheduledRefreshConfig(cron="*/5 * * * *", alter=False) + + def test_get_diff__other_scheduled_refresh(self): + config = ScheduledRefreshConfig(cron="*/5 * * * *") + other = ScheduledRefreshConfig(cron="0 * * * *") + diff = config.get_diff(other) + assert diff == ScheduledRefreshConfig(cron="*/5 * * * *", alter=True) + + +class TestManualRefreshConfig: + def test_get_diff__other_not_refresh(self): + config = ManualRefreshConfig() + other = Mock() + with pytest.raises( + DbtRuntimeError, + match="Cannot diff ManualRefreshConfig with Mock", + ): + config.get_diff(other) + + def test_get_diff__other_scheduled_refresh(self): + config = ManualRefreshConfig() + other = ScheduledRefreshConfig(cron="*/5 * * * *") + diff = config.get_diff(other) + assert diff == config + + def test_get_diff__other_manual_refresh(self): + config = ManualRefreshConfig() + other = ManualRefreshConfig() + diff = config.get_diff(other) + assert diff == config + + def test_to_alter_sql_clauses(self): + spec = ManualRefreshConfig() + assert spec.to_alter_sql_clauses() == ["DROP SCHEDULE"] diff --git a/tests/unit/relation_configs/test_tblproperties.py b/tests/unit/relation_configs/test_tblproperties.py new file mode 100644 index 000000000..bc1b675b8 --- /dev/null +++ b/tests/unit/relation_configs/test_tblproperties.py @@ -0,0 +1,185 @@ +from mock import Mock +import pytest +from agate import Table + +from dbt.adapters.databricks.relation_configs.tblproperties import ( + TblPropertiesConfig, + TblPropertiesProcessor, +) +from dbt.exceptions import DbtRuntimeError + + +class 
TestTblPropertiesConfig: + def test_post_init__conflicting_tblproperties(self): + with pytest.raises( + DbtRuntimeError, + match="Cannot set and unset the same tblproperty in the same model", + ): + config = TblPropertiesConfig({"prop": "1"}, to_unset=["prop"]) + + @pytest.mark.parametrize( + "input,expected", + [ + (None, ""), + (dict(), ""), + ({"prop": 1}, "TBLPROPERTIES ('prop' = '1')"), + ({"prop": 1, "other": "thing"}, "TBLPROPERTIES ('prop' = '1', 'other' = 'thing')"), + ], + ) + def test_to_sql_clause(self, input, expected): + config = TblPropertiesConfig(tblproperties=input) + assert config.to_sql_clause() == expected + + def test_eq__match(self): + config1 = TblPropertiesConfig(tblproperties={"prop": "1"}) + config2 = TblPropertiesConfig(tblproperties={"prop": "1"}) + assert config1 == config2 + + def test_eq__mismatch(self): + config1 = TblPropertiesConfig(tblproperties={"prop": "1"}) + config2 = TblPropertiesConfig(tblproperties={"prop": "2"}) + assert config1 != config2 + + def test_eq__ignore_list(self): + config1 = TblPropertiesConfig(tblproperties={"prop": "1"}, ignore_list=["prop"]) + config2 = TblPropertiesConfig(tblproperties={"prop": "2"}) + assert config1 == config2 + + def test_eq__ignore_list_extra(self): + config1 = TblPropertiesConfig( + tblproperties={"prop": "1", "other": "other"}, ignore_list=["prop"] + ) + config2 = TblPropertiesConfig(tblproperties={"prop": "2", "other": "other"}) + assert config1 == config2 + + def test_eq__default_ignore_list(self): + config1 = TblPropertiesConfig(tblproperties={"pipelines.pipelineId": "1"}) + config2 = TblPropertiesConfig(tblproperties={}) + assert config1 == config2 + + def test_get_diff__same_properties(self): + config = TblPropertiesConfig(tblproperties={"prop": "1"}) + diff = config.get_diff(config) + assert diff == TblPropertiesConfig() + + def test_get_diff__other_has_added_prop(self): + config = TblPropertiesConfig(tblproperties={"prop": "1"}) + other = 
TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}) + diff = config.get_diff(other) + assert diff == TblPropertiesConfig(to_unset=["other"]) + + def test_get_diff__other_has_diff_value(self): + config = TblPropertiesConfig(tblproperties={"prop": "1"}) + other = TblPropertiesConfig(tblproperties={"prop": "2"}) + diff = config.get_diff(other) + assert diff == TblPropertiesConfig(tblproperties={"prop": "1"}) + + def test_get_diff__mix(self): + config = TblPropertiesConfig( + tblproperties={"prop": "1", "other": "2", "third": "3"}, ignore_list=["third"] + ) + other = TblPropertiesConfig( + tblproperties={"prop": "2", "other": "2", "third": "4", "fourth": "5"} + ) + diff = config.get_diff(other) + assert diff == TblPropertiesConfig(tblproperties={"prop": "1"}, to_unset=["fourth"]) + + def test_to_alter_sql_clauses__none(self): + config = TblPropertiesConfig() + assert config.to_alter_sql_clauses() == [] + + def test_to_alter_sql_clauses__set_only(self): + config = TblPropertiesConfig({"prop": "1", "other": "other"}) + assert config.to_alter_sql_clauses() == [ + "SET TBLPROPERTIES ('prop' = '1', 'other' = 'other')" + ] + + def test_to_alter_sql_clauses__unset_only(self): + config = TblPropertiesConfig(to_unset=["prop", "other"]) + assert config.to_alter_sql_clauses() == ["UNSET TBLPROPERTIES IF EXISTS ('prop', 'other')"] + + def test_to_alter_sql_clauses__both(self): + config = TblPropertiesConfig( + {"prop": "1"}, + to_unset=["other"], + ) + assert config.to_alter_sql_clauses() == [ + "SET TBLPROPERTIES ('prop' = '1')", + "UNSET TBLPROPERTIES IF EXISTS ('other')", + ] + + +class TestTblPropertiesProcessor: + def test_from_results__none(self): + results = {"show_tblproperties": None} + spec = TblPropertiesProcessor.from_results(results) + assert spec == TblPropertiesConfig() + + def test_from_results__single(self): + results = {"show_tblproperties": Table(rows=[["prop", "f1"]])} + spec = TblPropertiesProcessor.from_results(results) + assert spec == 
TblPropertiesConfig(tblproperties={"prop": "f1"}) + + def test_from_results__multiple(self): + results = {"show_tblproperties": Table(rows=[["prop", "1"], ["other", "other"]])} + spec = TblPropertiesProcessor.from_results(results) + assert spec == TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}) + + def test_from_model_node__without_tblproperties_config(self): + model = Mock() + model.config.extra = {} + spec = TblPropertiesProcessor.from_model_node(model) + assert spec == TblPropertiesConfig() + + def test_from_model_node__with_all_config(self): + model = Mock() + model.config.extra = { + "tblproperties_config": { + "tblproperties": {"prop": 1}, + "ignore_list": ["other"], + } + } + spec = TblPropertiesProcessor.from_model_node(model) + assert spec == TblPropertiesConfig(tblproperties={"prop": "1"}, ignore_list=["other"]) + + def test_from_model_node__without_ignore_list(self): + model = Mock() + model.config.extra = { + "tblproperties_config": { + "tblproperties": {"prop": 1}, + } + } + spec = TblPropertiesProcessor.from_model_node(model) + assert spec == TblPropertiesConfig(tblproperties={"prop": "1"}) + + def test_from_model_node__without_tblproperties(self): + model = Mock() + model.config.extra = { + "tblproperties_config": { + "ignore_list": ["other"], + } + } + spec = TblPropertiesProcessor.from_model_node(model) + assert spec == TblPropertiesConfig(ignore_list=["other"]) + + def test_from_model_node__with_incorrect_tblproperties(self): + model = Mock() + model.config.extra = { + "tblproperties_config": {"ignore_list": ["other"], "tblproperties": True} + } + with pytest.raises( + DbtRuntimeError, + match="tblproperties must be a dictionary", + ): + TblPropertiesProcessor.from_model_node(model) + + def test_from_model_node__with_incorrect_ignore_list(self): + model = Mock() + model.config.extra = { + "tblproperties_config": {"tblproperties": {"prop": 1}, "ignore_list": True} + } + with pytest.raises( + DbtRuntimeError, + match="ignore_list 
must be a list", + ): + TblPropertiesProcessor.from_model_node(model) diff --git a/tests/unit/relation_configs/test_util.py b/tests/unit/relation_configs/test_util.py deleted file mode 100644 index c45e89216..000000000 --- a/tests/unit/relation_configs/test_util.py +++ /dev/null @@ -1,16 +0,0 @@ -from agate import Table, Row -import pytest -from dbt.adapters.databricks.relation_configs import util - - -class TestGetFirstRow: - @pytest.mark.parametrize( - "input,expected", - [ - (Table([]), Row(set())), - (Table([Row(["first", "row"]), Row(["second", "row"])]), Row(["first", "row"])), - ], - ) - def test_get_first_row(self, input, expected): - first_row = util.get_first_row(input) - assert first_row == expected From 4c6fb7cbe3ff3312959bee6c871c3b874c26c638 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 15:40:23 -0800 Subject: [PATCH 06/35] type issue --- dbt/adapters/databricks/impl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index ff2b9435b..0fd822fa4 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -59,6 +59,7 @@ DatabricksRelation, DatabricksRelationType, ) +from dbt.adapters.databricks.relation_configs.base import DatabricksRelationConfigBase from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.utils import redact_credentials, undefined_proof from dbt.adapters.relation_configs.config_base import RelationResults @@ -743,7 +744,7 @@ def materialized_view_config_from_model(self, model: ModelNode) -> MaterializedV return MaterializedViewConfig.from_model_node(model) # type: ignore @available.parse_none - def get_relation_config(self, relation: DatabricksRelation) -> MaterializedViewConfig: + def get_relation_config(self, relation: DatabricksRelation) -> DatabricksRelationConfigBase: if relation.type == RelationType.MaterializedView: results = 
self.describe_materialized_view(relation) return MaterializedViewConfig.from_results(results) From d2579bb42b1896e490f11b2639591fc2f0e32d50 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 15:54:15 -0800 Subject: [PATCH 07/35] macro issue fix --- .../macros/relations/materialized_view/create.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbt/include/databricks/macros/relations/materialized_view/create.sql b/dbt/include/databricks/macros/relations/materialized_view/create.sql index b2d500939..267c3a575 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/create.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/create.sql @@ -3,10 +3,10 @@ {%- set materialized_view = adapter.materialized_view_from_model(config.model) -%} create materialized view {{ relation }} - {% materialized_view.partition_by.to_sql_clause() -%} - {% materialized_view.comment.to_sql_clause() -%} - {% materialized_view.tblproperties.to_sql_clause() -%} - {% materialized_view.refresh.to_sql_clause() -%} + {{ materialized_view.partition_by.to_sql_clause() }} + {{ materialized_view.comment.to_sql_clause() }} + {{ materialized_view.tblproperties.to_sql_clause() }} + {{ materialized_view.refresh.to_sql_clause() }} as {{ sql }} {% endmacro %} \ No newline at end of file From 2d1cb773a8df3b18bcb22aa2064a3cc76fd1c788 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 15:57:28 -0800 Subject: [PATCH 08/35] materialization typo --- .../databricks/macros/materializations/materialized_view.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/databricks/macros/materializations/materialized_view.sql b/dbt/include/databricks/macros/materializations/materialized_view.sql index c3c7f3d18..a30e9e0f0 100644 --- a/dbt/include/databricks/macros/materializations/materialized_view.sql +++ b/dbt/include/databricks/macros/materializations/materialized_view.sql @@ -1,4 +1,4 @@ -{% materialization 
materialized_view, databricks %} +{% materialization materialized_view, adapter = 'databricks' %} {% set existing_relation = load_cached_relation(this) %} {% set target_relation = this.incorporate(type=this.MaterializedView) %} From 0b425342240d461edd9ce0f1c5569043ab4b51e6 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 16:00:43 -0800 Subject: [PATCH 09/35] address dependabot issue --- dev-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 22b46a03b..87c22daa3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -23,8 +23,8 @@ tox>=3.2.0 types-requests types-mock -dbt-core==1.7.1 -dbt-tests-adapter==1.7.1 +dbt-core==1.7.3 +dbt-tests-adapter==1.7.3 # git+https://github.com/dbt-labs/dbt-spark.git@1.5.latest#egg=dbt-spark # git+https://github.com/dbt-labs/dbt-core.git@1.5.latest#egg=dbt-core&subdirectory=core # git+https://github.com/dbt-labs/dbt-core.git@1.5.latest#egg=dbt-tests-adapter&subdirectory=tests/adapter From d163d285ffeed2551a4137b13781446c7c8b2425 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 16:01:51 -0800 Subject: [PATCH 10/35] minor macro issue --- .../databricks/macros/materializations/materialized_view.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/databricks/macros/materializations/materialized_view.sql b/dbt/include/databricks/macros/materializations/materialized_view.sql index a30e9e0f0..3a74f1ce3 100644 --- a/dbt/include/databricks/macros/materializations/materialized_view.sql +++ b/dbt/include/databricks/macros/materializations/materialized_view.sql @@ -79,7 +79,7 @@ {% call statement(name="main") %} {{ build_sql }} {% endcall %} - {% endfor % + {% endfor %} {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} From c948b1bb1f839ae32656856bd3b1ba26501a0752 Mon Sep 17 
00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 16:16:02 -0800 Subject: [PATCH 11/35] lint passing --- dbt/adapters/databricks/impl.py | 4 ++-- dbt/adapters/databricks/relation.py | 3 ++- dbt/adapters/databricks/relation_configs/refresh.py | 7 ++++--- tests/unit/relation_configs/test_comment.py | 2 +- tests/unit/relation_configs/test_query.py | 2 +- tests/unit/relation_configs/test_refresh.py | 1 - tests/unit/relation_configs/test_tblproperties.py | 6 +++--- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 0fd822fa4..b7b7731dc 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -47,6 +47,7 @@ import dbt.exceptions from dbt.events import AdapterLogger from dbt.utils import executor +from dbt.adapters.relation_configs.config_base import RelationConfigBase from dbt.adapters.databricks.column import DatabricksColumn from dbt.adapters.databricks.connections import DatabricksConnectionManager @@ -59,7 +60,6 @@ DatabricksRelation, DatabricksRelationType, ) -from dbt.adapters.databricks.relation_configs.base import DatabricksRelationConfigBase from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.utils import redact_credentials, undefined_proof from dbt.adapters.relation_configs.config_base import RelationResults @@ -744,7 +744,7 @@ def materialized_view_config_from_model(self, model: ModelNode) -> MaterializedV return MaterializedViewConfig.from_model_node(model) # type: ignore @available.parse_none - def get_relation_config(self, relation: DatabricksRelation) -> DatabricksRelationConfigBase: + def get_relation_config(self, relation: DatabricksRelation) -> RelationConfigBase: if relation.type == RelationType.MaterializedView: results = self.describe_materialized_view(relation) return MaterializedViewConfig.from_results(results) diff --git a/dbt/adapters/databricks/relation.py 
b/dbt/adapters/databricks/relation.py index 38b0a7602..b1cbc8a4f 100644 --- a/dbt/adapters/databricks/relation.py +++ b/dbt/adapters/databricks/relation.py @@ -73,7 +73,8 @@ def __pre_deserialize__(cls, data: Dict[Any, Any]) -> Dict[Any, Any]: } ) - # list relations that can be atomically replaced (e.g. `CREATE OR REPLACE my_relation..` versus `DROP` and `CREATE`) + # list relations that can be atomically replaced (e.g. `CREATE OR REPLACE my_relation..` + # versus `DROP` and `CREATE`) replaceable_relations = frozenset( { RelationType.View, diff --git a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py index 2b5e9a777..54ef3c96f 100644 --- a/dbt/adapters/databricks/relation_configs/refresh.py +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -89,11 +89,12 @@ def from_results(cls, results: RelationResults) -> RefreshConfig: raise DbtRuntimeError( f"Could not parse schedule from description: {row[1]}." - " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" + " This is most likely a bug in the dbt-databricks adapter," + " so please file an issue!" ) raise DbtRuntimeError( - f"Could not parse schedule for table." + "Could not parse schedule for table." " This is most likely a bug in the dbt-databricks adapter, so please file an issue!" 
) @@ -102,7 +103,7 @@ def from_model_node(cls, model_node: ModelNode) -> RefreshConfig: schedule = model_node.config.extra.get("schedule") if schedule: if "cron" not in schedule: - raise DbtRuntimeError(f"Schedule config must contain a 'cron' key, got {schedule}.") + raise DbtRuntimeError(f"Schedule config must contain a 'cron' key, got {schedule}") return ScheduledRefreshConfig( cron=schedule["cron"], time_zone_value=schedule.get("time_zone_value") ) diff --git a/tests/unit/relation_configs/test_comment.py b/tests/unit/relation_configs/test_comment.py index 0a45e731c..4525c42d5 100644 --- a/tests/unit/relation_configs/test_comment.py +++ b/tests/unit/relation_configs/test_comment.py @@ -1,5 +1,5 @@ import pytest -from agate import Table, Row +from agate import Table from dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor diff --git a/tests/unit/relation_configs/test_query.py b/tests/unit/relation_configs/test_query.py index 8fa4dbec3..ed25aa0b1 100644 --- a/tests/unit/relation_configs/test_query.py +++ b/tests/unit/relation_configs/test_query.py @@ -27,4 +27,4 @@ def test_from_model_node__without_query(self): DbtRuntimeError, match="Cannot compile model 1 with no SQL query", ): - spec = QueryProcessor.from_model_node(model) + _ = QueryProcessor.from_model_node(model) diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index ee8395c18..6f7e3c6f8 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -1,7 +1,6 @@ from typing import Any, List from mock import Mock import pytest -from agate import Row from dbt.adapters.databricks.relation_configs.refresh import ( RefreshProcessor, ManualRefreshConfig, diff --git a/tests/unit/relation_configs/test_tblproperties.py b/tests/unit/relation_configs/test_tblproperties.py index bc1b675b8..606196282 100644 --- a/tests/unit/relation_configs/test_tblproperties.py +++ 
b/tests/unit/relation_configs/test_tblproperties.py @@ -15,7 +15,7 @@ def test_post_init__conflicting_tblproperties(self): DbtRuntimeError, match="Cannot set and unset the same tblproperty in the same model", ): - config = TblPropertiesConfig({"prop": "1"}, to_unset=["prop"]) + _ = TblPropertiesConfig({"prop": "1"}, to_unset=["prop"]) @pytest.mark.parametrize( "input,expected", @@ -171,7 +171,7 @@ def test_from_model_node__with_incorrect_tblproperties(self): DbtRuntimeError, match="tblproperties must be a dictionary", ): - TblPropertiesProcessor.from_model_node(model) + _ = TblPropertiesProcessor.from_model_node(model) def test_from_model_node__with_incorrect_ignore_list(self): model = Mock() @@ -182,4 +182,4 @@ def test_from_model_node__with_incorrect_ignore_list(self): DbtRuntimeError, match="ignore_list must be a list", ): - TblPropertiesProcessor.from_model_node(model) + _ = TblPropertiesProcessor.from_model_node(model) From e1ae8701f8657a4a5097f5fb941be0b4b207c49a Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 16:27:18 -0800 Subject: [PATCH 12/35] fixing unit tests, hopefully --- .python-version | 2 +- dbt/adapters/databricks/relation_configs/base.py | 4 ++-- tests/unit/relation_configs/test_refresh.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.python-version b/.python-version index 2c0733315..cc1923a40 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.11 +3.8 diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index c37830222..bc73c7b07 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import ClassVar, Dict, Generic, List, Optional, TypeVar -from typing_extensions import Self +from typing_extensions import Self, Type from dbt.adapters.relation_configs.config_base import 
RelationConfigBase, RelationResults from dbt.contracts.graph.nodes import ModelNode @@ -93,7 +93,7 @@ def from_model_node(cls, model_node: ModelNode) -> Component: @dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksRelationConfigBase(RelationConfigBase): - config_components: ClassVar[List[type[DatabricksComponentProcessor]]] + config_components: ClassVar[List[Type[DatabricksComponentProcessor]]] @classmethod def from_model_node(cls, model_node: ModelNode) -> RelationConfigBase: diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index 6f7e3c6f8..3185fd991 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -58,7 +58,7 @@ def test_from_model_node__without_cron(self): model.config.extra = {"schedule": {"time_zone_value": "UTC"}} with pytest.raises( DbtRuntimeError, - match="Schedule config must contain a 'cron' key, got {'time_zone_value': 'UTC'}.", + match="Schedule config must contain a 'cron' key, got {'time_zone_value': 'UTC'}", ): RefreshProcessor.from_model_node(model) From f0fb0a7337a877ca8c6551c9c644f8e713786cae Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Mon, 18 Dec 2023 16:32:31 -0800 Subject: [PATCH 13/35] does this fix it? 
--- tests/unit/relation_configs/test_refresh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index 3185fd991..2ea6256c5 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -12,7 +12,7 @@ class TestRefreshProcessor: @pytest.fixture - def rows(self) -> List[List[str | Any]]: + def rows(self) -> List[List[Any]]: return [ ["col_name", "data_type", "comment"], ["col_a", "int", "This is a comment"], From f3f1cd7535096b714a1fc4e7f15a51d4cae62ff7 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 19 Dec 2023 08:45:49 -0800 Subject: [PATCH 14/35] wip --- .../test_materialized_views.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/functional/adapter/materialized_view_tests/test_materialized_views.py diff --git a/tests/functional/adapter/materialized_view_tests/test_materialized_views.py b/tests/functional/adapter/materialized_view_tests/test_materialized_views.py new file mode 100644 index 000000000..54d00e81b --- /dev/null +++ b/tests/functional/adapter/materialized_view_tests/test_materialized_views.py @@ -0,0 +1,24 @@ +from ast import Tuple +from typing import Optional +from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic +from dbt.adapters.base.relation import BaseRelation + + +class TestMaterializedViewsMixin: + @staticmethod + def insert_record(project, table: BaseRelation, record: Tuple[int, int]): + project.run_sql(f"insert into {table} values {record}") + + @staticmethod + def refresh_materialized_view(project, materialized_view: BaseRelation): + project.run_sql(f"refresh materialized view {materialized_view}") + + @staticmethod + def query_row_count(project, relation: BaseRelation) -> int: + project.run_sql(f"select count(*) from {relation}") + + @staticmethod + def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + 
project.run_sql(f"select is_materialized from {relation.information_schema_only}") + +class TestMaterializedViews(MaterializedViewBasic): From 383dc9a734f654f3e2a3214462f947bb0c805d68 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 19 Dec 2023 15:33:17 -0800 Subject: [PATCH 15/35] passing first set of functional tests --- dbt/adapters/databricks/impl.py | 10 ++-- dbt/adapters/databricks/relation.py | 2 +- dbt/adapters/databricks/utils.py | 7 +++ .../databricks/macros/adapters/metadata.sql | 8 +++ .../materializations/materialized_view.sql | 15 +++-- .../databricks/macros/relations/drop.sql | 6 ++ .../relations/materialized_view/alter.sql | 35 +++++++++--- .../relations/materialized_view/create.sql | 2 +- .../databricks/macros/relations/replace.sql | 56 +++++++++++++++++++ .../test_materialized_views.py | 42 +++++++++++--- tests/unit/test_utils.py | 15 ++++- 11 files changed, 169 insertions(+), 29 deletions(-) create mode 100644 dbt/include/databricks/macros/relations/replace.sql diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index b7b7731dc..4ca91c637 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -61,7 +61,7 @@ DatabricksRelationType, ) from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig -from dbt.adapters.databricks.utils import redact_credentials, undefined_proof +from dbt.adapters.databricks.utils import redact_credentials, undefined_proof, get_first_row from dbt.adapters.relation_configs.config_base import RelationResults @@ -755,13 +755,15 @@ def get_relation_config(self, relation: DatabricksRelation) -> RelationConfigBas ) def describe_materialized_view(self, relation: DatabricksRelation) -> RelationResults: - kwargs = {"relation": relation} + kwargs = {"table_name": relation} results: RelationResults = dict() results["describe_extended"] = self.execute_macro( DESCRIBE_TABLE_EXTENDED_MACRO_NAME, kwargs=kwargs ) - 
results["information_schema.views"] = self.execute_macro( - "get_view_description", kwargs=kwargs + + kwargs = {"relation": relation} + results["information_schema.views"] = get_first_row( + self.execute_macro("get_view_description", kwargs=kwargs) ) results["show_tblproperties"] = self.execute_macro("fetch_tbl_properties", kwargs=kwargs) return results diff --git a/dbt/adapters/databricks/relation.py b/dbt/adapters/databricks/relation.py index b1cbc8a4f..17cdc5501 100644 --- a/dbt/adapters/databricks/relation.py +++ b/dbt/adapters/databricks/relation.py @@ -33,7 +33,7 @@ class DatabricksRelationType(StrEnum): Table = "table" View = "view" CTE = "cte" - MaterializedView = "materializedview" + MaterializedView = "materialized_view" External = "external" StreamingTable = "streamingtable" diff --git a/dbt/adapters/databricks/utils.py b/dbt/adapters/databricks/utils.py index 9fbbc411c..d2df7ee9e 100644 --- a/dbt/adapters/databricks/utils.py +++ b/dbt/adapters/databricks/utils.py @@ -5,6 +5,7 @@ from dbt.adapters.base import BaseAdapter from jinja2.runtime import Undefined +from agate import Table, Row A = TypeVar("A", bound=BaseAdapter) @@ -77,3 +78,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: def remove_ansi(line: str) -> str: ansi_escape = re.compile(r"(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]") return ansi_escape.sub("", line) + + +def get_first_row(results: Table) -> Row: + if len(results.rows) == 0: + return Row(values=set()) + return results.rows[0] diff --git a/dbt/include/databricks/macros/adapters/metadata.sql b/dbt/include/databricks/macros/adapters/metadata.sql index 9f85ca828..343e98d35 100644 --- a/dbt/include/databricks/macros/adapters/metadata.sql +++ b/dbt/include/databricks/macros/adapters/metadata.sql @@ -69,4 +69,12 @@ {{ return(load_result('last_modified')) }} +{% endmacro %} + +{% macro get_view_description(relation) %} + {% call statement('get_view_description', fetch_result=True) -%} + select * from {{ relation.information_schema() 
}}.`views` where table_schema = '{{ relation.schema }}' and table_name = '{{ relation.identifier }}' + {% endcall %} + + {% do return(load_result('get_view_description').table) %} {% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/materializations/materialized_view.sql b/dbt/include/databricks/macros/materializations/materialized_view.sql index 3a74f1ce3..8fb55f306 100644 --- a/dbt/include/databricks/macros/materializations/materialized_view.sql +++ b/dbt/include/databricks/macros/materializations/materialized_view.sql @@ -35,12 +35,12 @@ {% set configuration_changes = get_materialized_view_configuration_changes(existing_relation, config) %} {% if configuration_changes is none %} - {% set build_sql = [refresh_materialized_view(target_relation)] %} + {% set build_sql = refresh_materialized_view(target_relation) %} {% elif on_configuration_change == 'apply' %} {% set build_sql = get_alter_materialized_view_as_sql(target_relation, configuration_changes, sql, existing_relation, None, None) %} {% elif on_configuration_change == 'continue' %} - {% set build_sql = [] %} + {% set build_sql = "" %} {{ exceptions.warn("Configuration changes were identified and `on_configuration_change` was set to `continue` for `" ~ target_relation ~ "`") }} {% elif on_configuration_change == 'fail' %} {{ exceptions.raise_fail_fast_error("Configuration changes were identified and `on_configuration_change` was set to `fail` for `" ~ target_relation ~ "`") }} @@ -75,11 +75,18 @@ {% set grant_config = config.get('grants') %} - {%- for sql in build_sql %} + {%- if build_sql is string %} {% call statement(name="main") %} {{ build_sql }} {% endcall %} - {% endfor %} + {%- else %} + {%- for sql in build_sql %} + {% call statement(name="main") %} + {{ sql }} + {% endcall %} + {% endfor %} + {% endif %} + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) 
%} diff --git a/dbt/include/databricks/macros/relations/drop.sql b/dbt/include/databricks/macros/relations/drop.sql index d0e688d44..c09220874 100644 --- a/dbt/include/databricks/macros/relations/drop.sql +++ b/dbt/include/databricks/macros/relations/drop.sql @@ -1,3 +1,9 @@ +{% macro databricks__drop_relation(relation) -%} + {% call statement('drop_relation', auto_begin=False) -%} + {{ get_drop_sql(relation) }} + {%- endcall %} +{% endmacro %} + {% macro databricks__get_drop_sql(relation) -%} {%- if relation.is_materialized_view -%} {{ drop_materialized_view(relation) }} diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql b/dbt/include/databricks/macros/relations/materialized_view/alter.sql index 038c6cd08..797baee86 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/alter.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/alter.sql @@ -1,3 +1,23 @@ +{% macro get_alter_materialized_view_as_sql( + relation, + configuration_changes, + sql, + existing_relation, + backup_relation, + intermediate_relation +) %} + {{- log('Applying ALTER to: ' ~ relation) -}} + {{- return(adapter.dispatch('get_alter_materialized_view_as_sql', 'dbt')( + relation, + configuration_changes, + sql, + existing_relation, + backup_relation, + intermediate_relation + )) -}} +{% endmacro %} + + {% macro databricks__get_materialized_view_configuration_changes(existing_relation, new_config) -%} {% set _existing_materialized_view = adapter.get_relation_config(existing_relation) %} {%- set materialized_view = adapter.materialized_view_config_from_model(config.model) -%} @@ -15,17 +35,14 @@ ) %} -- apply a full refresh immediately if needed {% if configuration_changes.requires_full_refresh %} - - {{ get_replace_sql(existing_relation, relation, sql) }} + {{ return(databricks__get_replace_sql(existing_relation, relation, sql)) }} -- otherwise apply individual changes as needed {% else %} - - {%- set autorefresh = 
configuration_changes.autorefresh -%} - {%- if autorefresh -%}{{- log('Applying UPDATE AUTOREFRESH to: ' ~ relation) -}}{%- endif -%} - - alter materialized view {{ relation }} - auto refresh {% if autorefresh.context %}yes{% else %}no{% endif %} - + {% set changes = [] %} + {% for clause in configuration_changes.get_alter_sql_clauses() %} + {% do changes.append("alter materialized view " ~ relation ~ " " ~ clause) %} + {% endfor %} + {{ return(changes) }} {%- endif -%} {% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/materialized_view/create.sql b/dbt/include/databricks/macros/relations/materialized_view/create.sql index 267c3a575..9f475c697 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/create.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/create.sql @@ -1,6 +1,6 @@ {% macro databricks__get_create_materialized_view_as_sql(relation, sql) -%} - {%- set materialized_view = adapter.materialized_view_from_model(config.model) -%} + {%- set materialized_view = adapter.materialized_view_config_from_model(config.model) -%} create materialized view {{ relation }} {{ materialized_view.partition_by.to_sql_clause() }} diff --git a/dbt/include/databricks/macros/relations/replace.sql b/dbt/include/databricks/macros/relations/replace.sql new file mode 100644 index 000000000..af6f41e7a --- /dev/null +++ b/dbt/include/databricks/macros/relations/replace.sql @@ -0,0 +1,56 @@ +{% macro get_replace_sql(existing_relation, target_relation, sql) %} + {{- log('Applying REPLACE to: ' ~ existing_relation) -}} + {{- return(adapter.dispatch('get_replace_sql', 'dbt')(existing_relation, target_relation, sql)) -}} +{% endmacro %} + +{% macro databricks__get_replace_sql(existing_relation, target_relation, sql) %} + + {# /* use a create or replace statement if possible */ #} + + {% set is_replaceable = existing_relation.type == target_relation_type and existing_relation.can_be_replaced %} + + {% if 
is_replaceable and existing_relation.is_view %} + {{ get_replace_view_sql(target_relation, sql) }} + + {% elif is_replaceable and existing_relation.is_table %} + {{ get_replace_table_sql(target_relation, sql) }} + + {% elif is_replaceable and existing_relation.is_materialized_view %} + {{ get_replace_materialized_view_sql(target_relation, sql) }} + + {# /* a create or replace statement is not possible, so try to stage and/or backup to be safe */ #} + + {# /* create target_relation as an intermediate relation, then swap it out with the existing one using a backup */ #} + {%- elif target_relation.can_be_renamed and existing_relation.can_be_renamed -%} + {{ return([ + get_create_intermediate_sql(target_relation, sql), + get_create_backup_sql(existing_relation), + get_rename_intermediate_sql(target_relation), + get_drop_backup_sql(existing_relation) + ]) }} + + {# /* create target_relation as an intermediate relation, then swap it out with the existing one without using a backup */ #} + {%- elif target_relation.can_be_renamed -%} + {{ return([ + get_create_intermediate_sql(target_relation, sql), + get_drop_sql(existing_relation), + get_rename_intermediate_sql(target_relation), + ]) }} + + {# /* create target_relation in place by first backing up the existing relation */ #} + {%- elif existing_relation.can_be_renamed -%} + {{ return([ + get_create_backup_sql(existing_relation), + get_create_sql(target_relation, sql), + get_drop_backup_sql(existing_relation) + ]) }} + + {# /* no renaming is allowed, so just drop and create */ #} + {%- else -%} + {{ return([ + get_drop_sql(existing_relation), + get_create_sql(target_relation, sql) + ]) }} + {%- endif -%} + +{% endmacro %} \ No newline at end of file diff --git a/tests/functional/adapter/materialized_view_tests/test_materialized_views.py b/tests/functional/adapter/materialized_view_tests/test_materialized_views.py index 54d00e81b..e8ad12531 100644 --- 
a/tests/functional/adapter/materialized_view_tests/test_materialized_views.py +++ b/tests/functional/adapter/materialized_view_tests/test_materialized_views.py @@ -1,24 +1,48 @@ -from ast import Tuple -from typing import Optional +from typing import Optional, Tuple from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic +import dbt.tests.adapter.materialized_view.basic as fixtures from dbt.adapters.base.relation import BaseRelation +import pytest + +from dbt.adapters.databricks.relation import DatabricksRelationType class TestMaterializedViewsMixin: @staticmethod - def insert_record(project, table: BaseRelation, record: Tuple[int, int]): + def insert_record(project, table: BaseRelation, record: Tuple[int, int]) -> None: project.run_sql(f"insert into {table} values {record}") @staticmethod - def refresh_materialized_view(project, materialized_view: BaseRelation): + def refresh_materialized_view(project, materialized_view: BaseRelation) -> None: project.run_sql(f"refresh materialized view {materialized_view}") @staticmethod def query_row_count(project, relation: BaseRelation) -> int: - project.run_sql(f"select count(*) from {relation}") - + return project.run_sql(f"select count(*) from {relation}", fetch="one")[0] + @staticmethod def query_relation_type(project, relation: BaseRelation) -> Optional[str]: - project.run_sql(f"select is_materialized from {relation.information_schema_only}") - -class TestMaterializedViews(MaterializedViewBasic): + table_type = project.run_sql( + f"select table_type from {relation.information_schema_only()}." + f"`tables` where table_name = '{relation.identifier}'", + fetch="one", + )[0] + if table_type == "STREAMING_TABLE": + return DatabricksRelationType.StreamingTable.value + elif table_type == "MANAGED" or table_type == "EXTERNAL": + return DatabricksRelationType.Table.value + else: + is_materialized = project.run_sql( + f"select is_materialized from {relation.information_schema_only()}." 
+ f"`views` where table_name = '{relation.identifier}'", + fetch="one", + )[0] + if is_materialized == "TRUE": + return DatabricksRelationType.MaterializedView.value + else: + return DatabricksRelationType.View.value + + +@pytest.mark.skip_profile("databricks_cluster") +class TestMaterializedViews(TestMaterializedViewsMixin, MaterializedViewBasic): + pass diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 9beb93ef6..a67179726 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,6 +1,8 @@ import unittest -from dbt.adapters.databricks.utils import redact_credentials, remove_ansi +import pytest +from agate import Table, Row +from dbt.adapters.databricks.utils import redact_credentials, remove_ansi, get_first_row class TestDatabricksUtils(unittest.TestCase): @@ -87,3 +89,14 @@ def test_remove_ansi(self): 72 # how to execute python model in notebook """ self.assertEqual(remove_ansi(test_string), expected_string) + + @pytest.mark.parametrize( + "input,expected", + [ + (Table([]), Row(set())), + (Table([Row(["first", "row"]), Row(["second", "row"])]), Row(["first", "row"])), + ], + ) + def test_get_first_row(self, input, expected): + first_row = get_first_row(input) + assert first_row == expected From 8d79d5ac6f9ff973c439e8792a31bec3349bcbdc Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 20 Dec 2023 13:06:41 -0800 Subject: [PATCH 16/35] all functional tests passing --- .../databricks/relation_configs/base.py | 2 +- .../relation_configs/materialized_view.py | 3 + .../relation_configs/tblproperties.py | 5 +- .../relations/materialized_view/alter.sql | 3 +- .../databricks/macros/relations/replace.sql | 2 +- .../materialized_view_tests/fixtures.py | 46 +++++++++ .../materialized_view_tests/test_basic.py | 29 ++++++ .../materialized_view_tests/test_changes.py | 93 +++++++++++++++++++ .../test_materialized_views.py | 48 ---------- .../unit/relation_configs/test_config_base.py | 13 +-- tests/unit/test_utils.py | 21 +++-- 11 files 
changed, 191 insertions(+), 74 deletions(-) create mode 100644 tests/functional/adapter/materialized_view_tests/fixtures.py create mode 100644 tests/functional/adapter/materialized_view_tests/test_basic.py create mode 100644 tests/functional/adapter/materialized_view_tests/test_changes.py delete mode 100644 tests/functional/adapter/materialized_view_tests/test_materialized_views.py diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index bc73c7b07..6ed6e97d3 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -46,7 +46,7 @@ def requires_full_refresh(self) -> bool: return not isinstance(self.context, DatabricksAlterableComponentConfig) @classmethod - def get_change(cls, new: Component, existing: Component) -> Optional[Self]: + def get_change(cls, existing: Component, new: Component) -> Optional[Self]: if new != existing: return cls(RelationConfigChangeAction.alter, new.get_diff(existing)) return None diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py index 6f0c888e4..6a4631c77 100644 --- a/dbt/adapters/databricks/relation_configs/materialized_view.py +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -55,6 +55,9 @@ def get_changeset( changes: List[Optional[DatabricksConfigChange]] = [] changes.append(DatabricksConfigChange.get_change(existing.partition_by, self.partition_by)) + changes.append( + DatabricksConfigChange.get_change(existing.tblproperties, self.tblproperties) + ) changes.append(DatabricksConfigChange.get_change(existing.comment, self.comment)) changes.append(DatabricksConfigChange.get_change(existing.refresh, self.refresh)) changes.append(DatabricksConfigChange.get_change(existing.query, self.query)) diff --git a/dbt/adapters/databricks/relation_configs/tblproperties.py 
b/dbt/adapters/databricks/relation_configs/tblproperties.py index 62ee1453e..7de606ca0 100644 --- a/dbt/adapters/databricks/relation_configs/tblproperties.py +++ b/dbt/adapters/databricks/relation_configs/tblproperties.py @@ -3,7 +3,6 @@ from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( - DatabricksAlterableComponentConfig, DatabricksComponentConfig, DatabricksComponentProcessor, ) @@ -12,7 +11,7 @@ @dataclass(frozen=True) -class TblPropertiesConfig(DatabricksAlterableComponentConfig): +class TblPropertiesConfig(DatabricksComponentConfig): tblproperties: Dict[str, str] = field(default_factory=dict) ignore_list: List[str] = field(default_factory=list) to_unset: List[str] = field(default_factory=list) @@ -31,6 +30,8 @@ def to_sql_clause(self) -> str: return "" def to_alter_sql_clauses(self) -> List[str]: + """For now this is not called because MVs do not currently allow altering tblproperties. + When that changes, switch to Alterable config to start invoking this method.""" clauses = [] without_ignore_list = self._without_ignore_list(self.tblproperties) if without_ignore_list: diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql b/dbt/include/databricks/macros/relations/materialized_view/alter.sql index 797baee86..99dada18c 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/alter.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/alter.sql @@ -35,7 +35,7 @@ ) %} -- apply a full refresh immediately if needed {% if configuration_changes.requires_full_refresh %} - {{ return(databricks__get_replace_sql(existing_relation, relation, sql)) }} + {{ return(get_replace_sql(existing_relation, relation, sql)) }} -- otherwise apply individual changes as needed {% else %} @@ -43,6 +43,7 @@ {% for clause in configuration_changes.get_alter_sql_clauses() %} {% do changes.append("alter materialized view " ~ relation ~ " " ~ clause) %} {% endfor %} + {{ 
log(changes) }} {{ return(changes) }} {%- endif -%} {% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/replace.sql b/dbt/include/databricks/macros/relations/replace.sql index af6f41e7a..4d853ba93 100644 --- a/dbt/include/databricks/macros/relations/replace.sql +++ b/dbt/include/databricks/macros/relations/replace.sql @@ -4,7 +4,7 @@ {% endmacro %} {% macro databricks__get_replace_sql(existing_relation, target_relation, sql) %} - + {{ log('in get replace')}} {# /* use a create or replace statement if possible */ #} {% set is_replaceable = existing_relation.type == target_relation_type and existing_relation.can_be_replaced %} diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py new file mode 100644 index 000000000..34e3003aa --- /dev/null +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -0,0 +1,46 @@ +from typing import Optional +from dbt.adapters.base import BaseRelation + +from dbt.adapters.databricks.relation import DatabricksRelationType + + +def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + table_type = project.run_sql( + f"select table_type from {relation.information_schema_only()}." + f"`tables` where table_name = '{relation.identifier}'", + fetch="one", + )[0] + if table_type == "STREAMING_TABLE": + return DatabricksRelationType.StreamingTable.value + elif table_type == "MANAGED" or table_type == "EXTERNAL": + return DatabricksRelationType.Table.value + else: + is_materialized = project.run_sql( + f"select is_materialized from {relation.information_schema_only()}." 
+ f"`views` where table_name = '{relation.identifier}'", + fetch="one", + )[0] + if is_materialized == "TRUE": + return DatabricksRelationType.MaterializedView.value + else: + return DatabricksRelationType.View.value + + +materialized_view = """ +{{ config( + materialized='materialized_view', + description='this is a materialized view', + partition_by='id', + schedule = { + 'cron': '0 0 * * * ? *', + 'time_zone': 'Etc/UTC' + }, + tblproperties_config = { + 'tblproperties': { + 'key': 'value' + }, + 'ignore_list': ['pipelines.pipelineId'] + } +) }} +select * from {{ ref('my_seed') }} +""" diff --git a/tests/functional/adapter/materialized_view_tests/test_basic.py b/tests/functional/adapter/materialized_view_tests/test_basic.py new file mode 100644 index 000000000..e3604ba09 --- /dev/null +++ b/tests/functional/adapter/materialized_view_tests/test_basic.py @@ -0,0 +1,29 @@ +from typing import Optional, Tuple +from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic +from dbt.adapters.base.relation import BaseRelation +import pytest + +from tests.functional.adapter.materialized_view_tests import fixtures + + +class TestMaterializedViewsMixin: + @staticmethod + def insert_record(project, table: BaseRelation, record: Tuple[int, int]) -> None: + project.run_sql(f"insert into {table} values {record}") + + @staticmethod + def refresh_materialized_view(project, materialized_view: BaseRelation) -> None: + project.run_sql(f"refresh materialized view {materialized_view}") + + @staticmethod + def query_row_count(project, relation: BaseRelation) -> int: + return project.run_sql(f"select count(*) from {relation}", fetch="one")[0] + + @staticmethod + def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + return fixtures.query_relation_type(project, relation) + + +@pytest.mark.skip_profile("databricks_cluster", "databricks_uc_cluster") +class TestMaterializedViews(TestMaterializedViewsMixin, MaterializedViewBasic): + pass diff --git 
a/tests/functional/adapter/materialized_view_tests/test_changes.py b/tests/functional/adapter/materialized_view_tests/test_changes.py new file mode 100644 index 000000000..ec3dd577a --- /dev/null +++ b/tests/functional/adapter/materialized_view_tests/test_changes.py @@ -0,0 +1,93 @@ +from typing import Optional +from dbt.tests.adapter.materialized_view.changes import ( + MaterializedViewChanges, + MaterializedViewChangesApplyMixin, + MaterializedViewChangesContinueMixin, + MaterializedViewChangesFailMixin, +) +from dbt.adapters.base import BaseRelation +from dbt.tests import util +import pytest +from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig +from dbt.adapters.databricks.relation_configs.refresh import ScheduledRefreshConfig +from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig + +from tests.functional.adapter.materialized_view_tests import fixtures + + +def _check_tblproperties(tblproperties: TblPropertiesConfig, expected: dict): + assert tblproperties._without_ignore_list(tblproperties.tblproperties) == expected + + +class MaterializedViewChangesMixin(MaterializedViewChanges): + @pytest.fixture(scope="class", autouse=True) + def models(self): + return {"my_materialized_view.sql": fixtures.materialized_view} + + @staticmethod + def check_start_state(project, materialized_view): + with util.get_connection(project.adapter): + results = project.adapter.get_relation_config(materialized_view) + assert isinstance(results, MaterializedViewConfig) + assert results.partition_by.partition_by == ["id"] + assert results.query.query.startswith("select * from") + _check_tblproperties(results.tblproperties, {"key": "value"}) + assert isinstance(results.refresh, ScheduledRefreshConfig) + assert results.refresh.cron == "0 0 * * * ? 
*" + assert results.refresh.time_zone_value == "Etc/UTC" + + @staticmethod + def change_config_via_alter(project, materialized_view): + initial_model = util.get_model_file(project, materialized_view) + new_model = initial_model.replace("'cron': '0 0 * * * ? *'", "'cron': '0 5 * * * ? *'") + util.set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_alter_change_is_applied(project, materialized_view): + with util.get_connection(project.adapter): + results = project.adapter.get_relation_config(materialized_view) + assert isinstance(results, MaterializedViewConfig) + assert isinstance(results.refresh, ScheduledRefreshConfig) + assert results.refresh.cron == "0 5 * * * ? *" + assert results.refresh.time_zone_value == "Etc/UTC" + + @staticmethod + def change_config_via_replace(project, materialized_view): + initial_model = util.get_model_file(project, materialized_view) + new_model = ( + initial_model.replace("partition_by='id',", "") + .replace("select *", "select id, value") + .replace("'key': 'value'", "'other': 'other'") + ) + util.set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_replace_change_is_applied(project, materialized_view): + with util.get_connection(project.adapter): + results = project.adapter.get_relation_config(materialized_view) + assert isinstance(results, MaterializedViewConfig) + assert results.partition_by.partition_by == [] + assert results.query.query.startswith("select id, value") + _check_tblproperties(results.tblproperties, {"other": "other"}) + + @staticmethod + def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + return fixtures.query_relation_type(project, relation) + + +class TestMaterializedViewApplyChanges( + MaterializedViewChangesMixin, MaterializedViewChangesApplyMixin +): + pass + + +class TestMaterializedViewContinueOnChanges( + MaterializedViewChangesMixin, MaterializedViewChangesContinueMixin +): + pass + + +class 
TestMaterializedViewFailOnChanges( + MaterializedViewChangesMixin, MaterializedViewChangesFailMixin +): + pass diff --git a/tests/functional/adapter/materialized_view_tests/test_materialized_views.py b/tests/functional/adapter/materialized_view_tests/test_materialized_views.py deleted file mode 100644 index e8ad12531..000000000 --- a/tests/functional/adapter/materialized_view_tests/test_materialized_views.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Optional, Tuple -from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic -import dbt.tests.adapter.materialized_view.basic as fixtures -from dbt.adapters.base.relation import BaseRelation -import pytest - -from dbt.adapters.databricks.relation import DatabricksRelationType - - -class TestMaterializedViewsMixin: - @staticmethod - def insert_record(project, table: BaseRelation, record: Tuple[int, int]) -> None: - project.run_sql(f"insert into {table} values {record}") - - @staticmethod - def refresh_materialized_view(project, materialized_view: BaseRelation) -> None: - project.run_sql(f"refresh materialized view {materialized_view}") - - @staticmethod - def query_row_count(project, relation: BaseRelation) -> int: - return project.run_sql(f"select count(*) from {relation}", fetch="one")[0] - - @staticmethod - def query_relation_type(project, relation: BaseRelation) -> Optional[str]: - table_type = project.run_sql( - f"select table_type from {relation.information_schema_only()}." - f"`tables` where table_name = '{relation.identifier}'", - fetch="one", - )[0] - if table_type == "STREAMING_TABLE": - return DatabricksRelationType.StreamingTable.value - elif table_type == "MANAGED" or table_type == "EXTERNAL": - return DatabricksRelationType.Table.value - else: - is_materialized = project.run_sql( - f"select is_materialized from {relation.information_schema_only()}." 
- f"`views` where table_name = '{relation.identifier}'", - fetch="one", - )[0] - if is_materialized == "TRUE": - return DatabricksRelationType.MaterializedView.value - else: - return DatabricksRelationType.View.value - - -@pytest.mark.skip_profile("databricks_cluster") -class TestMaterializedViews(TestMaterializedViewsMixin, MaterializedViewBasic): - pass diff --git a/tests/unit/relation_configs/test_config_base.py b/tests/unit/relation_configs/test_config_base.py index 2e7286376..1637c065b 100644 --- a/tests/unit/relation_configs/test_config_base.py +++ b/tests/unit/relation_configs/test_config_base.py @@ -13,7 +13,6 @@ ManualRefreshConfig, ScheduledRefreshConfig, ) -from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig @dataclass(frozen=True, eq=True, unsafe_hash=True) @@ -54,7 +53,7 @@ def test_get_change__no_change(self): def test_get_change__with_diff(self): change = DatabricksConfigChange.get_change( - ManualRefreshConfig(), ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC"), ManualRefreshConfig() ) assert change == DatabricksConfigChange( RelationConfigChangeAction.alter, ManualRefreshConfig() @@ -105,15 +104,7 @@ def test_get_alter_sql_clauses__with_inalterable_change(self): def test_get_alter_sql_clauses__with_alterable_change(self): changeset = DatabricksRelationChangeSet( [ - DatabricksConfigChange( - RelationConfigChangeAction.alter, - TblPropertiesConfig(tblproperties={"key": "value"}, to_unset=["other"]), - ), DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()), ] ) - assert changeset.get_alter_sql_clauses() == [ - "SET TBLPROPERTIES ('key' = 'value')", - "UNSET TBLPROPERTIES IF EXISTS ('other')", - "DROP SCHEDULE", - ] + assert changeset.get_alter_sql_clauses() == ["DROP SCHEDULE"] diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index a67179726..32658e2c1 100644 --- a/tests/unit/test_utils.py 
+++ b/tests/unit/test_utils.py @@ -90,13 +90,14 @@ def test_remove_ansi(self): """ self.assertEqual(remove_ansi(test_string), expected_string) - @pytest.mark.parametrize( - "input,expected", - [ - (Table([]), Row(set())), - (Table([Row(["first", "row"]), Row(["second", "row"])]), Row(["first", "row"])), - ], - ) - def test_get_first_row(self, input, expected): - first_row = get_first_row(input) - assert first_row == expected + +@pytest.mark.parametrize( + "input,expected", + [ + (Table([]), Row(set())), + (Table([Row(["first", "row"]), Row(["second", "row"])]), Row(["first", "row"])), + ], +) +def test_get_first_row(input, expected): + first_row = get_first_row(input) + assert first_row == expected From ab006708792669016e8d1ea907c67811b9c27fb7 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 20 Dec 2023 13:13:32 -0800 Subject: [PATCH 17/35] wip --- .../databricks/macros/relations/materialized_view/alter.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql b/dbt/include/databricks/macros/relations/materialized_view/alter.sql index 99dada18c..3a7b5cf16 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/alter.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/alter.sql @@ -18,12 +18,12 @@ {% endmacro %} -{% macro databricks__get_materialized_view_configuration_changes(existing_relation, new_config) -%} - {% set _existing_materialized_view = adapter.get_relation_config(existing_relation) %} +{%- macro databricks__get_materialized_view_configuration_changes(existing_relation, new_config) -%} + {%- set _existing_materialized_view = adapter.get_relation_config(existing_relation) -%} {%- set materialized_view = adapter.materialized_view_config_from_model(config.model) -%} {%- set _configuration_changes = materialized_view.get_changeset(_existing_materialized_view) -%} {% do return(_configuration_changes) %} -{% endmacro -%} +{%- endmacro -%} 
{% macro databricks__get_alter_materialized_view_as_sql( relation, From d975c6c8ecae1b5e6e3d46c6a4f645ef457163cd Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 20 Dec 2023 13:22:16 -0800 Subject: [PATCH 18/35] trying to pass linter --- dbt/adapters/databricks/auth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/databricks/auth.py b/dbt/adapters/databricks/auth.py index cb8f6dcdd..c8fc0eaa5 100644 --- a/dbt/adapters/databricks/auth.py +++ b/dbt/adapters/databricks/auth.py @@ -16,12 +16,12 @@ def as_dict(self) -> dict: return {"token": self._token} @staticmethod - def from_dict(raw: Optional[dict]) -> CredentialsProvider: + def from_dict(raw: Optional[dict]) -> Optional[CredentialsProvider]: if not raw: return None return token_auth(raw["token"]) - def __call__(self, *args: tuple, **kwargs: Dict[str, Any]) -> HeaderFactory: + def __call__(self, _: Config) -> HeaderFactory: static_credentials = {"Authorization": f"Bearer {self._token}"} def inner() -> Dict[str, str]: From eda0f90e6b131dbf23f792844e6827ed7810ea67 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 20 Dec 2023 13:41:21 -0800 Subject: [PATCH 19/35] Revert "trying to pass linter" This reverts commit 663af9ba5b49b071e95d10557f878e72f048023e. 
--- dbt/adapters/databricks/auth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/databricks/auth.py b/dbt/adapters/databricks/auth.py index c8fc0eaa5..cb8f6dcdd 100644 --- a/dbt/adapters/databricks/auth.py +++ b/dbt/adapters/databricks/auth.py @@ -16,12 +16,12 @@ def as_dict(self) -> dict: return {"token": self._token} @staticmethod - def from_dict(raw: Optional[dict]) -> Optional[CredentialsProvider]: + def from_dict(raw: Optional[dict]) -> CredentialsProvider: if not raw: return None return token_auth(raw["token"]) - def __call__(self, _: Config) -> HeaderFactory: + def __call__(self, *args: tuple, **kwargs: Dict[str, Any]) -> HeaderFactory: static_credentials = {"Authorization": f"Bearer {self._token}"} def inner() -> Dict[str, str]: From daea09465546014c73ae67ea6ec061a37387d265 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 20 Dec 2023 13:58:32 -0800 Subject: [PATCH 20/35] skip mv change tests for non-sql profiles --- .../functional/adapter/materialized_view_tests/test_changes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/functional/adapter/materialized_view_tests/test_changes.py b/tests/functional/adapter/materialized_view_tests/test_changes.py index ec3dd577a..926348fb3 100644 --- a/tests/functional/adapter/materialized_view_tests/test_changes.py +++ b/tests/functional/adapter/materialized_view_tests/test_changes.py @@ -75,18 +75,21 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: return fixtures.query_relation_type(project, relation) +@pytest.mark.skip_profile("databricks_cluster", "databricks_uc_cluster") class TestMaterializedViewApplyChanges( MaterializedViewChangesMixin, MaterializedViewChangesApplyMixin ): pass +@pytest.mark.skip_profile("databricks_cluster", "databricks_uc_cluster") class TestMaterializedViewContinueOnChanges( MaterializedViewChangesMixin, MaterializedViewChangesContinueMixin ): pass +@pytest.mark.skip_profile("databricks_cluster", 
"databricks_uc_cluster") class TestMaterializedViewFailOnChanges( MaterializedViewChangesMixin, MaterializedViewChangesFailMixin ): From 8d3bab58e842f154c2f3066cc00637dfab892dd8 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 21 Dec 2023 17:30:39 -0800 Subject: [PATCH 21/35] redone, successfully --- dbt/adapters/databricks/impl.py | 4 +- .../databricks/relation_configs/base.py | 102 +++++------- .../databricks/relation_configs/comment.py | 15 +- .../relation_configs/materialized_view.py | 40 ----- .../relation_configs/partitioning.py | 21 ++- .../databricks/relation_configs/query.py | 11 +- .../databricks/relation_configs/refresh.py | 78 +++------ .../relation_configs/tblproperties.py | 111 +++---------- .../macros/relations/components/comment.sql | 5 + .../relations/components/partitioning.sql | 5 + .../relations/components/refresh_schedule.sql | 17 ++ .../relations/components/tblproperties.sql | 9 ++ .../relations/materialized_view/alter.sql | 20 +-- .../relations/materialized_view/create.sql | 14 +- .../databricks/macros/relations/replace.sql | 3 +- .../materialized_view_tests/fixtures.py | 9 +- .../materialized_view_tests/test_changes.py | 28 ++-- tests/unit/macros/relation_configs/base.py | 32 ++++ .../relation_configs/test_comment_macros.py | 20 +++ .../test_partitioning_macros.py | 24 +++ .../test_refresh_schedule_macros.py | 40 +++++ .../test_tblproperties_macros.py | 24 +++ tests/unit/relation_configs/test_comment.py | 30 ++-- .../unit/relation_configs/test_config_base.py | 81 ++-------- .../test_materialized_view_config.py | 85 ++++++++-- .../relation_configs/test_partitioning.py | 28 +--- tests/unit/relation_configs/test_query.py | 4 +- tests/unit/relation_configs/test_refresh.py | 89 +++-------- .../relation_configs/test_tblproperties.py | 148 ++---------------- 29 files changed, 451 insertions(+), 646 deletions(-) create mode 100644 dbt/include/databricks/macros/relations/components/comment.sql create mode 100644 
dbt/include/databricks/macros/relations/components/partitioning.sql create mode 100644 dbt/include/databricks/macros/relations/components/refresh_schedule.sql create mode 100644 dbt/include/databricks/macros/relations/components/tblproperties.sql create mode 100644 tests/unit/macros/relation_configs/base.py create mode 100644 tests/unit/macros/relation_configs/test_comment_macros.py create mode 100644 tests/unit/macros/relation_configs/test_partitioning_macros.py create mode 100644 tests/unit/macros/relation_configs/test_refresh_schedule_macros.py create mode 100644 tests/unit/macros/relation_configs/test_tblproperties_macros.py diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 4ca91c637..e10250c5c 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -47,7 +47,6 @@ import dbt.exceptions from dbt.events import AdapterLogger from dbt.utils import executor -from dbt.adapters.relation_configs.config_base import RelationConfigBase from dbt.adapters.databricks.column import DatabricksColumn from dbt.adapters.databricks.connections import DatabricksConnectionManager @@ -60,6 +59,7 @@ DatabricksRelation, DatabricksRelationType, ) +from dbt.adapters.databricks.relation_configs.base import DatabricksRelationConfigBase from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.utils import redact_credentials, undefined_proof, get_first_row from dbt.adapters.relation_configs.config_base import RelationResults @@ -744,7 +744,7 @@ def materialized_view_config_from_model(self, model: ModelNode) -> MaterializedV return MaterializedViewConfig.from_model_node(model) # type: ignore @available.parse_none - def get_relation_config(self, relation: DatabricksRelation) -> RelationConfigBase: + def get_relation_config(self, relation: DatabricksRelation) -> DatabricksRelationConfigBase: if relation.type == RelationType.MaterializedView: results = 
self.describe_materialized_view(relation) return MaterializedViewConfig.from_results(results) diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index 6ed6e97d3..07e0c25d3 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -1,79 +1,42 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass +from pydantic import BaseModel, ConfigDict from typing import ClassVar, Dict, Generic, List, Optional, TypeVar from typing_extensions import Self, Type -from dbt.adapters.relation_configs.config_base import RelationConfigBase, RelationResults +from dbt.adapters.relation_configs.config_base import RelationResults from dbt.contracts.graph.nodes import ModelNode -from dbt.exceptions import DbtRuntimeError -from dbt.adapters.relation_configs.config_change import ( - RelationConfigChange, - RelationConfigChangeAction, -) - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksComponentConfig(ABC): - @abstractmethod - def to_sql_clause(self) -> str: - raise NotImplementedError("Must be implemented by subclass") - - def get_diff(self, other: "DatabricksComponentConfig") -> Self: - if not isinstance(other, self.__class__): - raise DbtRuntimeError( - f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" - ) - return self - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksAlterableComponentConfig(DatabricksComponentConfig, ABC): - @abstractmethod - def to_alter_sql_clauses(self) -> List[str]: - raise NotImplementedError("Must be implemented by subclass") - - -Component = TypeVar("Component", bound=DatabricksComponentConfig) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksConfigChange(RelationConfigChange, Generic[Component]): - context: Component +class DatabricksComponentConfig(BaseModel, ABC): + model_config = ConfigDict(frozen=True) @property + @abstractmethod 
def requires_full_refresh(self) -> bool: - return not isinstance(self.context, DatabricksAlterableComponentConfig) + raise NotImplementedError("Must be implemented by subclass") - @classmethod - def get_change(cls, existing: Component, new: Component) -> Optional[Self]: - if new != existing: - return cls(RelationConfigChangeAction.alter, new.get_diff(existing)) + def get_diff(self, other: Self) -> Optional[Self]: + if self != other: + return self return None -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksRelationChangeSet: - changes: List[DatabricksConfigChange] +class DatabricksRelationChangeSet(BaseModel): + model_config = ConfigDict(frozen=True) + changes: Dict[str, DatabricksComponentConfig] @property def requires_full_refresh(self) -> bool: - return any(change.requires_full_refresh for change in self.changes) + return any(change.requires_full_refresh for change in self.changes.values()) @property def has_changes(self) -> bool: return len(self.changes) > 0 - def get_alter_sql_clauses(self) -> List[str]: - assert ( - not self.requires_full_refresh - ), "Cannot alter a relation when changes requires a full refresh" - return [ - clause for change in self.changes for clause in change.context.to_alter_sql_clauses() - ] + +Component = TypeVar("Component", bound=DatabricksComponentConfig) -@dataclass(frozen=True, eq=True, unsafe_hash=True) class DatabricksComponentProcessor(ABC, Generic[Component]): name: ClassVar[str] @@ -88,29 +51,38 @@ def from_model_node(cls, model_node: ModelNode) -> Component: raise NotImplementedError("Must be implemented by subclass") -T = TypeVar("T", bound=DatabricksComponentConfig) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class DatabricksRelationConfigBase(RelationConfigBase): +class DatabricksRelationConfigBase(BaseModel, ABC): config_components: ClassVar[List[Type[DatabricksComponentProcessor]]] + config: Dict[str, DatabricksComponentConfig] @classmethod - def from_model_node(cls, model_node: 
ModelNode) -> RelationConfigBase: + def from_model_node(cls, model_node: ModelNode) -> Self: config_dict: Dict[str, DatabricksComponentConfig] = {} for component in cls.config_components: - config_dict[component.name] = component.from_model_node(model_node) + model_component = component.from_model_node(model_node) + if model_component: + config_dict[component.name] = model_component - return cls.from_dict(config_dict) + return cls(config=config_dict) @classmethod - def from_results(cls, results: RelationResults) -> RelationConfigBase: + def from_results(cls, results: RelationResults) -> Self: config_dict: Dict[str, DatabricksComponentConfig] = {} for component in cls.config_components: - config_dict[component.name] = component.from_results(results) + result_component = component.from_results(results) + if result_component: + config_dict[component.name] = result_component - return cls.from_dict(config_dict) + return cls(config=config_dict) - @abstractmethod def get_changeset(self, existing: Self) -> Optional[DatabricksRelationChangeSet]: - raise NotImplementedError("Must be implemented by subclass") + changes = {} + + for key, value in self.config.items(): + diff = value.get_diff(existing.config[key]) + if diff: + changes[key] = diff + + if len(changes) > 0: + return DatabricksRelationChangeSet(changes=changes) + return None diff --git a/dbt/adapters/databricks/relation_configs/comment.py b/dbt/adapters/databricks/relation_configs/comment.py index 40d6eb426..4f88db77d 100644 --- a/dbt/adapters/databricks/relation_configs/comment.py +++ b/dbt/adapters/databricks/relation_configs/comment.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass from typing import Optional, ClassVar from dbt.contracts.graph.nodes import ModelNode @@ -9,14 +8,12 @@ from dbt.adapters.relation_configs.config_base import RelationResults -@dataclass(frozen=True, eq=True, unsafe_hash=True) class CommentConfig(DatabricksComponentConfig): comment: Optional[str] = None - def to_sql_clause(self) -> 
str: - if self.comment: - return f"COMMENT '{self.comment}'" - return "" + @property + def requires_full_refresh(self) -> bool: + return True class CommentProcessor(DatabricksComponentProcessor[CommentConfig]): @@ -27,9 +24,11 @@ def from_results(cls, results: RelationResults) -> CommentConfig: table = results["describe_extended"] for row in table.rows: if row[0] == "Comment": - return CommentConfig(row[1]) + return CommentConfig(comment=row[1]) return CommentConfig() @classmethod def from_model_node(cls, model_node: ModelNode) -> CommentConfig: - return CommentConfig(model_node.description) + if model_node.description is not None: + return CommentConfig(comment=model_node.description) + return CommentConfig() diff --git a/dbt/adapters/databricks/relation_configs/materialized_view.py b/dbt/adapters/databricks/relation_configs/materialized_view.py index 6a4631c77..46fd1627b 100644 --- a/dbt/adapters/databricks/relation_configs/materialized_view.py +++ b/dbt/adapters/databricks/relation_configs/materialized_view.py @@ -1,35 +1,23 @@ -from dataclasses import dataclass -from typing import List, Optional - -from dbt.exceptions import DbtRuntimeError from dbt.adapters.databricks.relation_configs.base import ( - DatabricksConfigChange, - DatabricksRelationChangeSet, DatabricksRelationConfigBase, ) from dbt.adapters.databricks.relation_configs.comment import ( - CommentConfig, CommentProcessor, ) from dbt.adapters.databricks.relation_configs.partitioning import ( - PartitionedByConfig, PartitionedByProcessor, ) from dbt.adapters.databricks.relation_configs.query import ( - QueryConfig, QueryProcessor, ) from dbt.adapters.databricks.relation_configs.refresh import ( - RefreshConfig, RefreshProcessor, ) from dbt.adapters.databricks.relation_configs.tblproperties import ( - TblPropertiesConfig, TblPropertiesProcessor, ) -@dataclass(frozen=True, eq=True, unsafe_hash=True) class MaterializedViewConfig(DatabricksRelationConfigBase): config_components = [ PartitionedByProcessor, 
@@ -38,31 +26,3 @@ class MaterializedViewConfig(DatabricksRelationConfigBase): RefreshProcessor, QueryProcessor, ] - - partition_by: PartitionedByConfig - comment: CommentConfig - tblproperties: TblPropertiesConfig - refresh: RefreshConfig - query: QueryConfig - - def get_changeset( - self, existing: DatabricksRelationConfigBase - ) -> Optional[DatabricksRelationChangeSet]: - if not isinstance(existing, MaterializedViewConfig): - raise DbtRuntimeError( - f"Invalid comparison between MaterializedViewConfig and {type(existing)}" - ) - - changes: List[Optional[DatabricksConfigChange]] = [] - changes.append(DatabricksConfigChange.get_change(existing.partition_by, self.partition_by)) - changes.append( - DatabricksConfigChange.get_change(existing.tblproperties, self.tblproperties) - ) - changes.append(DatabricksConfigChange.get_change(existing.comment, self.comment)) - changes.append(DatabricksConfigChange.get_change(existing.refresh, self.refresh)) - changes.append(DatabricksConfigChange.get_change(existing.query, self.query)) - - trimmed_changes = [change for change in changes if change] - if trimmed_changes: - return DatabricksRelationChangeSet(trimmed_changes) - return None diff --git a/dbt/adapters/databricks/relation_configs/partitioning.py b/dbt/adapters/databricks/relation_configs/partitioning.py index 4d7e18493..8d58a5cef 100644 --- a/dbt/adapters/databricks/relation_configs/partitioning.py +++ b/dbt/adapters/databricks/relation_configs/partitioning.py @@ -1,6 +1,5 @@ -from dataclasses import dataclass import itertools -from typing import ClassVar, List, Optional +from typing import ClassVar, List from dbt.adapters.relation_configs.config_base import RelationResults from dbt.contracts.graph.nodes import ModelNode @@ -10,14 +9,12 @@ ) -@dataclass(frozen=True, eq=True, unsafe_hash=True) class PartitionedByConfig(DatabricksComponentConfig): - partition_by: Optional[List[str]] = None + partition_by: List[str] - def to_sql_clause(self) -> str: - if self.partition_by: 
- return f"PARTITIONED BY ({', '.join(self.partition_by)})" - return "" + @property + def requires_full_refresh(self) -> bool: + return True class PartitionedByProcessor(DatabricksComponentProcessor): @@ -35,11 +32,13 @@ def from_results(cls, results: RelationResults) -> PartitionedByConfig: if not row[0].startswith("# "): cols.append(row[0]) - return PartitionedByConfig(cols) + return PartitionedByConfig(partition_by=cols) @classmethod def from_model_node(cls, model_node: ModelNode) -> PartitionedByConfig: partition_by = model_node.config.extra.get("partition_by") if isinstance(partition_by, str): - return PartitionedByConfig([partition_by]) - return PartitionedByConfig(partition_by) + return PartitionedByConfig(partition_by=[partition_by]) + if not partition_by: + return PartitionedByConfig(partition_by=[]) + return PartitionedByConfig(partition_by=partition_by) diff --git a/dbt/adapters/databricks/relation_configs/query.py b/dbt/adapters/databricks/relation_configs/query.py index c99c9337d..a96cb552e 100644 --- a/dbt/adapters/databricks/relation_configs/query.py +++ b/dbt/adapters/databricks/relation_configs/query.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass from typing import ClassVar from dbt.adapters.relation_configs.config_base import RelationResults from dbt.contracts.graph.nodes import ModelNode @@ -9,12 +8,12 @@ from dbt.exceptions import DbtRuntimeError -@dataclass(frozen=True, eq=True, unsafe_hash=True) class QueryConfig(DatabricksComponentConfig): query: str - def to_sql_clause(self) -> str: - return self.query + @property + def requires_full_refresh(self) -> bool: + return True class QueryProcessor(DatabricksComponentProcessor[QueryConfig]): @@ -23,13 +22,13 @@ class QueryProcessor(DatabricksComponentProcessor[QueryConfig]): @classmethod def from_results(cls, result: RelationResults) -> QueryConfig: row = result["information_schema.views"] - return QueryConfig(row["view_definition"]) + return QueryConfig(query=row["view_definition"]) 
@classmethod def from_model_node(cls, model_node: ModelNode) -> QueryConfig: query = model_node.compiled_code if query: - return QueryConfig(query.strip()) + return QueryConfig(query=query.strip()) else: raise DbtRuntimeError(f"Cannot compile model {model_node.unique_id} with no SQL query") diff --git a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py index 54ef3c96f..377b2c5f6 100644 --- a/dbt/adapters/databricks/relation_configs/refresh.py +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -1,14 +1,11 @@ -from abc import ABC -from dataclasses import dataclass import re -from typing import ClassVar, List, Optional +from typing import ClassVar, Optional from dbt.adapters.relation_configs.config_base import RelationResults from dbt.exceptions import DbtRuntimeError from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( - DatabricksAlterableComponentConfig, DatabricksComponentConfig, DatabricksComponentProcessor, ) @@ -16,60 +13,25 @@ SCHEDULE_REGEX = re.compile(r"CRON '(.*)' AT TIME ZONE '(.*)'") -class RefreshConfig(DatabricksAlterableComponentConfig, ABC): - pass - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class ScheduledRefreshConfig(RefreshConfig): - cron: str +class RefreshConfig(DatabricksComponentConfig): + cron: Optional[str] = None time_zone_value: Optional[str] = None - alter: bool = False - - def to_sql_clause(self) -> str: - schedule = f"SCHEDULE CRON '{self.cron}'" - - if self.time_zone_value: - schedule += f" AT TIME ZONE '{self.time_zone_value}'" - - return schedule - - def to_alter_sql_clauses(self) -> List[str]: - prefix = "ALTER " if self.alter else "ADD " - - return [prefix + self.to_sql_clause()] - - def get_diff(self, other: DatabricksComponentConfig) -> "ScheduledRefreshConfig": - if not isinstance(other, RefreshConfig): - raise DbtRuntimeError( - f"Cannot diff {self.__class__.__name__} with 
{other.__class__.__name__}" + is_altered: bool = False + + @property + def requires_full_refresh(self) -> bool: + return False + + def get_diff(self, other: "RefreshConfig") -> Optional["RefreshConfig"]: + if self != other: + return RefreshConfig( + cron=self.cron, + time_zone_value=self.time_zone_value, + is_altered=self.cron is not None and other.cron is not None, ) - - return ScheduledRefreshConfig( - cron=self.cron, - time_zone_value=self.time_zone_value, - alter=isinstance(other, ScheduledRefreshConfig), - ) - - -@dataclass(frozen=True, eq=True, unsafe_hash=True) -class ManualRefreshConfig(RefreshConfig): - def to_sql_clause(self) -> str: - return "" - - def to_alter_sql_clauses(self) -> List[str]: - return ["DROP SCHEDULE"] - - def get_diff(self, other: DatabricksComponentConfig) -> "ManualRefreshConfig": - if not isinstance(other, RefreshConfig): - raise DbtRuntimeError( - f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" - ) - - return ManualRefreshConfig() + return None -@dataclass(frozen=True, eq=True, unsafe_hash=True) class RefreshProcessor(DatabricksComponentProcessor[RefreshConfig]): name: ClassVar[str] = "refresh" @@ -79,13 +41,13 @@ def from_results(cls, results: RelationResults) -> RefreshConfig: for row in table.rows: if row[0] == "Refresh Schedule": if row[1] == "MANUAL": - return ManualRefreshConfig() + return RefreshConfig() match = SCHEDULE_REGEX.match(row[1]) if match: cron, time_zone_value = match.groups() - return ScheduledRefreshConfig(cron=cron, time_zone_value=time_zone_value) + return RefreshConfig(cron=cron, time_zone_value=time_zone_value) raise DbtRuntimeError( f"Could not parse schedule from description: {row[1]}." 
@@ -104,8 +66,8 @@ def from_model_node(cls, model_node: ModelNode) -> RefreshConfig: if schedule: if "cron" not in schedule: raise DbtRuntimeError(f"Schedule config must contain a 'cron' key, got {schedule}") - return ScheduledRefreshConfig( + return RefreshConfig( cron=schedule["cron"], time_zone_value=schedule.get("time_zone_value") ) else: - return ManualRefreshConfig() + return RefreshConfig() diff --git a/dbt/adapters/databricks/relation_configs/tblproperties.py b/dbt/adapters/databricks/relation_configs/tblproperties.py index 7de606ca0..b4ceefc57 100644 --- a/dbt/adapters/databricks/relation_configs/tblproperties.py +++ b/dbt/adapters/databricks/relation_configs/tblproperties.py @@ -1,5 +1,4 @@ -from dataclasses import dataclass, field -from typing import Any, Dict, List, ClassVar +from typing import Any, Dict, ClassVar, List from dbt.contracts.graph.nodes import ModelNode from dbt.adapters.databricks.relation_configs.base import ( @@ -10,76 +9,23 @@ from dbt.exceptions import DbtRuntimeError -@dataclass(frozen=True) class TblPropertiesConfig(DatabricksComponentConfig): - tblproperties: Dict[str, str] = field(default_factory=dict) - ignore_list: List[str] = field(default_factory=list) - to_unset: List[str] = field(default_factory=list) + tblproperties: Dict[str, str] + ignore_list: List[str] = ["pipelines.pipelineId"] - def __post_init__(self) -> None: - if any([key in self.tblproperties for key in self.to_unset]): - raise DbtRuntimeError("Cannot set and unset the same tblproperty in the same model") + @property + def requires_full_refresh(self) -> bool: + return True - def _format_tblproperties(self, to_format: Dict[str, str]) -> str: - return ", ".join(f"'{k}' = '{v}'" for k, v in to_format.items()) - - def to_sql_clause(self) -> str: - without_ignore_list = self._without_ignore_list(self.tblproperties) - if without_ignore_list: - return f"TBLPROPERTIES ({self._format_tblproperties(without_ignore_list)})" - return "" - - def to_alter_sql_clauses(self) -> 
List[str]: - """For now this is not called because MVs do not currently allow altering tblproperties. - When that changes, switch to Alterable config to start invoking this method.""" - clauses = [] - without_ignore_list = self._without_ignore_list(self.tblproperties) - if without_ignore_list: - clauses.append(f"SET TBLPROPERTIES ({self._format_tblproperties(without_ignore_list)})") - if self.to_unset: - to_unset = ", ".join(f"'{k}'" for k in self.to_unset) - clauses.append(f"UNSET TBLPROPERTIES IF EXISTS ({to_unset})") - return clauses - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, TblPropertiesConfig): + def __eq__(self, __value: Any) -> bool: + if not isinstance(__value, TblPropertiesConfig): return False - return ( - self._without_ignore_list(self.tblproperties) - == self._without_ignore_list(other.tblproperties) - and self.to_unset == other.to_unset - ) - - def _without_ignore_list(self, tblproperties: Dict[str, str]) -> Dict[str, str]: - if tblproperties: - ignore_list = self.ignore_list or ["pipelines.pipelineId"] - return {k: v for k, v in tblproperties.items() if k not in ignore_list} - return tblproperties - - def get_diff(self, other: DatabricksComponentConfig) -> "TblPropertiesConfig": - if not isinstance(other, TblPropertiesConfig): - raise DbtRuntimeError( - f"Cannot diff {self.__class__.__name__} with {other.__class__.__name__}" - ) - tblproperties = self._without_ignore_list(self.tblproperties) - other_tblproperties = self._without_ignore_list(other.tblproperties) + def _without_ignore_list(d: Dict[str, str]) -> Dict[str, str]: + return {k: v for k, v in d.items() if k not in self.ignore_list} - tblproperties = { - k: v - for k, v in tblproperties.items() - if k not in other_tblproperties or v != other_tblproperties[k] - } - - to_unset = [] - for k in other_tblproperties.keys(): - if k not in self.tblproperties: - to_unset.append(k) - - return TblPropertiesConfig( - tblproperties=tblproperties, - ignore_list=self.ignore_list, - 
to_unset=to_unset, + return _without_ignore_list(self.tblproperties) == _without_ignore_list( + __value.tblproperties ) @@ -95,30 +41,15 @@ def from_results(cls, results: RelationResults) -> TblPropertiesConfig: for row in table.rows: tblproperties[str(row[0])] = str(row[1]) - return TblPropertiesConfig(tblproperties) + return TblPropertiesConfig(tblproperties=tblproperties) @classmethod def from_model_node(cls, model_node: ModelNode) -> TblPropertiesConfig: - tblproperties_config = model_node.config.extra.get("tblproperties_config") - if tblproperties_config: - ignore_list = tblproperties_config.get("ignore_list") - tblproperties = tblproperties_config.get("tblproperties") - if isinstance(tblproperties, Dict): - tblproperties = {str(k): str(v) for k, v in tblproperties.items()} - elif tblproperties is None: - tblproperties = dict() - else: - raise DbtRuntimeError( - "If tblproperties_config is set, tblproperties must be a dictionary" - ) - - if ignore_list is None: - return TblPropertiesConfig(tblproperties) - - if isinstance(ignore_list, List): - ignore_list = [str(i) for i in ignore_list] - return TblPropertiesConfig(tblproperties, ignore_list) - else: - raise DbtRuntimeError("ignore_list must be a list") - - return TblPropertiesConfig() + tblproperties = model_node.config.extra.get("tblproperties") + if not tblproperties: + return TblPropertiesConfig(tblproperties=dict()) + if isinstance(tblproperties, Dict): + tblproperties = {str(k): str(v) for k, v in tblproperties.items()} + return TblPropertiesConfig(tblproperties=tblproperties) + else: + raise DbtRuntimeError("tblproperties must be a dictionary") diff --git a/dbt/include/databricks/macros/relations/components/comment.sql b/dbt/include/databricks/macros/relations/components/comment.sql new file mode 100644 index 000000000..86f26a6b0 --- /dev/null +++ b/dbt/include/databricks/macros/relations/components/comment.sql @@ -0,0 +1,5 @@ +{%- macro get_create_sql_comment(comment) -%} +{% if comment is string -%} + 
COMMENT '{{ comment }}' +{%- endif -%} +{%- endmacro -%} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/components/partitioning.sql b/dbt/include/databricks/macros/relations/components/partitioning.sql new file mode 100644 index 000000000..a5597f719 --- /dev/null +++ b/dbt/include/databricks/macros/relations/components/partitioning.sql @@ -0,0 +1,5 @@ +{% macro get_create_sql_partition_by(partition_by) -%} +{%- if partition_by -%} + PARTITIONED BY ({%- for col in partition_by -%}{{ col }}{% if not loop.last %}, {% endif %}{%- endfor %}) +{%- endif -%} +{%- endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/components/refresh_schedule.sql b/dbt/include/databricks/macros/relations/components/refresh_schedule.sql new file mode 100644 index 000000000..618c4a3c3 --- /dev/null +++ b/dbt/include/databricks/macros/relations/components/refresh_schedule.sql @@ -0,0 +1,17 @@ +{% macro get_create_sql_refresh_schedule(cron, time_zone_value) %} + {%- if cron -%} + SCHEDULE CRON '{{ cron }}'{%- if time_zone_value %} AT TIME ZONE '{{ time_zone_value }}'{%- endif -%} + {%- endif -%} +{% endmacro %} + +{% macro get_alter_sql_refresh_schedule(cron, time_zone_value, is_altered) %} + {%- if cron -%} + {%- if is_altered -%} + ALTER SCHEDULE CRON '{{ cron }}'{%- if time_zone_value %} AT TIME ZONE '{{ time_zone_value }}'{%- endif -%} + {%- else -%} + ADD SCHEDULE CRON '{{ cron }}'{%- if time_zone_value %} AT TIME ZONE '{{ time_zone_value }}'{%- endif -%} + {%- endif -%} + {%- else -%} + DROP SCHEDULE + {%- endif -%} +{% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/components/tblproperties.sql b/dbt/include/databricks/macros/relations/components/tblproperties.sql new file mode 100644 index 000000000..7e6c95faf --- /dev/null +++ b/dbt/include/databricks/macros/relations/components/tblproperties.sql @@ -0,0 +1,9 @@ +{% macro get_create_sql_tblproperties(tblproperties) 
%} + {%- if tblproperties and tblproperties|length>0 -%} + TBLPROPERTIES ( + {%- for prop in tblproperties -%} + '{{ prop }}' = '{{ tblproperties[prop] }}'{%- if not loop.last -%}, {% endif -%} + {% endfor -%} + ) + {%- endif -%} +{% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/materialized_view/alter.sql b/dbt/include/databricks/macros/relations/materialized_view/alter.sql index 3a7b5cf16..55a9599bf 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/alter.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/alter.sql @@ -7,14 +7,14 @@ intermediate_relation ) %} {{- log('Applying ALTER to: ' ~ relation) -}} - {{- return(adapter.dispatch('get_alter_materialized_view_as_sql', 'dbt')( + {%- do return(adapter.dispatch('get_alter_materialized_view_as_sql', 'dbt')( relation, configuration_changes, sql, existing_relation, backup_relation, intermediate_relation - )) -}} + )) -%} {% endmacro %} @@ -35,15 +35,17 @@ ) %} -- apply a full refresh immediately if needed {% if configuration_changes.requires_full_refresh %} - {{ return(get_replace_sql(existing_relation, relation, sql)) }} + {% do return(get_replace_sql(existing_relation, relation, sql)) %} -- otherwise apply individual changes as needed {% else %} - {% set changes = [] %} - {% for clause in configuration_changes.get_alter_sql_clauses() %} - {% do changes.append("alter materialized view " ~ relation ~ " " ~ clause) %} - {% endfor %} - {{ log(changes) }} - {{ return(changes) }} + {% do return(get_alter_internal(relation, configuration_changes)) %} {%- endif -%} +{% endmacro %} + +{% macro get_alter_internal(relation, configuration_changes) %} + {%- set refresh = configuration_changes.changes["refresh"] -%} + -- Currently only schedule can be altered + ALTER MATERIALIZED VIEW {{ relation }} + {{ get_alter_sql_refresh_schedule(refresh.cron, refresh.time_zone_value, refresh.is_altered) -}} {% endmacro %} \ No newline at end of file diff 
--git a/dbt/include/databricks/macros/relations/materialized_view/create.sql b/dbt/include/databricks/macros/relations/materialized_view/create.sql index 9f475c697..5c18911ec 100644 --- a/dbt/include/databricks/macros/relations/materialized_view/create.sql +++ b/dbt/include/databricks/macros/relations/materialized_view/create.sql @@ -1,12 +1,14 @@ {% macro databricks__get_create_materialized_view_as_sql(relation, sql) -%} - {%- set materialized_view = adapter.materialized_view_config_from_model(config.model) -%} - + {%- set partition_by = materialized_view.config["partition_by"].partition_by -%} + {%- set tblproperties = materialized_view.config["tblproperties"].tblproperties -%} + {%- set comment = materialized_view.config["comment"].comment -%} + {%- set refresh = materialized_view.config["refresh"] -%} create materialized view {{ relation }} - {{ materialized_view.partition_by.to_sql_clause() }} - {{ materialized_view.comment.to_sql_clause() }} - {{ materialized_view.tblproperties.to_sql_clause() }} - {{ materialized_view.refresh.to_sql_clause() }} + {{ get_create_sql_partition_by(partition_by) }} + {{ get_create_sql_comment(comment) }} + {{ get_create_sql_tblproperties(tblproperties) }} + {{ get_create_sql_refresh_schedule(refresh.cron, refresh.time_zone_value) }} as {{ sql }} {% endmacro %} \ No newline at end of file diff --git a/dbt/include/databricks/macros/relations/replace.sql b/dbt/include/databricks/macros/relations/replace.sql index 4d853ba93..91dd99373 100644 --- a/dbt/include/databricks/macros/relations/replace.sql +++ b/dbt/include/databricks/macros/relations/replace.sql @@ -1,10 +1,9 @@ {% macro get_replace_sql(existing_relation, target_relation, sql) %} {{- log('Applying REPLACE to: ' ~ existing_relation) -}} - {{- return(adapter.dispatch('get_replace_sql', 'dbt')(existing_relation, target_relation, sql)) -}} + {% do return(adapter.dispatch('get_replace_sql', 'dbt')(existing_relation, target_relation, sql)) %} {% endmacro %} {% macro 
databricks__get_replace_sql(existing_relation, target_relation, sql) %} - {{ log('in get replace')}} {# /* use a create or replace statement if possible */ #} {% set is_replaceable = existing_relation.type == target_relation_type and existing_relation.can_be_replaced %} diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 34e3003aa..558e9075d 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -35,12 +35,9 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: 'cron': '0 0 * * * ? *', 'time_zone': 'Etc/UTC' }, - tblproperties_config = { - 'tblproperties': { - 'key': 'value' - }, - 'ignore_list': ['pipelines.pipelineId'] - } + tblproperties={ + 'key': 'value' + }, ) }} select * from {{ ref('my_seed') }} """ diff --git a/tests/functional/adapter/materialized_view_tests/test_changes.py b/tests/functional/adapter/materialized_view_tests/test_changes.py index 926348fb3..4243b4273 100644 --- a/tests/functional/adapter/materialized_view_tests/test_changes.py +++ b/tests/functional/adapter/materialized_view_tests/test_changes.py @@ -9,14 +9,16 @@ from dbt.tests import util import pytest from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig -from dbt.adapters.databricks.relation_configs.refresh import ScheduledRefreshConfig from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig from tests.functional.adapter.materialized_view_tests import fixtures def _check_tblproperties(tblproperties: TblPropertiesConfig, expected: dict): - assert tblproperties._without_ignore_list(tblproperties.tblproperties) == expected + final_tblproperties = { + k: v for k, v in tblproperties.tblproperties.items() if not k.startswith("pipeline") + } + assert final_tblproperties == expected class 
MaterializedViewChangesMixin(MaterializedViewChanges): @@ -29,12 +31,11 @@ def check_start_state(project, materialized_view): with util.get_connection(project.adapter): results = project.adapter.get_relation_config(materialized_view) assert isinstance(results, MaterializedViewConfig) - assert results.partition_by.partition_by == ["id"] - assert results.query.query.startswith("select * from") - _check_tblproperties(results.tblproperties, {"key": "value"}) - assert isinstance(results.refresh, ScheduledRefreshConfig) - assert results.refresh.cron == "0 0 * * * ? *" - assert results.refresh.time_zone_value == "Etc/UTC" + assert results.config["partition_by"].partition_by == ["id"] + assert results.config["query"].query.startswith("select * from") + _check_tblproperties(results.config["tblproperties"], {"key": "value"}) + assert results.config["refresh"].cron == "0 0 * * * ? *" + assert results.config["refresh"].time_zone_value == "Etc/UTC" @staticmethod def change_config_via_alter(project, materialized_view): @@ -47,9 +48,8 @@ def check_state_alter_change_is_applied(project, materialized_view): with util.get_connection(project.adapter): results = project.adapter.get_relation_config(materialized_view) assert isinstance(results, MaterializedViewConfig) - assert isinstance(results.refresh, ScheduledRefreshConfig) - assert results.refresh.cron == "0 5 * * * ? *" - assert results.refresh.time_zone_value == "Etc/UTC" + assert results.config["refresh"].cron == "0 5 * * * ? 
*" + assert results.config["refresh"].time_zone_value == "Etc/UTC" @staticmethod def change_config_via_replace(project, materialized_view): @@ -66,9 +66,9 @@ def check_state_replace_change_is_applied(project, materialized_view): with util.get_connection(project.adapter): results = project.adapter.get_relation_config(materialized_view) assert isinstance(results, MaterializedViewConfig) - assert results.partition_by.partition_by == [] - assert results.query.query.startswith("select id, value") - _check_tblproperties(results.tblproperties, {"other": "other"}) + assert results.config["partition_by"].partition_by == [] + assert results.config["query"].query.startswith("select id, value") + _check_tblproperties(results.config["tblproperties"], {"other": "other"}) @staticmethod def query_relation_type(project, relation: BaseRelation) -> Optional[str]: diff --git a/tests/unit/macros/relation_configs/base.py b/tests/unit/macros/relation_configs/base.py new file mode 100644 index 000000000..63b7111f4 --- /dev/null +++ b/tests/unit/macros/relation_configs/base.py @@ -0,0 +1,32 @@ +from typing import Any +from jinja2 import Environment, FileSystemLoader +import pytest + + +class RelationConfigTestBase: + @pytest.fixture(scope="class") + def env(self) -> Environment: + """ + The environment used for rendering Databricks macros + """ + return Environment( + loader=FileSystemLoader("dbt/include/databricks/macros/relations/components"), + extensions=["jinja2.ext.do"], + ) + + @pytest.fixture(scope="class") + def template_name(self) -> str: + """ + The name of the Databricks template you want to test, not including the path. + + Example: "adapters.sql" + """ + raise NotImplementedError("Must be implemented by subclasses") + + @pytest.fixture + def template(self, template_name, env) -> Any: + """ + This creates the template you will test against. + You generally don't want to override this. 
+ """ + return env.get_template(template_name).module diff --git a/tests/unit/macros/relation_configs/test_comment_macros.py b/tests/unit/macros/relation_configs/test_comment_macros.py new file mode 100644 index 000000000..119012460 --- /dev/null +++ b/tests/unit/macros/relation_configs/test_comment_macros.py @@ -0,0 +1,20 @@ +import pytest +from tests.unit.macros.relation_configs.base import RelationConfigTestBase + + +class TestCommentMacros(RelationConfigTestBase): + @pytest.fixture(scope="class") + def template_name(self) -> str: + return "comment.sql" + + def test_get_create_sql_comment__with_no_comment(self, template): + s = template.get_create_sql_comment(None) + assert s == "" + + def test_get_create_sql_comment__with_empty_comment(self, template): + s = template.get_create_sql_comment("") + assert s == "COMMENT ''" + + def test_get_create_sql_comment__with_comment(self, template): + s = template.get_create_sql_comment("test_comment") + assert s == "COMMENT 'test_comment'" diff --git a/tests/unit/macros/relation_configs/test_partitioning_macros.py b/tests/unit/macros/relation_configs/test_partitioning_macros.py new file mode 100644 index 000000000..b502f58c6 --- /dev/null +++ b/tests/unit/macros/relation_configs/test_partitioning_macros.py @@ -0,0 +1,24 @@ +import pytest +from tests.unit.macros.relation_configs.base import RelationConfigTestBase + + +class TestPartitioningMacros(RelationConfigTestBase): + @pytest.fixture(scope="class") + def template_name(self) -> str: + return "partitioning.sql" + + def test_get_create_sql_partition_by__empty(self, template): + s = template.get_create_sql_partition_by([]) + assert s == "" + + def test_get_create_sql_partition_by__none(self, template): + s = template.get_create_sql_partition_by(None) + assert s == "" + + def test_get_create_sql_partition_by__single(self, template): + s = template.get_create_sql_partition_by(["id"]) + assert s == "PARTITIONED BY (id)" + + def test_get_create_sql_partition_by__multiple(self, 
template): + s = template.get_create_sql_partition_by(["id", "value"]) + assert s == "PARTITIONED BY (id, value)" diff --git a/tests/unit/macros/relation_configs/test_refresh_schedule_macros.py b/tests/unit/macros/relation_configs/test_refresh_schedule_macros.py new file mode 100644 index 000000000..cf11d3ad1 --- /dev/null +++ b/tests/unit/macros/relation_configs/test_refresh_schedule_macros.py @@ -0,0 +1,40 @@ +import pytest +from tests.unit.macros.relation_configs.base import RelationConfigTestBase + + +class TestRefreshScheduleMacros(RelationConfigTestBase): + @pytest.fixture(scope="class") + def template_name(self) -> str: + return "refresh_schedule.sql" + + def test_get_create_sql_refresh_schedule__manual(self, template): + s = template.get_create_sql_refresh_schedule(None, None) + assert s == "" + + def test_get_create_sql_refresh_schedule__cron_only(self, template): + s = template.get_create_sql_refresh_schedule("*/5 * * * * ?", None) + assert s == "SCHEDULE CRON '*/5 * * * * ?'" + + def test_get_create_sql_refresh_schedule__with_time_zone(self, template): + s = template.get_create_sql_refresh_schedule("*/5 * * * * ?", "UTC") + assert s == "SCHEDULE CRON '*/5 * * * * ?' AT TIME ZONE 'UTC'" + + def test_get_alter_sql_refresh_schedule__manual(self, template): + s = template.get_alter_sql_refresh_schedule(None, None, False) + assert s == "DROP SCHEDULE" + + def test_get_alter_sql_refresh_schedule__add_cron_only(self, template): + s = template.get_alter_sql_refresh_schedule("*/5 * * * * ?", None, False) + assert s == "ADD SCHEDULE CRON '*/5 * * * * ?'" + + def test_get_alter_sql_refresh_schedule__add_with_time_zone(self, template): + s = template.get_alter_sql_refresh_schedule("*/5 * * * * ?", "UTC", False) + assert s == "ADD SCHEDULE CRON '*/5 * * * * ?' 
AT TIME ZONE 'UTC'" + + def test_get_alter_sql_refresh_schedule__alter_cron_only(self, template): + s = template.get_alter_sql_refresh_schedule("*/5 * * * * ?", None, True) + assert s == "ALTER SCHEDULE CRON '*/5 * * * * ?'" + + def test_get_alter_sql_refresh_schedule__alter_with_time_zone(self, template): + s = template.get_alter_sql_refresh_schedule("*/5 * * * * ?", "UTC", True) + assert s == "ALTER SCHEDULE CRON '*/5 * * * * ?' AT TIME ZONE 'UTC'" diff --git a/tests/unit/macros/relation_configs/test_tblproperties_macros.py b/tests/unit/macros/relation_configs/test_tblproperties_macros.py new file mode 100644 index 000000000..60ff71f0d --- /dev/null +++ b/tests/unit/macros/relation_configs/test_tblproperties_macros.py @@ -0,0 +1,24 @@ +import pytest +from tests.unit.macros.relation_configs.base import RelationConfigTestBase + + +class TestTblPropertiesMacros(RelationConfigTestBase): + @pytest.fixture(scope="class") + def template_name(self) -> str: + return "tblproperties.sql" + + def test_get_create_sql_tblproperties__empty(self, template): + s = template.get_create_sql_tblproperties({}) + assert s == "" + + def test_get_create_sql_tblproperties__none(self, template): + s = template.get_create_sql_tblproperties(None) + assert s == "" + + def test_get_create_sql_tblproperties__single(self, template): + s = template.get_create_sql_tblproperties({"key": "value"}) + assert s == "TBLPROPERTIES ('key' = 'value')" + + def test_get_create_sql_tblproperties__multiple(self, template): + s = template.get_create_sql_tblproperties({"key": "value", "other": "other_value"}) + assert s == "TBLPROPERTIES ('key' = 'value', 'other' = 'other_value')" diff --git a/tests/unit/relation_configs/test_comment.py b/tests/unit/relation_configs/test_comment.py index 4525c42d5..6329641a8 100644 --- a/tests/unit/relation_configs/test_comment.py +++ b/tests/unit/relation_configs/test_comment.py @@ -1,16 +1,8 @@ -import pytest +from mock import Mock from agate import Table from 
dbt.adapters.databricks.relation_configs.comment import CommentConfig, CommentProcessor -class TestCommentConfig: - @pytest.mark.parametrize( - "input,expected", [(None, ""), ("", ""), ("comment", "COMMENT 'comment'")] - ) - def test_comment_config(self, input, expected): - assert CommentConfig(input).to_sql_clause() == expected - - class TestCommentProcessor: def test_from_results__no_comment(self): results = { @@ -45,4 +37,22 @@ def test_from_results__with_comment(self): ) } config = CommentProcessor.from_results(results) - assert config == CommentConfig("This is the table comment") + assert config == CommentConfig(comment="This is the table comment") + + def test_from_model_node__no_comment(self): + model_node = Mock() + model_node.description = None + config = CommentProcessor.from_model_node(model_node) + assert config == CommentConfig() + + def test_from_model_node__empty_comment(self): + model_node = Mock() + model_node.description = "" + config = CommentProcessor.from_model_node(model_node) + assert config == CommentConfig(comment="") + + def test_from_model_node__comment(self): + model_node = Mock() + model_node.description = "a comment" + config = CommentProcessor.from_model_node(model_node) + assert config == CommentConfig(comment="a comment") diff --git a/tests/unit/relation_configs/test_config_base.py b/tests/unit/relation_configs/test_config_base.py index 1637c065b..2f99307e7 100644 --- a/tests/unit/relation_configs/test_config_base.py +++ b/tests/unit/relation_configs/test_config_base.py @@ -1,36 +1,21 @@ -from dataclasses import dataclass -import pytest from dbt.adapters.databricks.relation_configs.base import ( DatabricksComponentConfig, - DatabricksConfigChange, DatabricksRelationChangeSet, ) -from dbt.exceptions import DbtRuntimeError from dbt.adapters.databricks.relation_configs.comment import CommentConfig -from dbt.adapters.databricks.relation_configs.base import RelationConfigChangeAction -from dbt.adapters.databricks.relation_configs.refresh 
import ( - ManualRefreshConfig, - ScheduledRefreshConfig, -) +from dbt.adapters.databricks.relation_configs.refresh import RefreshConfig -@dataclass(frozen=True, eq=True, unsafe_hash=True) class MockComponentConfig(DatabricksComponentConfig): data: int = 1 - def to_sql_clause(self) -> str: - return "" + @property + def requires_full_refresh(self) -> bool: + return True class TestDatabricksComponentConfig: - def test_get_diff__invalid_type(self): - config = MockComponentConfig() - other = CommentConfig() - - with pytest.raises(DbtRuntimeError, match="Cannot diff"): - config.get_diff(other) - def test_get_diff__valid_type(self): config = MockComponentConfig() other = MockComponentConfig(data=2) @@ -38,73 +23,25 @@ def test_get_diff__valid_type(self): assert config.get_diff(other) == config -class TestDatabricksConfigChange: - def test_requires_full_refresh__unalterable(self): - change = DatabricksConfigChange(RelationConfigChangeAction.alter, MockComponentConfig()) - assert change.requires_full_refresh - - def test_requires_full_refresh__alterable(self): - change = DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()) - assert not change.requires_full_refresh - - def test_get_change__no_change(self): - change = DatabricksConfigChange.get_change(ManualRefreshConfig(), ManualRefreshConfig()) - assert change is None - - def test_get_change__with_diff(self): - change = DatabricksConfigChange.get_change( - ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC"), ManualRefreshConfig() - ) - assert change == DatabricksConfigChange( - RelationConfigChangeAction.alter, ManualRefreshConfig() - ) - - class TestDatabricksRelationChangeSet: def test_requires_full_refresh__no_changes(self): - changeset = DatabricksRelationChangeSet([]) + changeset = DatabricksRelationChangeSet(changes={}) assert not changeset.requires_full_refresh def test_requires_full_refresh__has_only_alterable_changes(self): - changeset = DatabricksRelationChangeSet( - 
[DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig())] - ) + changeset = DatabricksRelationChangeSet(changes={"refresh": RefreshConfig()}) assert not changeset.requires_full_refresh def test_requires_full_refresh__has_an_inalterable_change(self): changeset = DatabricksRelationChangeSet( - [ - DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()), - DatabricksConfigChange(RelationConfigChangeAction.alter, CommentConfig()), - ] + changes={"comment": CommentConfig(), "refresh": RefreshConfig()} ) assert changeset.requires_full_refresh def test_has_changes__no_changes(self): - changeset = DatabricksRelationChangeSet([]) + changeset = DatabricksRelationChangeSet(changes={}) assert not changeset.has_changes def test_has_changes__has_changes(self): - changeset = DatabricksRelationChangeSet( - [DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig())] - ) + changeset = DatabricksRelationChangeSet(changes={"refresh": RefreshConfig()}) assert changeset.has_changes - - def test_get_alter_sql_clauses__no_changes(self): - changeset = DatabricksRelationChangeSet([]) - assert changeset.get_alter_sql_clauses() == [] - - def test_get_alter_sql_clauses__with_inalterable_change(self): - changeset = DatabricksRelationChangeSet( - [DatabricksConfigChange(RelationConfigChangeAction.alter, CommentConfig())] - ) - with pytest.raises(AssertionError): - changeset.get_alter_sql_clauses() - - def test_get_alter_sql_clauses__with_alterable_change(self): - changeset = DatabricksRelationChangeSet( - [ - DatabricksConfigChange(RelationConfigChangeAction.alter, ManualRefreshConfig()), - ] - ) - assert changeset.get_alter_sql_clauses() == ["DROP SCHEDULE"] diff --git a/tests/unit/relation_configs/test_materialized_view_config.py b/tests/unit/relation_configs/test_materialized_view_config.py index 42814b75c..4d10a72a7 100644 --- a/tests/unit/relation_configs/test_materialized_view_config.py +++ 
b/tests/unit/relation_configs/test_materialized_view_config.py @@ -4,7 +4,7 @@ from dbt.adapters.databricks.relation_configs.materialized_view import MaterializedViewConfig from dbt.adapters.databricks.relation_configs.partitioning import PartitionedByConfig from dbt.adapters.databricks.relation_configs.query import QueryConfig -from dbt.adapters.databricks.relation_configs.refresh import ManualRefreshConfig +from dbt.adapters.databricks.relation_configs.refresh import RefreshConfig from dbt.adapters.databricks.relation_configs.tblproperties import TblPropertiesConfig @@ -35,11 +35,13 @@ def test_from_results(self): config = MaterializedViewConfig.from_results(results) assert config == MaterializedViewConfig( - PartitionedByConfig(["col_a", "col_b"]), - CommentConfig("This is the table comment"), - TblPropertiesConfig({"prop": "1", "other": "other"}), - ManualRefreshConfig(), - QueryConfig("select * from foo"), + config={ + "partition_by": PartitionedByConfig(partition_by=["col_a", "col_b"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(), + "query": QueryConfig(query="select * from foo"), + } ) def test_from_model_node(self): @@ -47,22 +49,71 @@ def test_from_model_node(self): model.compiled_code = "select * from foo" model.config.extra = { "partition_by": ["col_a", "col_b"], - "tblproperties_config": { - "tblproperties": { - "prop": "1", - "other": "other", - } + "tblproperties": { + "prop": "1", + "other": "other", }, - "refresh": "manual", } model.description = "This is the table comment" config = MaterializedViewConfig.from_model_node(model) assert config == MaterializedViewConfig( - PartitionedByConfig(["col_a", "col_b"]), - CommentConfig("This is the table comment"), - TblPropertiesConfig({"prop": "1", "other": "other"}), - ManualRefreshConfig(), - QueryConfig("select * from foo"), + config={ + "partition_by": 
PartitionedByConfig(partition_by=["col_a", "col_b"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(), + "query": QueryConfig(query="select * from foo"), + } ) + + def test_get_changeset__no_changes(self): + old = MaterializedViewConfig( + config={ + "partition_by": PartitionedByConfig(partition_by=["col_a", "col_b"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(), + "query": QueryConfig(query="select * from foo"), + } + ) + new = MaterializedViewConfig( + config={ + "partition_by": PartitionedByConfig(partition_by=["col_a", "col_b"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(), + "query": QueryConfig(query="select * from foo"), + } + ) + + assert new.get_changeset(old) is None + + def test_get_changeset__some_changes(self): + old = MaterializedViewConfig( + config={ + "partition_by": PartitionedByConfig(partition_by=["col_a", "col_b"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(), + "query": QueryConfig(query="select * from foo"), + } + ) + new = MaterializedViewConfig( + config={ + "partition_by": PartitionedByConfig(partition_by=["col_a"]), + "comment": CommentConfig(comment="This is the table comment"), + "tblproperties": TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}), + "refresh": RefreshConfig(cron="*/5 * * * *"), + "query": QueryConfig(query="select * from foo"), + } + ) + + changeset = new.get_changeset(old) + assert changeset.has_changes + assert changeset.requires_full_refresh + assert changeset.changes 
== { + "partition_by": PartitionedByConfig(partition_by=["col_a"]), + "refresh": RefreshConfig(cron="*/5 * * * *"), + } diff --git a/tests/unit/relation_configs/test_partitioning.py b/tests/unit/relation_configs/test_partitioning.py index 56ce49fd3..9fb01f126 100644 --- a/tests/unit/relation_configs/test_partitioning.py +++ b/tests/unit/relation_configs/test_partitioning.py @@ -1,5 +1,4 @@ from mock import Mock -import pytest from agate import Table from dbt.adapters.databricks.relation_configs.partitioning import ( @@ -8,21 +7,6 @@ ) -class TestPartitionedByConfig: - @pytest.mark.parametrize( - "input,expected", - [ - (None, ""), - ([], ""), - (["col_a"], "PARTITIONED BY (col_a)"), - (["col_a", "col_b"], "PARTITIONED BY (col_a, col_b)"), - ], - ) - def test_to_sql_clause(self, input, expected): - config = PartitionedByConfig(input) - assert config.to_sql_clause() == expected - - class TestPartitionedByProcessor: def test_from_results__none(self): results = { @@ -38,7 +22,7 @@ def test_from_results__none(self): } spec = PartitionedByProcessor.from_results(results) - assert spec == PartitionedByConfig([]) + assert spec == PartitionedByConfig(partition_by=[]) def test_from_results__single(self): results = { @@ -57,7 +41,7 @@ def test_from_results__single(self): } spec = PartitionedByProcessor.from_results(results) - assert spec == PartitionedByConfig(["col_a"]) + assert spec == PartitionedByConfig(partition_by=["col_a"]) def test_from_results__multiple(self): results = { @@ -76,22 +60,22 @@ def test_from_results__multiple(self): ) } spec = PartitionedByProcessor.from_results(results) - assert spec == PartitionedByConfig(["col_a", "col_b"]) + assert spec == PartitionedByConfig(partition_by=["col_a", "col_b"]) def test_from_model_node__without_partition_by(self): model = Mock() model.config.extra = {} spec = PartitionedByProcessor.from_model_node(model) - assert spec == PartitionedByConfig(None) + assert spec == PartitionedByConfig(partition_by=[]) def 
test_from_model_node__single_column(self): model = Mock() model.config.extra = {"partition_by": "col_a"} spec = PartitionedByProcessor.from_model_node(model) - assert spec == PartitionedByConfig(["col_a"]) + assert spec == PartitionedByConfig(partition_by=["col_a"]) def test_from_model_node__multiple_columns(self): model = Mock() model.config.extra = {"partition_by": ["col_a", "col_b"]} spec = PartitionedByProcessor.from_model_node(model) - assert spec == PartitionedByConfig(["col_a", "col_b"]) + assert spec == PartitionedByConfig(partition_by=["col_a", "col_b"]) diff --git a/tests/unit/relation_configs/test_query.py b/tests/unit/relation_configs/test_query.py index ed25aa0b1..c3d9c412b 100644 --- a/tests/unit/relation_configs/test_query.py +++ b/tests/unit/relation_configs/test_query.py @@ -11,13 +11,13 @@ class TestQueryProcessor: def test_from_results(self): results = {"information_schema.views": Row([sql, "other"], ["view_definition", "comment"])} spec = QueryProcessor.from_results(results) - assert spec == QueryConfig(sql) + assert spec == QueryConfig(query=sql) def test_from_model_node__with_query(self): model = Mock() model.compiled_code = sql spec = QueryProcessor.from_model_node(model) - assert spec == QueryConfig(sql) + assert spec == QueryConfig(query=sql) def test_from_model_node__without_query(self): model = Mock() diff --git a/tests/unit/relation_configs/test_refresh.py b/tests/unit/relation_configs/test_refresh.py index 2ea6256c5..5dddb3a7d 100644 --- a/tests/unit/relation_configs/test_refresh.py +++ b/tests/unit/relation_configs/test_refresh.py @@ -3,8 +3,7 @@ import pytest from dbt.adapters.databricks.relation_configs.refresh import ( RefreshProcessor, - ManualRefreshConfig, - ScheduledRefreshConfig, + RefreshConfig, ) from dbt.exceptions import DbtRuntimeError from agate import Table @@ -30,12 +29,12 @@ def test_from_results__valid_schedule(self, rows): ) } spec = RefreshProcessor.from_results(results) - assert spec == 
ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + assert spec == RefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") def test_from_results__manual(self, rows): results = {"describe_extended": Table(rows=rows + [["Refresh Schedule", "MANUAL"]])} spec = RefreshProcessor.from_results(results) - assert spec == ManualRefreshConfig() + assert spec == RefreshConfig() def test_from_results__invalid(self, rows): results = { @@ -51,7 +50,7 @@ def test_from_model_node__without_schedule(self): model = Mock() model.config.extra = {} spec = RefreshProcessor.from_model_node(model) - assert spec == ManualRefreshConfig() + assert spec == RefreshConfig() def test_from_model_node__without_cron(self): model = Mock() @@ -66,80 +65,36 @@ def test_from_model_node__without_timezone(self): model = Mock() model.config.extra = {"schedule": {"cron": "*/5 * * * *"}} spec = RefreshProcessor.from_model_node(model) - assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) + assert spec == RefreshConfig(cron="*/5 * * * *", time_zone_value=None) def test_process_model_node__both(self): model = Mock() model.config.extra = {"schedule": {"cron": "*/5 * * * *", "time_zone_value": "UTC"}} spec = RefreshProcessor.from_model_node(model) - assert spec == ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") + assert spec == RefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") -class TestScheduledRefreshConfig: - def test_to_sql_clause__no_timezone(self): - spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value=None) - assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *'" - - def test_to_sql_clause__with_timezone(self): - spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC") - assert spec.to_sql_clause() == "SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'" - - def test_to_alter_sql_clauses__alter_false(self): - spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC", alter=False) - assert 
spec.to_alter_sql_clauses() == [ - "ADD SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'", - ] - - def test_to_alter_sql_clauses__alter_true(self): - spec = ScheduledRefreshConfig(cron="*/5 * * * *", time_zone_value="UTC", alter=True) - assert spec.to_alter_sql_clauses() == [ - "ALTER SCHEDULE CRON '*/5 * * * *' AT TIME ZONE 'UTC'", - ] - - def test_get_diff__other_not_refresh(self): - config = ScheduledRefreshConfig(cron="*/5 * * * *") - other = Mock() - with pytest.raises( - DbtRuntimeError, - match="Cannot diff ScheduledRefreshConfig with Mock", - ): - config.get_diff(other) - - def test_get_diff__other_manual_refresh(self): - config = ScheduledRefreshConfig(cron="*/5 * * * *") - other = ManualRefreshConfig() +class TestRefreshConfig: + def test_get_diff__scheduled_other_manual_refresh(self): + config = RefreshConfig(cron="*/5 * * * *") + other = RefreshConfig() diff = config.get_diff(other) - assert diff == ScheduledRefreshConfig(cron="*/5 * * * *", alter=False) + assert diff == RefreshConfig(cron="*/5 * * * *", is_altered=False) - def test_get_diff__other_scheduled_refresh(self): - config = ScheduledRefreshConfig(cron="*/5 * * * *") - other = ScheduledRefreshConfig(cron="0 * * * *") + def test_get_diff__scheduled_other_scheduled_refresh(self): + config = RefreshConfig(cron="*/5 * * * *") + other = RefreshConfig(cron="0 * * * *") diff = config.get_diff(other) - assert diff == ScheduledRefreshConfig(cron="*/5 * * * *", alter=True) - - -class TestManualRefreshConfig: - def test_get_diff__other_not_refresh(self): - config = ManualRefreshConfig() - other = Mock() - with pytest.raises( - DbtRuntimeError, - match="Cannot diff ManualRefreshConfig with Mock", - ): - config.get_diff(other) + assert diff == RefreshConfig(cron="*/5 * * * *", is_altered=True) - def test_get_diff__other_scheduled_refresh(self): - config = ManualRefreshConfig() - other = ScheduledRefreshConfig(cron="*/5 * * * *") + def test_get_diff__manual_other_scheduled_refresh(self): + config = 
RefreshConfig() + other = RefreshConfig(cron="*/5 * * * *") diff = config.get_diff(other) assert diff == config - def test_get_diff__other_manual_refresh(self): - config = ManualRefreshConfig() - other = ManualRefreshConfig() + def test_get_diff__manual_other_manual_refresh(self): + config = RefreshConfig() + other = RefreshConfig() diff = config.get_diff(other) - assert diff == config - - def test_to_alter_sql_clauses(self): - spec = ManualRefreshConfig() - assert spec.to_alter_sql_clauses() == ["DROP SCHEDULE"] + assert diff is None diff --git a/tests/unit/relation_configs/test_tblproperties.py b/tests/unit/relation_configs/test_tblproperties.py index 606196282..a1929bba9 100644 --- a/tests/unit/relation_configs/test_tblproperties.py +++ b/tests/unit/relation_configs/test_tblproperties.py @@ -9,111 +9,11 @@ from dbt.exceptions import DbtRuntimeError -class TestTblPropertiesConfig: - def test_post_init__conflicting_tblproperties(self): - with pytest.raises( - DbtRuntimeError, - match="Cannot set and unset the same tblproperty in the same model", - ): - _ = TblPropertiesConfig({"prop": "1"}, to_unset=["prop"]) - - @pytest.mark.parametrize( - "input,expected", - [ - (None, ""), - (dict(), ""), - ({"prop": 1}, "TBLPROPERTIES ('prop' = '1')"), - ({"prop": 1, "other": "thing"}, "TBLPROPERTIES ('prop' = '1', 'other' = 'thing')"), - ], - ) - def test_to_sql_clause(self, input, expected): - config = TblPropertiesConfig(tblproperties=input) - assert config.to_sql_clause() == expected - - def test_eq__match(self): - config1 = TblPropertiesConfig(tblproperties={"prop": "1"}) - config2 = TblPropertiesConfig(tblproperties={"prop": "1"}) - assert config1 == config2 - - def test_eq__mismatch(self): - config1 = TblPropertiesConfig(tblproperties={"prop": "1"}) - config2 = TblPropertiesConfig(tblproperties={"prop": "2"}) - assert config1 != config2 - - def test_eq__ignore_list(self): - config1 = TblPropertiesConfig(tblproperties={"prop": "1"}, ignore_list=["prop"]) - config2 = 
TblPropertiesConfig(tblproperties={"prop": "2"}) - assert config1 == config2 - - def test_eq__ignore_list_extra(self): - config1 = TblPropertiesConfig( - tblproperties={"prop": "1", "other": "other"}, ignore_list=["prop"] - ) - config2 = TblPropertiesConfig(tblproperties={"prop": "2", "other": "other"}) - assert config1 == config2 - - def test_eq__default_ignore_list(self): - config1 = TblPropertiesConfig(tblproperties={"pipelines.pipelineId": "1"}) - config2 = TblPropertiesConfig(tblproperties={}) - assert config1 == config2 - - def test_get_diff__same_properties(self): - config = TblPropertiesConfig(tblproperties={"prop": "1"}) - diff = config.get_diff(config) - assert diff == TblPropertiesConfig() - - def test_get_diff__other_has_added_prop(self): - config = TblPropertiesConfig(tblproperties={"prop": "1"}) - other = TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}) - diff = config.get_diff(other) - assert diff == TblPropertiesConfig(to_unset=["other"]) - - def test_get_diff__other_has_diff_value(self): - config = TblPropertiesConfig(tblproperties={"prop": "1"}) - other = TblPropertiesConfig(tblproperties={"prop": "2"}) - diff = config.get_diff(other) - assert diff == TblPropertiesConfig(tblproperties={"prop": "1"}) - - def test_get_diff__mix(self): - config = TblPropertiesConfig( - tblproperties={"prop": "1", "other": "2", "third": "3"}, ignore_list=["third"] - ) - other = TblPropertiesConfig( - tblproperties={"prop": "2", "other": "2", "third": "4", "fourth": "5"} - ) - diff = config.get_diff(other) - assert diff == TblPropertiesConfig(tblproperties={"prop": "1"}, to_unset=["fourth"]) - - def test_to_alter_sql_clauses__none(self): - config = TblPropertiesConfig() - assert config.to_alter_sql_clauses() == [] - - def test_to_alter_sql_clauses__set_only(self): - config = TblPropertiesConfig({"prop": "1", "other": "other"}) - assert config.to_alter_sql_clauses() == [ - "SET TBLPROPERTIES ('prop' = '1', 'other' = 'other')" - ] - - def 
test_to_alter_sql_clauses__unset_only(self): - config = TblPropertiesConfig(to_unset=["prop", "other"]) - assert config.to_alter_sql_clauses() == ["UNSET TBLPROPERTIES IF EXISTS ('prop', 'other')"] - - def test_to_alter_sql_clauses__both(self): - config = TblPropertiesConfig( - {"prop": "1"}, - to_unset=["other"], - ) - assert config.to_alter_sql_clauses() == [ - "SET TBLPROPERTIES ('prop' = '1')", - "UNSET TBLPROPERTIES IF EXISTS ('other')", - ] - - class TestTblPropertiesProcessor: def test_from_results__none(self): results = {"show_tblproperties": None} spec = TblPropertiesProcessor.from_results(results) - assert spec == TblPropertiesConfig() + assert spec == TblPropertiesConfig(tblproperties={}) def test_from_results__single(self): results = {"show_tblproperties": Table(rows=[["prop", "f1"]])} spec = TblPropertiesProcessor.from_results(results) @@ -125,61 +25,31 @@ def test_from_results__multiple(self): spec = TblPropertiesProcessor.from_results(results) assert spec == TblPropertiesConfig(tblproperties={"prop": "1", "other": "other"}) - def test_from_model_node__without_tblproperties_config(self): + def test_from_model_node__without_tblproperties(self): model = Mock() model.config.extra = {} spec = TblPropertiesProcessor.from_model_node(model) - assert spec == TblPropertiesConfig() - - def test_from_model_node__with_all_config(self): - model = Mock() - model.config.extra = { - "tblproperties_config": { - "tblproperties": {"prop": 1}, - "ignore_list": ["other"], - } - } - spec = TblPropertiesProcessor.from_model_node(model) - assert spec == TblPropertiesConfig(tblproperties={"prop": "1"}, ignore_list=["other"]) + assert spec == TblPropertiesConfig(tblproperties={}) - def test_from_model_node__without_ignore_list(self): + def test_from_model_node__with_tblproperties(self): model = Mock() model.config.extra = { - "tblproperties_config": { - "tblproperties": {"prop": 1}, - } + "tblproperties": {"prop": 1}, } spec = TblPropertiesProcessor.from_model_node(model) assert spec == 
TblPropertiesConfig(tblproperties={"prop": "1"}) - def test_from_model_node__without_tblproperties(self): + def test_from_model_node__with_empty_tblproperties(self): model = Mock() - model.config.extra = { - "tblproperties_config": { - "ignore_list": ["other"], - } - } + model.config.extra = {"tblproperties": {}} spec = TblPropertiesProcessor.from_model_node(model) - assert spec == TblPropertiesConfig(ignore_list=["other"]) + assert spec == TblPropertiesConfig(tblproperties={}) def test_from_model_node__with_incorrect_tblproperties(self): model = Mock() - model.config.extra = { - "tblproperties_config": {"ignore_list": ["other"], "tblproperties": True} - } + model.config.extra = {"tblproperties": True} with pytest.raises( DbtRuntimeError, match="tblproperties must be a dictionary", ): _ = TblPropertiesProcessor.from_model_node(model) - - def test_from_model_node__with_incorrect_ignore_list(self): - model = Mock() - model.config.extra = { - "tblproperties_config": {"tblproperties": {"prop": 1}, "ignore_list": True} - } - with pytest.raises( - DbtRuntimeError, - match="ignore_list must be a list", - ): - _ = TblPropertiesProcessor.from_model_node(model) From de1cc29015d7b9bb603b0f67b7af5870acad2194 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 2 Jan 2024 09:30:10 -0800 Subject: [PATCH 22/35] Changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dc3b5989..f34986522 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## dbt-databricks 1.7.4 (Jan 24, 2024) +### Features + +- Support `on_config_change` for materialized views, expand the supported config options ([536](https://github.com/databricks/dbt-databricks/pull/536)) + ### Fixes - Added python model specific connection handling to prevent using invalid sessions ([547](https://github.com/databricks/dbt-databricks/pull/547)) From b7f43c11730b7596479d02893e34383e80e3a9b7 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 3 Jan 2024 
09:10:45 -0800 Subject: [PATCH 23/35] does this fix test failures --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 5adee10af..602702990 100644 --- a/tox.ini +++ b/tox.ini @@ -64,7 +64,7 @@ allowlist_externals = /bin/bash basepython = python3 commands = /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_uc_sql_endpoint -n4 tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' - /bin/bash -c '{envpython} -m pytest -v --profile databricks_uc_sql_endpoint -n4 tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v --profile databricks_uc_sql_endpoint tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' passenv = DBT_* PYTEST_ADDOPTS From a4f5c0de354b7f3746dfacc4a5d84e788cf8e777 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 3 Jan 2024 14:49:00 -0800 Subject: [PATCH 24/35] doc strings --- .../databricks/relation_configs/base.py | 68 +++++++++++++++++++ .../databricks/relation_configs/comment.py | 4 ++ .../relation_configs/partitioning.py | 2 + .../databricks/relation_configs/query.py | 2 + .../databricks/relation_configs/refresh.py | 6 ++ .../relation_configs/tblproperties.py | 10 +++ 6 files changed, 92 insertions(+) diff --git a/dbt/adapters/databricks/relation_configs/base.py b/dbt/adapters/databricks/relation_configs/base.py index 07e0c25d3..7a6216eac 100644 --- a/dbt/adapters/databricks/relation_configs/base.py +++ b/dbt/adapters/databricks/relation_configs/base.py @@ -8,29 +8,65 @@ class DatabricksComponentConfig(BaseModel, ABC): + """Class for encapsulating a single component of a Databricks relation config. + + Ex: A materialized view has a `query` component, which is a string that if changed, requires a + full refresh. 
+ """ + model_config = ConfigDict(frozen=True) @property @abstractmethod def requires_full_refresh(self) -> bool: + """Whether or not the relation that is configured by this component requires a full refresh + (i.e. a drop and recreate) if this component has changed. + """ + raise NotImplementedError("Must be implemented by subclass") def get_diff(self, other: Self) -> Optional[Self]: + """Get the config that must be applied when this component differs from the existing + version. This method is intended to only be called on the new version (i.e. the version + specified in the dbt project). + + If the difference does not require any changes to the existing relation, this method should + return None. If some partial change can be applied to the existing relation, the + implementing component should override this method to return an instance representing the + partial change; however, care should be taken to ensure that the returned object retains + the complete config specified in the dbt project, so as to support rendering the `create` + as well as the `alter` statements, for the case where a different component requires full + refresh. + + Consider updating tblproperties: we can apply only the differences to an existing relation, + but if some other modified component requires us to completely replace the relation, we + should still be able to construct appropriate create clause from the object returned by + this method. + """ + if self != other: return self return None class DatabricksRelationChangeSet(BaseModel): + """Class for encapsulating the changes that need to be applied to a Databricks relation.""" + model_config = ConfigDict(frozen=True) changes: Dict[str, DatabricksComponentConfig] @property def requires_full_refresh(self) -> bool: + """Whether or not the relation that is to be configured by this change set requires a full + refresh. 
+ """ + return any(change.requires_full_refresh for change in self.changes.values()) @property def has_changes(self) -> bool: + """Whether or not this change set has any changes that need to be applied.""" + return len(self.changes) > 0 @@ -38,25 +74,51 @@ def has_changes(self) -> bool: class DatabricksComponentProcessor(ABC, Generic[Component]): + """Class for encapsulating the logic for extracting a single config component from either the + project config, or the existing relation. + """ + + # The name of the component. This is used as the key in the config dictionary of the relation + # config. name: ClassVar[str] @classmethod @abstractmethod def from_results(cls, row: RelationResults) -> Component: + """Extract the component from the results of a query against the existing relation.""" + raise NotImplementedError("Must be implemented by subclass") @classmethod @abstractmethod def from_model_node(cls, model_node: ModelNode) -> Component: + """Extract the component from the model node. + + While some components, e.g. query, can be extracted directly from the model node, + specialized Databricks config can be found in model_node.config.extra. + """ + raise NotImplementedError("Must be implemented by subclass") class DatabricksRelationConfigBase(BaseModel, ABC): + """Class for encapsulating the config of a Databricks relation. + + Ex: all of the config for specifying a Materialized View is handled by MaterializedViewConfig. + Concretely though, since that config is compatible with the default behavior of this class, + only the list of component processors is specified by its subclass. + """ + + # The list of components that make up the relation config. In the base implementation, these + # components are applied sequentially to either the existing relation, or the model node, to + # build up the config. 
config_components: ClassVar[List[Type[DatabricksComponentProcessor]]] config: Dict[str, DatabricksComponentConfig] @classmethod def from_model_node(cls, model_node: ModelNode) -> Self: + """Build the relation config from a model node.""" + config_dict: Dict[str, DatabricksComponentConfig] = {} for component in cls.config_components: model_component = component.from_model_node(model_node) @@ -67,6 +129,8 @@ def from_model_node(cls, model_node: ModelNode) -> Self: @classmethod def from_results(cls, results: RelationResults) -> Self: + """Build the relation config from the results of a query against the existing relation.""" + config_dict: Dict[str, DatabricksComponentConfig] = {} for component in cls.config_components: result_component = component.from_results(results) @@ -76,6 +140,10 @@ def from_results(cls, results: RelationResults) -> Self: return cls(config=config_dict) def get_changeset(self, existing: Self) -> Optional[DatabricksRelationChangeSet]: + """Get the changeset that must be applied to the existing relation to make it match the + current state of the dbt project. + """ + changes = {} for key, value in self.config.items(): diff --git a/dbt/adapters/databricks/relation_configs/comment.py b/dbt/adapters/databricks/relation_configs/comment.py index 4f88db77d..94e9c359c 100644 --- a/dbt/adapters/databricks/relation_configs/comment.py +++ b/dbt/adapters/databricks/relation_configs/comment.py @@ -9,10 +9,14 @@ class CommentConfig(DatabricksComponentConfig): + """Component encapsulating the relation-level comment.""" + comment: Optional[str] = None @property def requires_full_refresh(self) -> bool: + # TODO: This is only True for MVs since they don't currently allow ALTER VIEW to change the + # comment. Should be False for tables and views, if and when they move to this approach. 
return True diff --git a/dbt/adapters/databricks/relation_configs/partitioning.py b/dbt/adapters/databricks/relation_configs/partitioning.py index 8d58a5cef..6cc288cd8 100644 --- a/dbt/adapters/databricks/relation_configs/partitioning.py +++ b/dbt/adapters/databricks/relation_configs/partitioning.py @@ -10,6 +10,8 @@ class PartitionedByConfig(DatabricksComponentConfig): + """Component encapsulating the partitioning of relations.""" + partition_by: List[str] @property diff --git a/dbt/adapters/databricks/relation_configs/query.py b/dbt/adapters/databricks/relation_configs/query.py index a96cb552e..2c8cc37e2 100644 --- a/dbt/adapters/databricks/relation_configs/query.py +++ b/dbt/adapters/databricks/relation_configs/query.py @@ -9,6 +9,8 @@ class QueryConfig(DatabricksComponentConfig): + """Component encapsulating the query that defines a relation.""" + query: str @property diff --git a/dbt/adapters/databricks/relation_configs/refresh.py b/dbt/adapters/databricks/relation_configs/refresh.py index 377b2c5f6..7768ac4fd 100644 --- a/dbt/adapters/databricks/relation_configs/refresh.py +++ b/dbt/adapters/databricks/relation_configs/refresh.py @@ -14,8 +14,14 @@ class RefreshConfig(DatabricksComponentConfig): + """Component encapsulating the refresh schedule of a relation.""" + cron: Optional[str] = None time_zone_value: Optional[str] = None + + # Property indicating whether the schedule change should be accomplished by an ADD SCHEDULE + # vs an ALTER SCHEDULE. This is only True when modifying an existing schedule, rather than + # switching from manual refresh to scheduled or vice versa. 
is_altered: bool = False @property diff --git a/dbt/adapters/databricks/relation_configs/tblproperties.py b/dbt/adapters/databricks/relation_configs/tblproperties.py index b4ceefc57..d71c36b13 100644 --- a/dbt/adapters/databricks/relation_configs/tblproperties.py +++ b/dbt/adapters/databricks/relation_configs/tblproperties.py @@ -10,14 +10,24 @@ class TblPropertiesConfig(DatabricksComponentConfig): + """Component encapsulating the tblproperties of a relation.""" + tblproperties: Dict[str, str] + + # List of tblproperties that should be ignored when comparing configs. These are generally + # set by Databricks and are not user-configurable. ignore_list: List[str] = ["pipelines.pipelineId"] @property def requires_full_refresh(self) -> bool: + # TODO: This is only True for MVs since they don't currently allow ALTER VIEW to change the + # tblproperties. Should be False for tables and views, if and when they move to this + # approach. return True def __eq__(self, __value: Any) -> bool: + """Override equality check to ignore certain tblproperties.""" + if not isinstance(__value, TblPropertiesConfig): return False From a99093cf555e5134003e7dd8b264fc661e138dfd Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 18 Jan 2024 14:11:00 -0800 Subject: [PATCH 25/35] fix rebase --- dbt/include/databricks/macros/relations/drop.sql | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dbt/include/databricks/macros/relations/drop.sql b/dbt/include/databricks/macros/relations/drop.sql index c09220874..4fe95b114 100644 --- a/dbt/include/databricks/macros/relations/drop.sql +++ b/dbt/include/databricks/macros/relations/drop.sql @@ -15,9 +15,3 @@ {{ drop_table(relation) }} {%- endif -%} {% endmacro %} - -{% macro databricks__drop_relation(relation) -%} - {% call statement('drop_relation', auto_begin=False) -%} - {{ get_drop_sql(relation) }} - {%- endcall %} -{% endmacro %} \ No newline at end of file From d0026cc6846761d8c8e4452ee922f3cff505f307 Mon Sep 17 00:00:00 2001 From: Ben 
Cassell Date: Fri, 19 Jan 2024 14:23:25 -0800 Subject: [PATCH 26/35] this is lame, but need to ensure test consistency...investigating other options --- tests/functional/adapter/materialized_view_tests/fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 558e9075d..7ad708f3b 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -1,3 +1,4 @@ +import time from typing import Optional from dbt.adapters.base import BaseRelation @@ -5,6 +6,7 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + time.sleep(5) table_type = project.run_sql( f"select table_type from {relation.information_schema_only()}." f"`tables` where table_name = '{relation.identifier}'", From dc518130251b9b1a2e9f4c2f65cae3efea50c78d Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Fri, 19 Jan 2024 15:28:48 -0800 Subject: [PATCH 27/35] trying a different approach --- .../adapter/materialized_view_tests/fixtures.py | 1 - tox.ini | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 7ad708f3b..069c04726 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -6,7 +6,6 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: - time.sleep(5) table_type = project.run_sql( f"select table_type from {relation.information_schema_only()}." 
f"`tables` where table_name = '{relation.identifier}'", diff --git a/tox.ini b/tox.ini index 602702990..20c9e4826 100644 --- a/tox.ini +++ b/tox.ini @@ -37,8 +37,8 @@ deps = [testenv:integration-databricks-cluster] basepython = python3 commands = - /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_cluster -n4 tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' - /bin/bash -c '{envpython} -m pytest -v --profile databricks_cluster -n4 tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_cluster -n auto tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v --profile databricks_cluster -n auto tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' passenv = DBT_* PYTEST_ADDOPTS @@ -50,8 +50,8 @@ allowlist_externals = /bin/bash [testenv:integration-databricks-uc-cluster] basepython = python3 commands = - /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_uc_cluster -n4 tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' - /bin/bash -c '{envpython} -m pytest -v --profile databricks_uc_cluster -n4 tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_uc_cluster -n auto tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v --profile databricks_uc_cluster -n auto tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' passenv = DBT_* PYTEST_ADDOPTS @@ -63,8 +63,8 @@ allowlist_externals = /bin/bash [testenv:integration-databricks-uc-sql-endpoint] basepython = python3 commands = - /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_uc_sql_endpoint -n4 tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' - /bin/bash -c 
'{envpython} -m pytest -v --profile databricks_uc_sql_endpoint tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_uc_sql_endpoint -n auto tests/integration/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/bash -c '{envpython} -m pytest -v --profile databricks_uc_sql_endpoint -n auto --dist loadscope tests/functional/adapter/* {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' passenv = DBT_* PYTEST_ADDOPTS From d40d8fb3092f25b30bcb02495ac37782661d62b5 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Fri, 19 Jan 2024 16:11:05 -0800 Subject: [PATCH 28/35] bringing back the wait --- tests/functional/adapter/materialized_view_tests/fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 069c04726..7ad708f3b 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -6,6 +6,7 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + time.sleep(5) table_type = project.run_sql( f"select table_type from {relation.information_schema_only()}." 
f"`tables` where table_name = '{relation.identifier}'", From 6903bcf3dc3e67cb419cadcdb55769be2aa8a979 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 23 Jan 2024 16:18:22 -0800 Subject: [PATCH 29/35] make a test more reliable --- tests/functional/adapter/materialized_view_tests/test_basic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/functional/adapter/materialized_view_tests/test_basic.py b/tests/functional/adapter/materialized_view_tests/test_basic.py index e3604ba09..863efc6ed 100644 --- a/tests/functional/adapter/materialized_view_tests/test_basic.py +++ b/tests/functional/adapter/materialized_view_tests/test_basic.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic from dbt.adapters.base.relation import BaseRelation +from dbt.tests import util import pytest from tests.functional.adapter.materialized_view_tests import fixtures @@ -13,7 +14,7 @@ def insert_record(project, table: BaseRelation, record: Tuple[int, int]) -> None @staticmethod def refresh_materialized_view(project, materialized_view: BaseRelation) -> None: - project.run_sql(f"refresh materialized view {materialized_view}") + util.run_dbt(["run", "--models", str(materialized_view.identifier)]) @staticmethod def query_row_count(project, relation: BaseRelation) -> int: From 6c95405d6af897f5785438ac2f8c7570c4e04f3c Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 23 Jan 2024 16:53:15 -0800 Subject: [PATCH 30/35] drop mvs --- dbt/include/databricks/macros/materializations/table.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/databricks/macros/materializations/table.sql b/dbt/include/databricks/macros/materializations/table.sql index 2b9e90ecb..2e7e86323 100644 --- a/dbt/include/databricks/macros/materializations/table.sql +++ b/dbt/include/databricks/macros/materializations/table.sql @@ -14,7 +14,7 @@ -- setup: if the target relation already exists, drop 
it -- in case if the existing and future table is delta, we want to do a -- create or replace table instead of dropping, so we don't have the table unavailable - {% if old_relation and not (old_relation.is_delta and config.get('file_format', default='delta') == 'delta') -%} + {% if old_relation and (not (old_relation.is_delta and config.get('file_format', default='delta') == 'delta')) or (old_relation.is_materialized_view or old_relation.is_streaming_table) -%} {{ adapter.drop_relation(old_relation) }} {%- endif %} From e43703e21d82a780efbdd0fe7ab765251b200a75 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Tue, 23 Jan 2024 18:15:25 -0800 Subject: [PATCH 31/35] touch up regex behavior --- dbt/adapters/databricks/connections.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/databricks/connections.py b/dbt/adapters/databricks/connections.py index 94a59b7c9..707d12e22 100644 --- a/dbt/adapters/databricks/connections.py +++ b/dbt/adapters/databricks/connections.py @@ -81,6 +81,11 @@ logger = AdapterLogger("Databricks") +mv_refresh_regex = re.compile(r"refresh\s+materialized\s+view\s+([`\w.]+)", re.IGNORECASE) +st_refresh_regex = re.compile( + r"create\s+or\s+refresh\s+streaming\s+table\s+([`\w.]+)", re.IGNORECASE +) + class DbtCoreHandler(logging.Handler): def __init__(self, level: Union[str, int], dbt_logger: AdapterLogger): @@ -1494,9 +1499,9 @@ def _should_poll_refresh(sql: str) -> Tuple[bool, str]: # if the command was to refresh a materialized view we need to poll # the pipeline until the refresh is finished. 
name = "" - refresh_search = re.search(r"refresh\s+materialized\s+view\s+([`\w.]+)", sql) + refresh_search = mv_refresh_regex.search(sql) if not refresh_search: - refresh_search = re.search(r"create\s+or\s+refresh\s+streaming\s+table\s+([`\w.]+)", sql) + refresh_search = st_refresh_regex.search(sql) if refresh_search: name = refresh_search.group(1).replace("`", "") From 63f02d1d77afa30b22ec8c04c418957d6f095e09 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Wed, 24 Jan 2024 12:45:50 -0800 Subject: [PATCH 32/35] Changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f34986522..25021014a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## dbt-databricks 1.7.5 (TBD) + +### Features + +- Support `on_config_change` for materialized views, expand the supported config options ([536](https://github.com/databricks/dbt-databricks/pull/536)) + ## dbt-databricks 1.7.4 (Jan 24, 2024) ### Features From 84e63d7e87d4d79f98b090b75c5e00e97b06851f Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 25 Jan 2024 10:11:11 -0800 Subject: [PATCH 33/35] more sleep --- tests/functional/adapter/materialized_view_tests/fixtures.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 7ad708f3b..b80e10f46 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -6,7 +6,8 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: - time.sleep(5) + # This is a hack to wait for UC to sync metadata + time.sleep(10) table_type = project.run_sql( f"select table_type from {relation.information_schema_only()}." 
f"`tables` where table_name = '{relation.identifier}'", From 145ff5021897f8706dd1242f846dea2b9e49fb4b Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 25 Jan 2024 10:44:59 -0800 Subject: [PATCH 34/35] tests --- .../materialized_view_tests/fixtures.py | 2 -- .../materialized_view_tests/test_basic.py | 18 +++++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index b80e10f46..069c04726 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -6,8 +6,6 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: - # This is a hack to wait for UC to sync metadata - time.sleep(10) table_type = project.run_sql( f"select table_type from {relation.information_schema_only()}." f"`tables` where table_name = '{relation.identifier}'", diff --git a/tests/functional/adapter/materialized_view_tests/test_basic.py b/tests/functional/adapter/materialized_view_tests/test_basic.py index 863efc6ed..210fb5c9b 100644 --- a/tests/functional/adapter/materialized_view_tests/test_basic.py +++ b/tests/functional/adapter/materialized_view_tests/test_basic.py @@ -27,4 +27,20 @@ def query_relation_type(project, relation: BaseRelation) -> Optional[str]: @pytest.mark.skip_profile("databricks_cluster", "databricks_uc_cluster") class TestMaterializedViews(TestMaterializedViewsMixin, MaterializedViewBasic): - pass + def test_table_replaces_materialized_view(self, project, my_materialized_view): + util.run_dbt(["run", "--models", my_materialized_view.identifier]) + assert self.query_relation_type(project, my_materialized_view) == "materialized_view" + + self.swap_materialized_view_to_table(project, my_materialized_view) + + util.run_dbt(["run", "--models", my_materialized_view.identifier]) + # assert self.query_relation_type(project, 
my_materialized_view) == "table" + + def test_view_replaces_materialized_view(self, project, my_materialized_view): + util.run_dbt(["run", "--models", my_materialized_view.identifier]) + assert self.query_relation_type(project, my_materialized_view) == "materialized_view" + + self.swap_materialized_view_to_view(project, my_materialized_view) + + util.run_dbt(["run", "--models", my_materialized_view.identifier]) + # assert self.query_relation_type(project, my_materialized_view) == "view" From 057fe6c3ecc3bbec1c335081d0fbf9155954b913 Mon Sep 17 00:00:00 2001 From: Ben Cassell Date: Thu, 25 Jan 2024 11:27:53 -0800 Subject: [PATCH 35/35] fix linting --- tests/functional/adapter/materialized_view_tests/fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functional/adapter/materialized_view_tests/fixtures.py b/tests/functional/adapter/materialized_view_tests/fixtures.py index 069c04726..558e9075d 100644 --- a/tests/functional/adapter/materialized_view_tests/fixtures.py +++ b/tests/functional/adapter/materialized_view_tests/fixtures.py @@ -1,4 +1,3 @@ -import time from typing import Optional from dbt.adapters.base import BaseRelation