dev: enable ruff rule #12749

Merged: 3 commits, Feb 28, 2025
Changes from all commits
18 changes: 16 additions & 2 deletions metadata-ingestion/pyproject.toml
@@ -30,6 +30,7 @@ extend-select = [
"I", # isort
"TID", # flake8-tidy-imports
"RUF100", # unused-noqa
"SIM", # flake8-simplify
]
extend-ignore = [
"E501", # Handled by formatter
@@ -40,8 +41,21 @@ extend-ignore = [
"B008", # Allow function call in argument defaults
"RUF012", # mutable-class-default; incompatible with pydantic
"RUF015", # unnecessary-iterable-allocation-for-first-element
# TODO: Enable these later
"B006", # Mutable args

# Can be enabled later if someone wants to fix existing cases or an auto-fix becomes available
"SIM101", # Multiple isinstance calls for {name}, merge into a single call
"SIM102", # Use a single `if` statement instead of nested `if` statements
"SIM103", # Return the condition directly
"SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
"SIM108", # Use ternary operator {contents} instead of if-else-block
"SIM110", # Use `return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)` instead of `for` loop
"SIM113", # Use enumerate() for index variable {index} in for loop
"SIM115", # Use a context manager for opening files
"SIM116", # Use a dictionary instead of consecutive `if` statements
"SIM117", # Use a single with statement with multiple contexts instead of nested with statements
"SIM118", # Use key {operator} dict instead of key {operator} dict.keys()
"SIM210", # Use `bool(...)` instead of `True if ... else False`
"SIM401", # Use `sample_data.get(schema_field.fieldPath, [])` instead of an `if` block
]

[tool.ruff.lint.mccabe]
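For context on what the newly enabled rule family checks, here is a minimal illustrative sketch (not code from this repository) of two of the codes listed above: SIM118, which flags membership tests against `dict.keys()`, and SIM105, which flags `try`/`except`/`pass` blocks. Most of the hunks below are exactly this kind of mechanical rewrite.

```python
import contextlib

# Illustrative data only; these names are not from the DataHub codebase.
config = {"platform": "bigquery"}

# SIM118: test membership against the dict itself, not dict.keys()
if "platform" in config.keys():  # flagged
    pass
if "platform" in config:  # preferred
    pass

# SIM105: an empty try/except/pass block is clearer as contextlib.suppress
try:
    port = int(config.get("port", ""))
except ValueError:
    pass
with contextlib.suppress(ValueError):  # preferred
    port = int(config.get("port", ""))
```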
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/configuration/git.py
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
- if repo.startswith("github.com/"):
- repo = f"https://{repo}"
- elif repo.startswith("gitlab.com"):
+ if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
repo = f"https://{repo}"
elif repo.count("/") == 1:
repo = f"https://github.com/{repo}"
@@ -281,7 +281,7 @@ def get_columns_to_classify(
),
values=(
sample_data[schema_field.fieldPath]
- if schema_field.fieldPath in sample_data.keys()
+ if schema_field.fieldPath in sample_data
else []
),
)
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -330,7 +330,7 @@ def get_aspect_v2(
aspect_type_name: Optional[str] = None,
version: int = 0,
) -> Optional[Aspect]:
- assert aspect_type.ASPECT_NAME == aspect
+ assert aspect == aspect_type.ASPECT_NAME
return self.get_aspect(
entity_urn=entity_urn,
aspect_type=aspect_type,
@@ -163,12 +163,7 @@ def _convert_sets_to_lists(obj: Any) -> Any:
key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
for key, value in obj.items()
}
- elif isinstance(obj, list):
- return [
- DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
- for element in obj
- ]
- elif isinstance(obj, set):
+ elif isinstance(obj, list) or isinstance(obj, set):
return [
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
for element in obj
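As an aside (not a change made in this PR): the merged `isinstance(obj, list) or isinstance(obj, set)` check in `_convert_sets_to_lists` above could equivalently be written as a single call with a tuple of types, which is the form SIM101 (currently suppressed in the config) would suggest. A quick check of the equivalence:

```python
# The two forms agree for any value, including non-matching types.
for obj in ([1, 2], {1, 2}, "neither"):
    assert (isinstance(obj, list) or isinstance(obj, set)) == isinstance(obj, (list, set))
```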
6 changes: 2 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/source/abs/config.py
@@ -144,10 +144,8 @@ def check_path_specs_and_infer_platform(
return path_specs

@pydantic.validator("platform", always=True)
- def platform_not_empty(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ def platform_not_empty(cls, platform: Any, values: dict) -> str:
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ def from_bigquery_table(cls, table: BigqueryTableIdentifier) -> "BigQueryTableRe
@classmethod
def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
for key in ["projectId", "datasetId", "tableId"]:
- if key not in spec.keys():
+ if key not in spec:
raise ValueError(f"invalid BigQuery table reference dict: {spec}")

return cls(
@@ -344,7 +344,7 @@ def get_tables_for_dataset(
with_partitions: bool = False,
) -> Iterator[BigqueryTable]:
with PerfTimer() as current_timer:
- filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+ filter_clause: str = ", ".join(f"'{table}'" for table in tables)

if with_partitions:
query_template = BigqueryQuery.tables_for_dataset
@@ -314,7 +314,7 @@ def get_resource_description_work_unit(
"datajob": EditableDataJobPropertiesClass,
"dataflow": EditableDataFlowPropertiesClass,
"notebook": EditableNotebookPropertiesClass,
- }.get(entityType, None)
+ }.get(entityType)

if not entityClass:
raise ValueError(
@@ -1033,7 +1033,7 @@ def add_node_to_cll_list(dbt_name: str) -> None:
cll_nodes.add(dbt_name)
schema_nodes.add(dbt_name)

- for dbt_name in all_nodes_map.keys():
+ for dbt_name in all_nodes_map:
if self._is_allowed_node(dbt_name):
add_node_to_cll_list(dbt_name)

@@ -498,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
# Delete empty dataflows if needed
if self.config.delete_empty_data_flows:
deleted_data_flows: int = 0
- for key in dataFlows.keys():
+ for key in dataFlows:
if not dataJobs.get(key) or len(dataJobs[key]) == 0:
logger.info(
f"Deleting dataflow {key} because there are not datajobs"
25 changes: 11 additions & 14 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
).select_from(self._table)
)
return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == BIGQUERY:
- element_values = self.engine.execute(
- sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
- self._table
- )
- )
- return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == SNOWFLAKE:
+ elif (
+ self.engine.dialect.name.lower() == BIGQUERY
+ or self.engine.dialect.name.lower() == SNOWFLAKE
+ ):
element_values = self.engine.execute(
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
@@ -381,13 +377,14 @@ def _get_columns_to_profile(self) -> List[str]:
col = col_dict["name"]
self.column_types[col] = str(col_dict["type"])
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
- if not self.config._allow_deny_patterns.allowed(
- f"{self.dataset_name}.{col}"
+ if (
+ not self.config._allow_deny_patterns.allowed(
+ f"{self.dataset_name}.{col}"
+ )
+ or not self.config.profile_nested_fields
+ and "." in col
):
ignored_columns_by_pattern.append(col)
- # We try to ignore nested columns as well
- elif not self.config.profile_nested_fields and "." in col:
- ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -1408,7 +1405,7 @@ def _get_ge_dataset(
},
)

- if platform == BIGQUERY or platform == DATABRICKS:
+ if platform in (BIGQUERY, DATABRICKS):
# This is done as GE makes the name as DATASET.TABLE
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
name_parts = pretty_name.split(".")
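One detail worth noting about the `_get_columns_to_profile` hunk above: the merged condition relies on Python's operator precedence, where `and` binds more tightly than `or`, so the two original branches keep their meaning. A small self-contained check of that grouping, with illustrative boolean stand-ins for the real pattern and config checks:

```python
from itertools import product

for allowed, profile_nested_fields, has_dot in product([True, False], repeat=3):
    merged = not allowed or not profile_nested_fields and has_dot
    # `and` binds tighter than `or`, so the merged form groups as below
    grouped = (not allowed) or ((not profile_nested_fields) and has_dot)
    assert merged == grouped
```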
@@ -568,9 +568,7 @@ def _filter_okta_user(self, okta_user: User) -> bool:
if (
self.config.include_deprovisioned_users is False
and okta_user.status == UserStatus.DEPROVISIONED
- ):
- return False
- elif (
+ ) or (
self.config.include_suspended_users is False
and okta_user.status == UserStatus.SUSPENDED
):
@@ -447,13 +447,10 @@ def get_parser(
) -> DebeziumParser:
connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "MySqlConnector":
+ if (
+ connector_class == "io.debezium.connector.mysql.MySqlConnector"
+ or connector_class == "MySqlConnector"
+ ):
parser = self.DebeziumParser(
source_platform="mysql",
server_name=self.get_server_name(connector_manifest),
@@ -205,8 +205,9 @@ def lookml_model_explore(self, model: str, explore_name: str) -> LookmlModelExpl
def folder_ancestors(
self,
folder_id: str,
- fields: Union[str, List[str]] = ["id", "name", "parent_id"],
+ fields: Optional[Union[str, List[str]]] = None,
Contributor: While in practice this is not a breaking change, I would prefer to keep the default value in the signature rather than inside the implementation, because knowing the default value is relevant for the caller.

Collaborator (author): Mutable default arguments can lead to bugs (https://docs.python-guide.org/writing/gotchas/); this is a well-documented problem.

) -> Sequence[Folder]:
fields = fields or ["id", "name", "parent_id"]
self.client_stats.folder_calls += 1
try:
return self.client.folder_ancestors(
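The review thread above is about the classic mutable-default-argument gotcha (the same issue tracked by the suppressed B006 rule): a list or dict default is evaluated once, at function definition time, and shared across calls, so mutating it leaks state between calls. A minimal, self-contained sketch of the problem and of the `Optional[...] = None` pattern this PR applies; the function names are made up for illustration:

```python
from typing import List, Optional

def append_shared(item: str, items: List[str] = []) -> List[str]:
    # The default list is created once and reused by every call that omits `items`.
    items.append(item)
    return items

def append_fresh(item: str, items: Optional[List[str]] = None) -> List[str]:
    # A new list is created on each call when the caller passes nothing.
    items = items or []
    items.append(item)
    return items

print(append_shared("a"))  # ['a']
print(append_shared("b"))  # ['a', 'b']  <- state leaked from the previous call
print(append_fresh("a"))   # ['a']
print(append_fresh("b"))   # ['b']
```

One caveat of the `x = x or default` idiom: an explicitly passed empty list or dict is also replaced by the default. That is harmless for these call sites, but `if x is None: x = default` avoids the edge case entirely.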
@@ -464,9 +464,10 @@ def process_lookml_template_language(
source_config: LookMLSourceConfig,
view_lkml_file_dict: dict,
reporter: LookMLSourceReport,
- manifest_constants: Dict[str, "LookerConstant"] = {},
+ manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
resolve_constants: bool = False,
) -> None:
manifest_constants = manifest_constants or {}
if "views" not in view_lkml_file_dict:
return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
path: Union[str, pathlib.Path],
source_config: LookMLSourceConfig,
reporter: LookMLSourceReport,
- manifest_constants: Dict[str, "LookerConstant"] = {},
+ manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
resolve_constants: bool = False,
) -> dict:
manifest_constants = manifest_constants or {}
parsed = load_lkml(path)

process_lookml_template_language(
@@ -1006,8 +1006,9 @@ def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit]
def report_skipped_unreachable_views(
self,
viewfile_loader: LookerViewFileLoader,
- processed_view_map: Dict[str, Set[str]] = {},
+ processed_view_map: Optional[Dict[str, Set[str]]] = None,
) -> None:
processed_view_map = processed_view_map or {}
view_files: Dict[str, List[pathlib.Path]] = {}
for project, folder_path in self.base_projects_folder.items():
folder = pathlib.Path(folder_path)
@@ -292,7 +292,7 @@ def get_properties(self, record: dict) -> str:
return record["properties"]

def get_relationships(self, record: dict) -> dict:
- return record.get("relationships", None)
+ return record.get("relationships", {})

def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
9 changes: 6 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/nifi.py
@@ -1234,11 +1234,14 @@ def construct_job_workunits(
job_type: str,
description: Optional[str],
job_properties: Optional[Dict[str, str]] = None,
- inlets: List[str] = [],
- outlets: List[str] = [],
- inputJobs: List[str] = [],
+ inlets: Optional[List[str]] = None,
+ outlets: Optional[List[str]] = None,
+ inputJobs: Optional[List[str]] = None,
status: Optional[str] = None,
) -> Iterable[MetadataWorkUnit]:
inlets = inlets or []
outlets = outlets or []
inputJobs = inputJobs or []
logger.debug(f"Begining construction of job workunit for {job_urn}")
if job_properties:
job_properties = {k: v for k, v in job_properties.items() if v is not None}
@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
Try to determine if example data is defined for the endpoint, and return it
"""
data = {}
if "content" in base_res.keys():
if "content" in base_res:
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
)
elif "text/csv" in res_cont.keys():
data = res_cont["text/csv"]["schema"]
elif "examples" in base_res.keys():
elif "examples" in base_res:
data = base_res["examples"]["application/json"]

return data
@@ -2,7 +2,7 @@
import importlib.resources as pkg_resource
import logging
import os
- from typing import Dict, List
+ from typing import Dict, List, Optional

import lark
from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
platform_instance_resolver: AbstractDataPlatformInstanceResolver,
ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
- parameters: Dict[str, str] = {},
+ parameters: Optional[Dict[str, str]] = None,
) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
parameters = parameters or {}
if table.expression is None:
logger.debug(f"There is no M-Query expression in table {table.full_name}")
return []
@@ -70,13 +70,14 @@ def internal(node: Union[Tree, Token]) -> Optional[Tree]:
return expression_tree


- def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
+ def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
"""
:param tree: Tree to traverse
:param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
using the values in 'parameters'.
:return: List of leaf token data
"""
parameters = parameters or {}
values: List[str] = []

def internal(node: Union[Tree, Token]) -> None:
@@ -890,9 +890,7 @@ def to_datahub_users(
set(user_rights) & set(self.__config.ownership.owner_criteria)
)
> 0
- ):
- user_mcps.extend(self.to_datahub_user(user))
- elif self.__config.ownership.owner_criteria is None:
+ ) or self.__config.ownership.owner_criteria is None:
user_mcps.extend(self.to_datahub_user(user))
else:
continue
@@ -380,8 +380,9 @@ def get_tiles(self, workspace: Workspace, dashboard: Dashboard) -> List[Tile]:
def itr_pages(
self,
endpoint: str,
- parameter_override: Dict = {},
+ parameter_override: Optional[Dict] = None,
) -> Iterator[List[Dict]]:
parameter_override = parameter_override or {}
params: dict = {
"$skip": 0,
"$top": self.TOP,
@@ -196,7 +196,7 @@ def get_all_reports(self) -> List[Any]:
}

reports: List[Any] = []
- for report_type in report_types_mapping.keys():
+ for report_type in report_types_mapping:
report_get_endpoint: str = API_ENDPOINTS[report_type]
# Replace place holders
report_get_endpoint_http = report_get_endpoint.format(
@@ -17,8 +17,9 @@ def __init__(self, tenant_hostname: str, api_key: str, app_id: str) -> None:
self.handle = [-1]

def _build_websocket_request_dict(
- self, method: str, params: Union[Dict, List] = {}
+ self, method: str, params: Optional[Union[Dict, List]] = None
) -> Dict:
params = params or {}
return {
"jsonrpc": "2.0",
"id": self.request_id,
@@ -37,11 +38,12 @@ def _send_request(self, request: Dict) -> Dict:
return {}

def websocket_send_request(
- self, method: str, params: Union[Dict, List] = {}
+ self, method: str, params: Optional[Union[Dict, List]] = None
) -> Dict:
"""
Method to send request to websocket
"""
params = params or {}
self.request_id += 1
request = self._build_websocket_request_dict(method, params)
response = self._send_request(request=request)