dev: enable SIM rules that have autofix available
anshbansal committed Feb 28, 2025
1 parent b00ef7a commit 8f984aa
Showing 52 changed files with 169 additions and 184 deletions.
16 changes: 16 additions & 0 deletions metadata-ingestion/pyproject.toml
@@ -30,6 +30,7 @@ extend-select = [
"I", # isort
"TID", # flake8-tidy-imports
"RUF100", # unused-noqa
+ "SIM", # flake8-simplify
]
extend-ignore = [
"E501", # Handled by formatter
@@ -40,6 +41,21 @@ extend-ignore = [
"B008", # Allow function call in argument defaults
"RUF012", # mutable-class-default; incompatible with pydantic
"RUF015", # unnecessary-iterable-allocation-for-first-element
+
+ # Can be enabled later if someone wants to fix existing cases or an auto-fix becomes available
+ "SIM101", # Multiple isinstance calls for {name}, merge into a single call
+ "SIM102", # Use a single `if` statement instead of nested `if` statements
+ "SIM103", # Return the condition directly
+ "SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
+ "SIM108", # Use ternary operator {contents} instead of if-else-block
+ "SIM110", # Use `return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)` instead of `for` loop
+ "SIM113", # Use enumerate() for index variable {index} in for loop
+ "SIM115", # Use a context manager for opening files
+ "SIM116", # Use a dictionary instead of consecutive `if` statements
+ "SIM117", # Use a single with statement with multiple contexts instead of nested with statements
+ "SIM118", # Use key {operator} dict instead of key {operator} dict.keys()
+ "SIM210", # Use `bool(...)` instead of `True if ... else False`
+ "SIM401", # Use `sample_data.get(schema_field.fieldPath, [])` instead of an `if` block
]

[tool.ruff.lint.mccabe]
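For context, a minimal before/after sketch of the kind of mechanical rewrite the newly enabled autofixes apply across the rest of this commit (illustrative only; the names below are placeholders, not taken from the changed files):

# Hypothetical example; names are placeholders.
props = {"location": "s3://bucket/path"}

# Before: membership test via .keys() and an explicit None default
if "location" in props.keys():
    value = props.get("location", None)

# After: the simplified, autofixable form
if "location" in props:
    value = props.get("location")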
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/configuration/git.py
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
- if repo.startswith("github.com/"):
- repo = f"https://{repo}"
- elif repo.startswith("gitlab.com"):
+ if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
repo = f"https://{repo}"
elif repo.count("/") == 1:
repo = f"https://github.com/{repo}"
@@ -281,7 +281,7 @@ def get_columns_to_classify(
),
values=(
sample_data[schema_field.fieldPath]
- if schema_field.fieldPath in sample_data.keys()
+ if schema_field.fieldPath in sample_data
else []
),
)
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -330,7 +330,7 @@ def get_aspect_v2(
aspect_type_name: Optional[str] = None,
version: int = 0,
) -> Optional[Aspect]:
- assert aspect_type.ASPECT_NAME == aspect
+ assert aspect == aspect_type.ASPECT_NAME
return self.get_aspect(
entity_urn=entity_urn,
aspect_type=aspect_type,
@@ -163,12 +163,7 @@ def _convert_sets_to_lists(obj: Any) -> Any:
key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
for key, value in obj.items()
}
- elif isinstance(obj, list):
- return [
- DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
- for element in obj
- ]
- elif isinstance(obj, set):
+ elif isinstance(obj, list) or isinstance(obj, set):
return [
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
for element in obj
@@ -145,9 +145,7 @@ def check_path_specs_and_infer_platform(

@pydantic.validator("platform", always=True)
def platform_not_empty(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ def from_bigquery_table(cls, table: BigqueryTableIdentifier) -> "BigQueryTableRe
@classmethod
def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
for key in ["projectId", "datasetId", "tableId"]:
- if key not in spec.keys():
+ if key not in spec:
raise ValueError(f"invalid BigQuery table reference dict: {spec}")

return cls(
@@ -344,7 +344,7 @@ def get_tables_for_dataset(
with_partitions: bool = False,
) -> Iterator[BigqueryTable]:
with PerfTimer() as current_timer:
- filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+ filter_clause: str = ", ".join(f"'{table}'" for table in tables)

if with_partitions:
query_template = BigqueryQuery.tables_for_dataset
@@ -314,7 +314,7 @@ def get_resource_description_work_unit(
"datajob": EditableDataJobPropertiesClass,
"dataflow": EditableDataFlowPropertiesClass,
"notebook": EditableNotebookPropertiesClass,
- }.get(entityType, None)
+ }.get(entityType)

if not entityClass:
raise ValueError(
@@ -1033,7 +1033,7 @@ def add_node_to_cll_list(dbt_name: str) -> None:
cll_nodes.add(dbt_name)
schema_nodes.add(dbt_name)

- for dbt_name in all_nodes_map.keys():
+ for dbt_name in all_nodes_map:
if self._is_allowed_node(dbt_name):
add_node_to_cll_list(dbt_name)

@@ -498,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
# Delete empty dataflows if needed
if self.config.delete_empty_data_flows:
deleted_data_flows: int = 0
- for key in dataFlows.keys():
+ for key in dataFlows:
if not dataJobs.get(key) or len(dataJobs[key]) == 0:
logger.info(
f"Deleting dataflow {key} because there are not datajobs"
25 changes: 11 additions & 14 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
).select_from(self._table)
)
return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == BIGQUERY:
- element_values = self.engine.execute(
- sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
- self._table
- )
- )
- return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == SNOWFLAKE:
+ elif (
+ self.engine.dialect.name.lower() == BIGQUERY
+ or self.engine.dialect.name.lower() == SNOWFLAKE
+ ):
element_values = self.engine.execute(
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
@@ -381,13 +377,14 @@ def _get_columns_to_profile(self) -> List[str]:
col = col_dict["name"]
self.column_types[col] = str(col_dict["type"])
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
- if not self.config._allow_deny_patterns.allowed(
- f"{self.dataset_name}.{col}"
+ if (
+ not self.config._allow_deny_patterns.allowed(
+ f"{self.dataset_name}.{col}"
+ )
+ or not self.config.profile_nested_fields
+ and "." in col
):
ignored_columns_by_pattern.append(col)
- # We try to ignore nested columns as well
- elif not self.config.profile_nested_fields and "." in col:
- ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -1408,7 +1405,7 @@ def _get_ge_dataset(
},
)

- if platform == BIGQUERY or platform == DATABRICKS:
+ if platform in (BIGQUERY, DATABRICKS):
# This is done as GE makes the name as DATASET.TABLE
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
name_parts = pretty_name.split(".")
@@ -568,9 +568,7 @@ def _filter_okta_user(self, okta_user: User) -> bool:
if (
self.config.include_deprovisioned_users is False
and okta_user.status == UserStatus.DEPROVISIONED
- ):
- return False
- elif (
+ ) or (
self.config.include_suspended_users is False
and okta_user.status == UserStatus.SUSPENDED
):
@@ -447,13 +447,10 @@ def get_parser(
) -> DebeziumParser:
connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "MySqlConnector":
+ if (
+ connector_class == "io.debezium.connector.mysql.MySqlConnector"
+ or connector_class == "MySqlConnector"
+ ):
parser = self.DebeziumParser(
source_platform="mysql",
server_name=self.get_server_name(connector_manifest),
@@ -292,7 +292,7 @@ def get_properties(self, record: dict) -> str:
return record["properties"]

def get_relationships(self, record: dict) -> dict:
- return record.get("relationships", None)
+ return record.get("relationships")

def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
Try to determine if example data is defined for the endpoint, and return it
"""
data = {}
- if "content" in base_res.keys():
+ if "content" in base_res:
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
)
elif "text/csv" in res_cont.keys():
data = res_cont["text/csv"]["schema"]
- elif "examples" in base_res.keys():
+ elif "examples" in base_res:
data = base_res["examples"]["application/json"]

return data
@@ -890,9 +890,7 @@ def to_datahub_users(
set(user_rights) & set(self.__config.ownership.owner_criteria)
)
> 0
- ):
- user_mcps.extend(self.to_datahub_user(user))
- elif self.__config.ownership.owner_criteria is None:
+ ) or self.__config.ownership.owner_criteria is None:
user_mcps.extend(self.to_datahub_user(user))
else:
continue
@@ -196,7 +196,7 @@ def get_all_reports(self) -> List[Any]:
}

reports: List[Any] = []
- for report_type in report_types_mapping.keys():
+ for report_type in report_types_mapping:
report_get_endpoint: str = API_ENDPOINTS[report_type]
# Replace place holders
report_get_endpoint_http = report_get_endpoint.format(
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
@@ -155,9 +155,7 @@ def check_path_specs_and_infer_platform(

@pydantic.validator("platform", always=True)
def platform_valid(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -834,7 +834,7 @@ def get_dir_to_process(
min=min,
)
folders.extend(folders_list)
- if not path_spec.traversal_method == FolderTraversalMethod.ALL:
+ if path_spec.traversal_method != FolderTraversalMethod.ALL:
return folders
if folders:
return folders
@@ -615,7 +615,7 @@ def _get_field_description(self, field: dict, customField: dict) -> str:
prefix = "\\" if text.startswith("#") else ""
desc += f"\n\n{prefix}{text}"

- text = field.get("InlineHelpText", None)
+ text = field.get("InlineHelpText")
if text:
prefix = "\\" if text.startswith("#") else ""
desc += f"\n\n{prefix}{text}"
@@ -149,7 +149,7 @@ def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> Non

extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}

- for field_path in schema.keys():
+ for field_path in schema:
field_types = schema[field_path]["types"]
field_type: Union[str, type] = "mixed"

@@ -125,7 +125,7 @@ def validate_account_id(cls, account_id: str) -> str:

@pydantic.validator("authentication_type", always=True)
def authenticator_type_is_valid(cls, v, values):
- if v not in _VALID_AUTH_TYPES.keys():
+ if v not in _VALID_AUTH_TYPES:
raise ValueError(
f"unsupported authenticator type '{v}' was provided,"
f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -439,7 +439,7 @@ class SnowflakePrivilege:
failure_reason=failure_message,
)

- if c in _report.keys():
+ if c in _report:
continue

# If some capabilities are missing, then mark them as not capable
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
@@ -396,7 +396,7 @@ def get_table_properties(
metadata.table_type if metadata.table_type else ""
)

- location: Optional[str] = custom_properties.get("location", None)
+ location: Optional[str] = custom_properties.get("location")
if location is not None:
if location.startswith("s3://"):
location = make_s3_urn(location, self.config.env)
@@ -538,7 +538,7 @@ def get_schema_fields_for_column(
column_name=column["name"],
column_type=column["type"],
inspector=inspector,
- description=column.get("comment", None),
+ description=column.get("comment"),
nullable=column.get("nullable", True),
is_part_of_key=(
True
@@ -204,7 +204,7 @@ def get_column_type(
"""

TypeClass: Optional[Type] = None
- for sql_type in _field_type_mapping.keys():
+ for sql_type in _field_type_mapping:
if isinstance(column_type, sql_type):
TypeClass = _field_type_mapping[sql_type]
break
@@ -973,7 +973,7 @@ def get_schema_fields_for_column(
inspector=inspector,
)
),
- description=column.get("comment", None),
+ description=column.get("comment"),
nullable=column["nullable"],
recursive=False,
globalTags=gtc,
@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
if match:
modified_type_base = match.group(1) # Extract the base type
- return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+ return SNOWFLAKE_TYPES_MAP.get(modified_type_base)

# Fallback for types without precision/scale
- return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+ return SNOWFLAKE_TYPES_MAP.get(type_string)


# see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/sql/trino.py
@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
if col_value is not None:
properties[col_name] = col_value

- return {"text": properties.get("comment", None), "properties": properties}
+ return {"text": properties.get("comment"), "properties": properties}
else:
return self.get_table_comment_default(connection, table_name, schema)
except Exception:
@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):


def _parse_basic_datatype(s):
- for sql_type in _all_atomic_types.keys():
+ for sql_type in _all_atomic_types:
if isinstance(s, sql_type):
return {
"type": _all_atomic_types[sql_type],
@@ -1911,11 +1911,7 @@ def get_upstream_columns_of_fields_in_datasource(
if upstream_col.get(c.TABLE)
else None
)
- if (
- name
- and upstream_table_id
- and upstream_table_id in table_id_to_urn.keys()
- ):
+ if name and upstream_table_id and upstream_table_id in table_id_to_urn:
parent_dataset_urn = table_id_to_urn[upstream_table_id]
if (
self.is_snowflake_urn(parent_dataset_urn)