dev: enable SIM rules that have autofix available
anshbansal committed Feb 28, 2025
1 parent b00ef7a commit 8f984aa
Showing 52 changed files with 169 additions and 184 deletions.
16 changes: 16 additions & 0 deletions metadata-ingestion/pyproject.toml
@@ -30,6 +30,7 @@ extend-select = [
"I", # isort
"TID", # flake8-tidy-imports
"RUF100", # unused-noqa
+ "SIM", # flake8-simplify
]
extend-ignore = [
"E501", # Handled by formatter
@@ -40,6 +41,21 @@ extend-ignore = [
"B008", # Allow function call in argument defaults
"RUF012", # mutable-class-default; incompatible with pydantic
"RUF015", # unnecessary-iterable-allocation-for-first-element
+
+ # Can be enabled later if someone wants to fix existing cases or an auto-fix becomes available
+ "SIM101", # Multiple isinstance calls for {name}, merge into a single call
+ "SIM102", # Use a single `if` statement instead of nested `if` statements
+ "SIM103", # Return the condition directly
+ "SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
+ "SIM108", # Use ternary operator {contents} instead of if-else-block
+ "SIM110", # Use `return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)` instead of `for` loop
+ "SIM113", # Use enumerate() for index variable {index} in for loop
+ "SIM115", # Use a context manager for opening files
+ "SIM116", # Use a dictionary instead of consecutive `if` statements
+ "SIM117", # Use a single with statement with multiple contexts instead of nested with statements
+ "SIM118", # Use key {operator} dict instead of key {operator} dict.keys()
+ "SIM210", # Use `bool(...)` instead of `True if ... else False`
+ "SIM401", # Use `sample_data.get(schema_field.fieldPath, [])` instead of an `if` block
]

[tool.ruff.lint.mccabe]
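For context, a minimal before/after sketch of the kind of mechanical rewrite the newly enabled autofixes apply across the rest of this commit (illustrative only; the names below are placeholders, not taken from the changed files):

# Hypothetical example; names are placeholders.
props = {"location": "s3://bucket/path"}

# Before: membership test via .keys() and an explicit None default
if "location" in props.keys():
    value = props.get("location", None)

# After: the simplified, autofixable form
if "location" in props:
    value = props.get("location")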
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/configuration/git.py
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
- if repo.startswith("github.com/"):
- repo = f"https://{repo}"
- elif repo.startswith("gitlab.com"):
+ if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
repo = f"https://{repo}"
elif repo.count("/") == 1:
repo = f"https://github.com/{repo}"
@@ -281,7 +281,7 @@ def get_columns_to_classify(
),
values=(
sample_data[schema_field.fieldPath]
- if schema_field.fieldPath in sample_data.keys()
+ if schema_field.fieldPath in sample_data
else []
),
)
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -330,7 +330,7 @@ def get_aspect_v2(
aspect_type_name: Optional[str] = None,
version: int = 0,
) -> Optional[Aspect]:
- assert aspect_type.ASPECT_NAME == aspect
+ assert aspect == aspect_type.ASPECT_NAME
return self.get_aspect(
entity_urn=entity_urn,
aspect_type=aspect_type,
@@ -163,12 +163,7 @@ def _convert_sets_to_lists(obj: Any) -> Any:
key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
for key, value in obj.items()
}
- elif isinstance(obj, list):
- return [
- DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
- for element in obj
- ]
- elif isinstance(obj, set):
+ elif isinstance(obj, list) or isinstance(obj, set):
return [
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
for element in obj
@@ -145,9 +145,7 @@ def check_path_specs_and_infer_platform(

@pydantic.validator("platform", always=True)
def platform_not_empty(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ def from_bigquery_table(cls, table: BigqueryTableIdentifier) -> "BigQueryTableRe
@classmethod
def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
for key in ["projectId", "datasetId", "tableId"]:
- if key not in spec.keys():
+ if key not in spec:
raise ValueError(f"invalid BigQuery table reference dict: {spec}")

return cls(
@@ -344,7 +344,7 @@ def get_tables_for_dataset(
with_partitions: bool = False,
) -> Iterator[BigqueryTable]:
with PerfTimer() as current_timer:
- filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+ filter_clause: str = ", ".join(f"'{table}'" for table in tables)

if with_partitions:
query_template = BigqueryQuery.tables_for_dataset
@@ -314,7 +314,7 @@ def get_resource_description_work_unit(
"datajob": EditableDataJobPropertiesClass,
"dataflow": EditableDataFlowPropertiesClass,
"notebook": EditableNotebookPropertiesClass,
- }.get(entityType, None)
+ }.get(entityType)

if not entityClass:
raise ValueError(
@@ -1033,7 +1033,7 @@ def add_node_to_cll_list(dbt_name: str) -> None:
cll_nodes.add(dbt_name)
schema_nodes.add(dbt_name)

- for dbt_name in all_nodes_map.keys():
+ for dbt_name in all_nodes_map:
if self._is_allowed_node(dbt_name):
add_node_to_cll_list(dbt_name)

@@ -498,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
# Delete empty dataflows if needed
if self.config.delete_empty_data_flows:
deleted_data_flows: int = 0
- for key in dataFlows.keys():
+ for key in dataFlows:
if not dataJobs.get(key) or len(dataJobs[key]) == 0:
logger.info(
f"Deleting dataflow {key} because there are not datajobs"
25 changes: 11 additions & 14 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
).select_from(self._table)
)
return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == BIGQUERY:
- element_values = self.engine.execute(
- sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
- self._table
- )
- )
- return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == SNOWFLAKE:
+ elif (
+ self.engine.dialect.name.lower() == BIGQUERY
+ or self.engine.dialect.name.lower() == SNOWFLAKE
+ ):
element_values = self.engine.execute(
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
@@ -381,13 +377,14 @@ def _get_columns_to_profile(self) -> List[str]:
col = col_dict["name"]
self.column_types[col] = str(col_dict["type"])
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
- if not self.config._allow_deny_patterns.allowed(
- f"{self.dataset_name}.{col}"
+ if (
+ not self.config._allow_deny_patterns.allowed(
+ f"{self.dataset_name}.{col}"
+ )
+ or not self.config.profile_nested_fields
+ and "." in col
):
ignored_columns_by_pattern.append(col)
- # We try to ignore nested columns as well
- elif not self.config.profile_nested_fields and "." in col:
- ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -1408,7 +1405,7 @@ def _get_ge_dataset(
},
)

- if platform == BIGQUERY or platform == DATABRICKS:
+ if platform in (BIGQUERY, DATABRICKS):
# This is done as GE makes the name as DATASET.TABLE
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
name_parts = pretty_name.split(".")
@@ -568,9 +568,7 @@ def _filter_okta_user(self, okta_user: User) -> bool:
if (
self.config.include_deprovisioned_users is False
and okta_user.status == UserStatus.DEPROVISIONED
- ):
- return False
- elif (
+ ) or (
self.config.include_suspended_users is False
and okta_user.status == UserStatus.SUSPENDED
):
@@ -447,13 +447,10 @@ def get_parser(
) -> DebeziumParser:
connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "MySqlConnector":
+ if (
+ connector_class == "io.debezium.connector.mysql.MySqlConnector"
+ or connector_class == "MySqlConnector"
+ ):
parser = self.DebeziumParser(
source_platform="mysql",
server_name=self.get_server_name(connector_manifest),
@@ -292,7 +292,7 @@ def get_properties(self, record: dict) -> str:
return record["properties"]

def get_relationships(self, record: dict) -> dict:
- return record.get("relationships", None)
+ return record.get("relationships")

def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
Try to determine if example data is defined for the endpoint, and return it
"""
data = {}
- if "content" in base_res.keys():
+ if "content" in base_res:
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
)
elif "text/csv" in res_cont.keys():
data = res_cont["text/csv"]["schema"]
- elif "examples" in base_res.keys():
+ elif "examples" in base_res:
data = base_res["examples"]["application/json"]

return data
@@ -890,9 +890,7 @@ def to_datahub_users(
set(user_rights) & set(self.__config.ownership.owner_criteria)
)
> 0
- ):
- user_mcps.extend(self.to_datahub_user(user))
- elif self.__config.ownership.owner_criteria is None:
+ ) or self.__config.ownership.owner_criteria is None:
user_mcps.extend(self.to_datahub_user(user))
else:
continue
@@ -196,7 +196,7 @@ def get_all_reports(self) -> List[Any]:
}

reports: List[Any] = []
- for report_type in report_types_mapping.keys():
+ for report_type in report_types_mapping:
report_get_endpoint: str = API_ENDPOINTS[report_type]
# Replace place holders
report_get_endpoint_http = report_get_endpoint.format(
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
@@ -155,9 +155,7 @@ def check_path_specs_and_infer_platform(

@pydantic.validator("platform", always=True)
def platform_valid(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -834,7 +834,7 @@ def get_dir_to_process(
min=min,
)
folders.extend(folders_list)
- if not path_spec.traversal_method == FolderTraversalMethod.ALL:
+ if path_spec.traversal_method != FolderTraversalMethod.ALL:
return folders
if folders:
return folders
@@ -615,7 +615,7 @@ def _get_field_description(self, field: dict, customField: dict) -> str:
prefix = "\\" if text.startswith("#") else ""
desc += f"\n\n{prefix}{text}"

- text = field.get("InlineHelpText", None)
+ text = field.get("InlineHelpText")
if text:
prefix = "\\" if text.startswith("#") else ""
desc += f"\n\n{prefix}{text}"
@@ -149,7 +149,7 @@ def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> Non

extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}

- for field_path in schema.keys():
+ for field_path in schema:
field_types = schema[field_path]["types"]
field_type: Union[str, type] = "mixed"

@@ -125,7 +125,7 @@ def validate_account_id(cls, account_id: str) -> str:

@pydantic.validator("authentication_type", always=True)
def authenticator_type_is_valid(cls, v, values):
- if v not in _VALID_AUTH_TYPES.keys():
+ if v not in _VALID_AUTH_TYPES:
raise ValueError(
f"unsupported authenticator type '{v}' was provided,"
f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -439,7 +439,7 @@ class SnowflakePrivilege:
failure_reason=failure_message,
)

- if c in _report.keys():
+ if c in _report:
continue

# If some capabilities are missing, then mark them as not capable
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
@@ -396,7 +396,7 @@ def get_table_properties(
metadata.table_type if metadata.table_type else ""
)

- location: Optional[str] = custom_properties.get("location", None)
+ location: Optional[str] = custom_properties.get("location")
if location is not None:
if location.startswith("s3://"):
location = make_s3_urn(location, self.config.env)
@@ -538,7 +538,7 @@ def get_schema_fields_for_column(
column_name=column["name"],
column_type=column["type"],
inspector=inspector,
- description=column.get("comment", None),
+ description=column.get("comment"),
nullable=column.get("nullable", True),
is_part_of_key=(
True
@@ -204,7 +204,7 @@ def get_column_type(
"""

TypeClass: Optional[Type] = None
- for sql_type in _field_type_mapping.keys():
+ for sql_type in _field_type_mapping:
if isinstance(column_type, sql_type):
TypeClass = _field_type_mapping[sql_type]
break
@@ -973,7 +973,7 @@ def get_schema_fields_for_column(
inspector=inspector,
)
),
- description=column.get("comment", None),
+ description=column.get("comment"),
nullable=column["nullable"],
recursive=False,
globalTags=gtc,
@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
if match:
modified_type_base = match.group(1) # Extract the base type
- return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+ return SNOWFLAKE_TYPES_MAP.get(modified_type_base)

# Fallback for types without precision/scale
- return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+ return SNOWFLAKE_TYPES_MAP.get(type_string)


# see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/sql/trino.py
@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
if col_value is not None:
properties[col_name] = col_value

- return {"text": properties.get("comment", None), "properties": properties}
+ return {"text": properties.get("comment"), "properties": properties}
else:
return self.get_table_comment_default(connection, table_name, schema)
except Exception:
@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):


def _parse_basic_datatype(s):
- for sql_type in _all_atomic_types.keys():
+ for sql_type in _all_atomic_types:
if isinstance(s, sql_type):
return {
"type": _all_atomic_types[sql_type],
@@ -1911,11 +1911,7 @@ def get_upstream_columns_of_fields_in_datasource(
if upstream_col.get(c.TABLE)
else None
)
- if (
- name
- and upstream_table_id
- and upstream_table_id in table_id_to_urn.keys()
- ):
+ if name and upstream_table_id and upstream_table_id in table_id_to_urn:
parent_dataset_urn = table_id_to_urn[upstream_table_id]
if (
self.is_snowflake_urn(parent_dataset_urn)