dev: enable ruff rule #12749

Merged: 3 commits, Feb 28, 2025
Changes from all commits
18 changes: 16 additions & 2 deletions metadata-ingestion/pyproject.toml
@@ -30,6 +30,7 @@ extend-select = [
"I", # isort
"TID", # flake8-tidy-imports
"RUF100", # unused-noqa
"SIM", # flake8-simplify
]
extend-ignore = [
"E501", # Handled by formatter
@@ -40,8 +41,21 @@ extend-ignore = [
"B008", # Allow function call in argument defaults
"RUF012", # mutable-class-default; incompatible with pydantic
"RUF015", # unnecessary-iterable-allocation-for-first-element
# TODO: Enable these later
"B006", # Mutable args

# Can be enabled later if someone wants to fix existing cases or an auto-fix becomes available
"SIM101", # Multiple isinstance calls for {name}, merge into a single call
"SIM102", # Use a single `if` statement instead of nested `if` statements
"SIM103", # Return the condition directly
"SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
"SIM108", # Use ternary operator {contents} instead of if-else-block
"SIM110", # Use `return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)` instead of `for` loop
"SIM113", # Use enumerate() for index variable {index} in for loop
"SIM115", # Use a context manager for opening files
"SIM116", # Use a dictionary instead of consecutive `if` statements
"SIM117", # Use a single with statement with multiple contexts instead of nested with statements
"SIM118", # Use key {operator} dict instead of key {operator} dict.keys()
"SIM210", # Use `bool(...)` instead of `True if ... else False`
"SIM401", # Use `sample_data.get(schema_field.fieldPath, [])` instead of an `if` block
]

[tool.ruff.lint.mccabe]
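For context on what the newly enabled rule family checks, here is a minimal illustrative sketch (not code from this repository) of two of the codes listed above: SIM118, which flags membership tests against `dict.keys()`, and SIM105, which flags `try`/`except`/`pass` blocks. Most of the hunks below are exactly this kind of mechanical rewrite.

```python
import contextlib

# Illustrative data only; these names are not from the DataHub codebase.
config = {"platform": "bigquery"}

# SIM118: test membership against the dict itself, not dict.keys()
if "platform" in config.keys():  # flagged
    pass
if "platform" in config:  # preferred
    pass

# SIM105: an empty try/except/pass block is clearer as contextlib.suppress
try:
    port = int(config.get("port", ""))
except ValueError:
    pass
with contextlib.suppress(ValueError):  # preferred
    port = int(config.get("port", ""))
```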
4 changes: 1 addition & 3 deletions metadata-ingestion/src/datahub/configuration/git.py
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
- if repo.startswith("github.com/"):
- repo = f"https://{repo}"
- elif repo.startswith("gitlab.com"):
+ if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
repo = f"https://{repo}"
elif repo.count("/") == 1:
repo = f"https://github.com/{repo}"
@@ -281,7 +281,7 @@ def get_columns_to_classify(
),
values=(
sample_data[schema_field.fieldPath]
- if schema_field.fieldPath in sample_data.keys()
+ if schema_field.fieldPath in sample_data
else []
),
)
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -330,7 +330,7 @@ def get_aspect_v2(
aspect_type_name: Optional[str] = None,
version: int = 0,
) -> Optional[Aspect]:
- assert aspect_type.ASPECT_NAME == aspect
+ assert aspect == aspect_type.ASPECT_NAME
return self.get_aspect(
entity_urn=entity_urn,
aspect_type=aspect_type,
@@ -163,12 +163,7 @@ def _convert_sets_to_lists(obj: Any) -> Any:
key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
for key, value in obj.items()
}
- elif isinstance(obj, list):
- return [
- DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
- for element in obj
- ]
- elif isinstance(obj, set):
+ elif isinstance(obj, list) or isinstance(obj, set):
return [
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
for element in obj
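As an aside (not a change made in this PR): the merged `isinstance(obj, list) or isinstance(obj, set)` check in `_convert_sets_to_lists` above could equivalently be written as a single call with a tuple of types, which is the form SIM101 (currently suppressed in the config) would suggest. A quick check of the equivalence:

```python
# The two forms agree for any value, including non-matching types.
for obj in ([1, 2], {1, 2}, "neither"):
    assert (isinstance(obj, list) or isinstance(obj, set)) == isinstance(obj, (list, set))
```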
6 changes: 2 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/source/abs/config.py
@@ -144,10 +144,8 @@ def check_path_specs_and_infer_platform(
return path_specs

@pydantic.validator("platform", always=True)
- def platform_not_empty(cls, platform: str, values: dict) -> str:
- inferred_platform = values.get(
- "platform", None
- ) # we may have inferred it above
+ def platform_not_empty(cls, platform: Any, values: dict) -> str:
+ inferred_platform = values.get("platform") # we may have inferred it above
platform = platform or inferred_platform
if not platform:
raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ def from_bigquery_table(cls, table: BigqueryTableIdentifier) -> "BigQueryTableRe
@classmethod
def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
for key in ["projectId", "datasetId", "tableId"]:
- if key not in spec.keys():
+ if key not in spec:
raise ValueError(f"invalid BigQuery table reference dict: {spec}")

return cls(
@@ -344,7 +344,7 @@ def get_tables_for_dataset(
with_partitions: bool = False,
) -> Iterator[BigqueryTable]:
with PerfTimer() as current_timer:
- filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+ filter_clause: str = ", ".join(f"'{table}'" for table in tables)

if with_partitions:
query_template = BigqueryQuery.tables_for_dataset
@@ -314,7 +314,7 @@ def get_resource_description_work_unit(
"datajob": EditableDataJobPropertiesClass,
"dataflow": EditableDataFlowPropertiesClass,
"notebook": EditableNotebookPropertiesClass,
- }.get(entityType, None)
+ }.get(entityType)

if not entityClass:
raise ValueError(
@@ -1033,7 +1033,7 @@ def add_node_to_cll_list(dbt_name: str) -> None:
cll_nodes.add(dbt_name)
schema_nodes.add(dbt_name)

- for dbt_name in all_nodes_map.keys():
+ for dbt_name in all_nodes_map:
if self._is_allowed_node(dbt_name):
add_node_to_cll_list(dbt_name)

@@ -498,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
# Delete empty dataflows if needed
if self.config.delete_empty_data_flows:
deleted_data_flows: int = 0
- for key in dataFlows.keys():
+ for key in dataFlows:
if not dataJobs.get(key) or len(dataJobs[key]) == 0:
logger.info(
f"Deleting dataflow {key} because there are not datajobs"
25 changes: 11 additions & 14 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
).select_from(self._table)
)
return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == BIGQUERY:
- element_values = self.engine.execute(
- sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
- self._table
- )
- )
- return convert_to_json_serializable(element_values.fetchone()[0])
- elif self.engine.dialect.name.lower() == SNOWFLAKE:
+ elif (
+ self.engine.dialect.name.lower() == BIGQUERY
+ or self.engine.dialect.name.lower() == SNOWFLAKE
+ ):
element_values = self.engine.execute(
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
@@ -381,13 +377,14 @@ def _get_columns_to_profile(self) -> List[str]:
col = col_dict["name"]
self.column_types[col] = str(col_dict["type"])
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
- if not self.config._allow_deny_patterns.allowed(
- f"{self.dataset_name}.{col}"
+ if (
+ not self.config._allow_deny_patterns.allowed(
+ f"{self.dataset_name}.{col}"
+ )
+ or not self.config.profile_nested_fields
+ and "." in col
):
ignored_columns_by_pattern.append(col)
- # We try to ignore nested columns as well
- elif not self.config.profile_nested_fields and "." in col:
- ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -1408,7 +1405,7 @@ def _get_ge_dataset(
},
)

- if platform == BIGQUERY or platform == DATABRICKS:
+ if platform in (BIGQUERY, DATABRICKS):
# This is done as GE makes the name as DATASET.TABLE
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
name_parts = pretty_name.split(".")
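One detail worth noting about the `_get_columns_to_profile` hunk above: the merged condition relies on Python's operator precedence, where `and` binds more tightly than `or`, so the two original branches keep their meaning. A small self-contained check of that grouping, with illustrative boolean stand-ins for the real pattern and config checks:

```python
from itertools import product

for allowed, profile_nested_fields, has_dot in product([True, False], repeat=3):
    merged = not allowed or not profile_nested_fields and has_dot
    # `and` binds tighter than `or`, so the merged form groups as below
    grouped = (not allowed) or ((not profile_nested_fields) and has_dot)
    assert merged == grouped
```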
@@ -568,9 +568,7 @@ def _filter_okta_user(self, okta_user: User) -> bool:
if (
self.config.include_deprovisioned_users is False
and okta_user.status == UserStatus.DEPROVISIONED
- ):
- return False
- elif (
+ ) or (
self.config.include_suspended_users is False
and okta_user.status == UserStatus.SUSPENDED
):
@@ -447,13 +447,10 @@ def get_parser(
) -> DebeziumParser:
connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "MySqlConnector":
+ if (
+ connector_class == "io.debezium.connector.mysql.MySqlConnector"
+ or connector_class == "MySqlConnector"
+ ):
parser = self.DebeziumParser(
source_platform="mysql",
server_name=self.get_server_name(connector_manifest),
@@ -205,8 +205,9 @@ def lookml_model_explore(self, model: str, explore_name: str) -> LookmlModelExpl
def folder_ancestors(
self,
folder_id: str,
- fields: Union[str, List[str]] = ["id", "name", "parent_id"],
+ fields: Optional[Union[str, List[str]]] = None,
Contributor: While in practice this is not a breaking change, I would prefer to keep the default value in the signature rather than inside the implementation, because knowing the default value is relevant for the caller.

Collaborator (author): Mutable default arguments can lead to bugs (https://docs.python-guide.org/writing/gotchas/); this is a well-documented problem.

) -> Sequence[Folder]:
fields = fields or ["id", "name", "parent_id"]
self.client_stats.folder_calls += 1
try:
return self.client.folder_ancestors(
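The review thread above is about the classic mutable-default-argument gotcha (the same issue tracked by the suppressed B006 rule): a list or dict default is evaluated once, at function definition time, and shared across calls, so mutating it leaks state between calls. A minimal, self-contained sketch of the problem and of the `Optional[...] = None` pattern this PR applies; the function names are made up for illustration:

```python
from typing import List, Optional

def append_shared(item: str, items: List[str] = []) -> List[str]:
    # The default list is created once and reused by every call that omits `items`.
    items.append(item)
    return items

def append_fresh(item: str, items: Optional[List[str]] = None) -> List[str]:
    # A new list is created on each call when the caller passes nothing.
    items = items or []
    items.append(item)
    return items

print(append_shared("a"))  # ['a']
print(append_shared("b"))  # ['a', 'b']  <- state leaked from the previous call
print(append_fresh("a"))   # ['a']
print(append_fresh("b"))   # ['b']
```

One caveat of the `x = x or default` idiom: an explicitly passed empty list or dict is also replaced by the default. That is harmless for these call sites, but `if x is None: x = default` avoids the edge case entirely.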
@@ -464,9 +464,10 @@ def process_lookml_template_language(
source_config: LookMLSourceConfig,
view_lkml_file_dict: dict,
reporter: LookMLSourceReport,
- manifest_constants: Dict[str, "LookerConstant"] = {},
+ manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
resolve_constants: bool = False,
) -> None:
manifest_constants = manifest_constants or {}
if "views" not in view_lkml_file_dict:
return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
path: Union[str, pathlib.Path],
source_config: LookMLSourceConfig,
reporter: LookMLSourceReport,
- manifest_constants: Dict[str, "LookerConstant"] = {},
+ manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
resolve_constants: bool = False,
) -> dict:
manifest_constants = manifest_constants or {}
parsed = load_lkml(path)

process_lookml_template_language(
@@ -1006,8 +1006,9 @@ def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit]
def report_skipped_unreachable_views(
self,
viewfile_loader: LookerViewFileLoader,
- processed_view_map: Dict[str, Set[str]] = {},
+ processed_view_map: Optional[Dict[str, Set[str]]] = None,
) -> None:
processed_view_map = processed_view_map or {}
view_files: Dict[str, List[pathlib.Path]] = {}
for project, folder_path in self.base_projects_folder.items():
folder = pathlib.Path(folder_path)
@@ -292,7 +292,7 @@ def get_properties(self, record: dict) -> str:
return record["properties"]

def get_relationships(self, record: dict) -> dict:
- return record.get("relationships", None)
+ return record.get("relationships", {})

def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
9 changes: 6 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/nifi.py
@@ -1234,11 +1234,14 @@ def construct_job_workunits(
job_type: str,
description: Optional[str],
job_properties: Optional[Dict[str, str]] = None,
- inlets: List[str] = [],
- outlets: List[str] = [],
- inputJobs: List[str] = [],
+ inlets: Optional[List[str]] = None,
+ outlets: Optional[List[str]] = None,
+ inputJobs: Optional[List[str]] = None,
status: Optional[str] = None,
) -> Iterable[MetadataWorkUnit]:
inlets = inlets or []
outlets = outlets or []
inputJobs = inputJobs or []
logger.debug(f"Begining construction of job workunit for {job_urn}")
if job_properties:
job_properties = {k: v for k, v in job_properties.items() if v is not None}
@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
Try to determine if example data is defined for the endpoint, and return it
"""
data = {}
if "content" in base_res.keys():
if "content" in base_res:
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
)
elif "text/csv" in res_cont.keys():
data = res_cont["text/csv"]["schema"]
elif "examples" in base_res.keys():
elif "examples" in base_res:
data = base_res["examples"]["application/json"]

return data
@@ -2,7 +2,7 @@
import importlib.resources as pkg_resource
import logging
import os
- from typing import Dict, List
+ from typing import Dict, List, Optional

import lark
from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
platform_instance_resolver: AbstractDataPlatformInstanceResolver,
ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
- parameters: Dict[str, str] = {},
+ parameters: Optional[Dict[str, str]] = None,
) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
parameters = parameters or {}
if table.expression is None:
logger.debug(f"There is no M-Query expression in table {table.full_name}")
return []
@@ -70,13 +70,14 @@ def internal(node: Union[Tree, Token]) -> Optional[Tree]:
return expression_tree


- def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
+ def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
"""
:param tree: Tree to traverse
:param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
using the values in 'parameters'.
:return: List of leaf token data
"""
parameters = parameters or {}
values: List[str] = []

def internal(node: Union[Tree, Token]) -> None:
@@ -890,9 +890,7 @@ def to_datahub_users(
set(user_rights) & set(self.__config.ownership.owner_criteria)
)
> 0
- ):
- user_mcps.extend(self.to_datahub_user(user))
- elif self.__config.ownership.owner_criteria is None:
+ ) or self.__config.ownership.owner_criteria is None:
user_mcps.extend(self.to_datahub_user(user))
else:
continue
@@ -380,8 +380,9 @@ def get_tiles(self, workspace: Workspace, dashboard: Dashboard) -> List[Tile]:
def itr_pages(
self,
endpoint: str,
- parameter_override: Dict = {},
+ parameter_override: Optional[Dict] = None,
) -> Iterator[List[Dict]]:
parameter_override = parameter_override or {}
params: dict = {
"$skip": 0,
"$top": self.TOP,
@@ -196,7 +196,7 @@ def get_all_reports(self) -> List[Any]:
}

reports: List[Any] = []
- for report_type in report_types_mapping.keys():
+ for report_type in report_types_mapping:
report_get_endpoint: str = API_ENDPOINTS[report_type]
# Replace place holders
report_get_endpoint_http = report_get_endpoint.format(
@@ -17,8 +17,9 @@ def __init__(self, tenant_hostname: str, api_key: str, app_id: str) -> None:
self.handle = [-1]

def _build_websocket_request_dict(
- self, method: str, params: Union[Dict, List] = {}
+ self, method: str, params: Optional[Union[Dict, List]] = None
) -> Dict:
params = params or {}
return {
"jsonrpc": "2.0",
"id": self.request_id,
@@ -37,11 +38,12 @@ def _send_request(self, request: Dict) -> Dict:
return {}

def websocket_send_request(
- self, method: str, params: Union[Dict, List] = {}
+ self, method: str, params: Optional[Union[Dict, List]] = None
) -> Dict:
"""
Method to send request to websocket
"""
params = params or {}
self.request_id += 1
request = self._build_websocket_request_dict(method, params)
response = self._send_request(request=request)