diff --git a/paasta_tools/tron_tools.py b/paasta_tools/tron_tools.py
index ccccf5f557..f5918fce25 100644
--- a/paasta_tools/tron_tools.py
+++ b/paasta_tools/tron_tools.py
@@ -62,7 +62,9 @@ from paasta_tools import spark_tools
 from paasta_tools.kubernetes_tools import (
+    NodeSelectorConfig,
     allowlist_denylist_to_requirements,
+    contains_zone_label,
     get_service_account_name,
     limit_size_with_hash,
     raw_selectors_to_requirements,
@@ -248,6 +250,7 @@ class TronActionConfigDict(InstanceConfigDict, total=False):
     # maneuvering to unify
     command: str
     service_account_name: str
+    node_selectors: Dict[str, NodeSelectorConfig]
 
     # the values for this dict can be anything since it's whatever
     # spark accepts
@@ -594,18 +597,35 @@ def get_node_selectors(self) -> Dict[str, str]:
     def get_node_affinities(self) -> Optional[List[Dict[str, Union[str, List[str]]]]]:
         """Converts deploy_whitelist and deploy_blacklist in node affinities.
 
-        note: At the time of writing, `kubectl describe` does not show affinities,
+        NOTE: At the time of writing, `kubectl describe` does not show affinities,
         only selectors. To see affinities, use `kubectl get pod -o json` instead.
+
+        WARNING: At the time of writing, we only use requiredDuringSchedulingIgnoredDuringExecution node affinities in Tron as we currently have
+        no use case for preferredDuringSchedulingIgnoredDuringExecution node affinities.
         """
         requirements = allowlist_denylist_to_requirements(
             allowlist=self.get_deploy_whitelist(),
             denylist=self.get_deploy_blacklist(),
         )
+        node_selectors = self.config_dict.get("node_selectors", {})
         requirements.extend(
             raw_selectors_to_requirements(
-                raw_selectors=self.config_dict.get("node_selectors", {}),  # type: ignore
+                raw_selectors=self.config_dict.get("node_selectors", {}),
             )
         )
+
+        # PAASTA-18198: To improve AZ balance with Karpenter, we temporarily allow specifying zone affinities per pool
+        pool_node_affinities = load_system_paasta_config().get_pool_node_affinities()
+        if pool_node_affinities and self.get_pool() in pool_node_affinities:
+            current_pool_node_affinities = pool_node_affinities[self.get_pool()]
+            # If the service already has a node selector for a zone, we don't want to override it
+            if current_pool_node_affinities and not contains_zone_label(node_selectors):
+                requirements.extend(
+                    raw_selectors_to_requirements(
+                        raw_selectors=current_pool_node_affinities,
+                    )
+                )
+
         if not requirements:
             return None
@@ -984,6 +1004,25 @@ def format_tron_action_dict(action_config: TronActionConfig):
     result["node_selectors"] = action_config.get_node_selectors()
     result["node_affinities"] = action_config.get_node_affinities()
 
+    # XXX: this is currently hardcoded since we should only really need TSC for zone-aware scheduling
+    result["topology_spread_constraints"] = [
+        {
+            # try to evenly spread pods across specified topology
+            "max_skew": 1,
+            # narrow down what pods to consider when spreading
+            "label_selector": {
+                # only consider pods that are managed by tron
+                "app.kubernetes.io/managed-by": "tron",
+                # and in the same pool
+                "paasta.yelp.com/pool": action_config.get_pool(),
+            },
+            # now, spread across AZs
+            "topology_key": "topology.kubernetes.io/zone",
+            # but if not possible, schedule even with a zonal imbalance
+            "when_unsatisfiable": "ScheduleAnyway",
+        },
+    ]
+
     # XXX: once we're off mesos we can make get_cap_* return just the cap names as a list
     result["cap_add"] = [cap["value"] for cap in action_config.get_cap_add()]
     result["cap_drop"] = [cap["value"] for cap in action_config.get_cap_drop()]
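
The side that consumes `result["topology_spread_constraints"]` is not part of this diff. For reviewers who want to see what the hardcoded entry corresponds to on the Kubernetes side, here is a minimal sketch of how a dict shaped like the one above could be mapped onto the Kubernetes Python client model. It assumes the standard `kubernetes` package; `to_k8s_topology_spread_constraint` is a hypothetical helper name, not Tron's actual code path.

```python
# Illustration only: map the plain dict built in format_tron_action_dict
# onto a Kubernetes client object. The dict literal mirrors the values
# hardcoded in this diff; the pool value is normally action_config.get_pool().
from typing import Any, Dict

from kubernetes.client import V1LabelSelector, V1TopologySpreadConstraint

constraint: Dict[str, Any] = {
    "max_skew": 1,
    "label_selector": {
        "app.kubernetes.io/managed-by": "tron",
        "paasta.yelp.com/pool": "default",  # placeholder pool name
    },
    "topology_key": "topology.kubernetes.io/zone",
    "when_unsatisfiable": "ScheduleAnyway",
}


def to_k8s_topology_spread_constraint(
    constraint: Dict[str, Any],
) -> V1TopologySpreadConstraint:
    """Hypothetical helper: translate the dict into the k8s client model."""
    return V1TopologySpreadConstraint(
        max_skew=constraint["max_skew"],
        topology_key=constraint["topology_key"],
        when_unsatisfiable=constraint["when_unsatisfiable"],
        label_selector=V1LabelSelector(match_labels=constraint["label_selector"]),
    )


print(to_k8s_topology_spread_constraint(constraint))
```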
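The PAASTA-18198 guard in `get_node_affinities` only layers the per-pool zone affinities on top when the action has not already pinned a zone via its own `node_selectors`. A minimal sketch of that precedence is below; `_contains_zone_label` and `ZONE_LABELS` are hypothetical stand-ins for the real `contains_zone_label` helper imported from `paasta_tools.kubernetes_tools`, and the config shape is assumed, not authoritative.

```python
# Illustration only: the precedence applied by the new pool_node_affinities code.
from typing import Any, Dict

ZONE_LABELS = ("topology.kubernetes.io/zone",)  # assumption, not the real constant


def _contains_zone_label(node_selectors: Dict[str, Any]) -> bool:
    # stand-in for paasta_tools.kubernetes_tools.contains_zone_label
    return any(label in node_selectors for label in ZONE_LABELS)


def pick_zone_selectors(
    node_selectors: Dict[str, Any],
    pool_node_affinities: Dict[str, Dict[str, Any]],
    pool: str,
) -> Dict[str, Any]:
    """An action-level zone selector always wins over the per-pool default."""
    if _contains_zone_label(node_selectors):
        # the job already pins a zone; don't add pool-level zone affinities
        return {}
    return pool_node_affinities.get(pool, {})


# A job with no zone selector in the "default" pool picks up the pool default:
print(
    pick_zone_selectors(
        node_selectors={},
        pool_node_affinities={"default": {"topology.kubernetes.io/zone": ["us-west-2a"]}},
        pool="default",
    )
)
```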