Skip to content

Commit

Permalink
account for the case where we have fewer replicas than backends
Browse files Browse the repository at this point in the history
  • Loading branch information
sergiitk committed May 25, 2024
1 parent f4e2772 commit 3389995
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 9 deletions.
19 changes: 16 additions & 3 deletions framework/infrastructure/gcp/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ def wait_for_backends_healthy_status(
*,
timeout_sec: int = _WAIT_FOR_BACKEND_SEC,
wait_sec: int = _WAIT_FOR_BACKEND_SLEEP_SEC,
replica_count: int = 1,
) -> None:
if not backends:
raise ValueError("The list of backends to wait on is empty")
Expand All @@ -418,8 +419,15 @@ def wait_for_backends_healthy_status(
check_result=lambda result: result,
)
pending = set(backends)
healthy = set()
try:
retryer(self._retry_backends_health, backend_service, pending)
retryer(
self._retry_backends_health,
backend_service,
pending,
healthy,
replica_count=replica_count,
)
except retryers.RetryError as retry_err:
unhealthy_backends: str = ",".join(
[backend.name for backend in pending]
Expand Down Expand Up @@ -470,7 +478,11 @@ def wait_for_backends_healthy_status(
raise

def _retry_backends_health(
self, backend_service: GcpResource, pending: Set[ZonalGcpResource]
self,
backend_service: GcpResource,
pending: Set[ZonalGcpResource],
healthy: Set[ZonalGcpResource],
replica_count: int = 1,
):
for backend in pending:
result = self.get_backend_service_backend_health(
Expand Down Expand Up @@ -504,8 +516,9 @@ def _retry_backends_health(
backend.zone,
)
pending.remove(backend)
healthy.add(backend)

return not pending
return not pending or len(healthy) >= replica_count

def get_backend_service_backend_health(self, backend_service, backend):
return (
Expand Down
4 changes: 2 additions & 2 deletions framework/infrastructure/traffic_director.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,14 +301,14 @@ def backend_service_remove_all_backends(self):
)
self.compute.backend_service_remove_all_backends(self.backend_service)

def wait_for_backends_healthy_status(self):
def wait_for_backends_healthy_status(self, replica_count: int = 1):
logger.info(
"Waiting for Backend Service %s to report all backends healthy: %r",
self.backend_service.name,
[backend.name for backend in self.backends],
)
self.compute.wait_for_backends_healthy_status(
self.backend_service, self.backends
self.backend_service, self.backends, replica_count=replica_count
)

def create_alternative_backend_service(
Expand Down
4 changes: 4 additions & 0 deletions framework/test_app/runners/k8s/k8s_xds_server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class KubernetesServerRunner(k8s_base_runner.KubernetesBaseRunner):

# Below is mutable state associated with the current run.
service: Optional[k8s.V1Service] = None
replica_count: int = 0

# A map from pod names to the server app.
pods_to_servers: dict[str, XdsTestServer]
Expand Down Expand Up @@ -159,6 +160,7 @@ def _reset_state(self):
super()._reset_state()
self.service = None
self.pods_to_servers = {}
self.replica_count = 0

def run( # pylint: disable=arguments-differ,too-many-branches
self,
Expand Down Expand Up @@ -298,6 +300,8 @@ def _make_servers_for_deployment(
self.deployment_name, replica_count
)
self._start_completed()
# TODO(sergiitk): move to super()._start_completed
self.replica_count = replica_count

servers: List[XdsTestServer] = []
for pod in self.pods_started.values():
Expand Down
4 changes: 3 additions & 1 deletion framework/xds_k8s_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@ def setupServerBackends(
neg_name, neg_zones, max_rate_per_endpoint=max_rate_per_endpoint
)
if wait_for_healthy_status:
self.td.wait_for_backends_healthy_status()
self.td.wait_for_backends_healthy_status(
replica_count=server_runner.replica_count
)

def removeServerBackends(self, *, server_runner=None):
if server_runner is None:
Expand Down
4 changes: 1 addition & 3 deletions tests/app_net_csm_observability_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,7 @@ def test_csm_observability(self):

with self.subTest("1_run_test_server"):
start_secs = int(time.time())
test_server: _XdsTestServer = self.startTestServers(
replica_count=1
)[0]
test_server: _XdsTestServer = self.startTestServers()[0]

with self.subTest("5_setup_server_backends"):
self.setupServerBackends()
Expand Down

0 comments on commit 3389995

Please sign in to comment.