From 2239be9dffc9b129b40c744e4c9fe3c1eb59bbbd Mon Sep 17 00:00:00 2001 From: Piotr Stachyra Date: Wed, 5 Jun 2024 16:19:40 +0200 Subject: [PATCH 1/2] use conditions to make sure a job is reported as failed only after all retries --- uptime_service_validation/coordinator/server.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/uptime_service_validation/coordinator/server.py b/uptime_service_validation/coordinator/server.py index 7bba7a2..5107d86 100644 --- a/uptime_service_validation/coordinator/server.py +++ b/uptime_service_validation/coordinator/server.py @@ -269,12 +269,17 @@ def setUpValidatorPods(time_intervals, logging, worker_image, worker_tag): for job_name in list(jobs): try: job_status = api_batch.read_namespaced_job_status(job_name, namespace) - if job_status.status.succeeded: - logging.info(f"Job {job_name} succeeded.") - jobs.remove(job_name) - elif job_status.status.failed: - logging.error(f"Job {job_name} failed.") - jobs.remove(job_name) + conditions = job_status.status.conditions + if conditions: + for condition in conditions: + if condition.type == "Failed" and condition.status == "True": + logging.error(f"Job {job_name} failed: {condition.message}") + jobs.remove(job_name) + elif ( + condition.type == "Complete" and condition.status == "True" + ): + logging.info(f"Job {job_name} succeeded.") + jobs.remove(job_name) except Exception as e: logging.error(f"Error reading job status for {job_name}: {e}") From 637c30f96f2b13d52ff57e7463a7652e65e467fc Mon Sep 17 00:00:00 2001 From: Piotr Stachyra Date: Thu, 6 Jun 2024 10:53:57 +0200 Subject: [PATCH 2/2] exit coordinator on job failure --- uptime_service_validation/coordinator/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/uptime_service_validation/coordinator/server.py b/uptime_service_validation/coordinator/server.py index 5107d86..7321414 100644 --- a/uptime_service_validation/coordinator/server.py +++ b/uptime_service_validation/coordinator/server.py @@ -275,6
+275,8 @@ def setUpValidatorPods(time_intervals, logging, worker_image, worker_tag): if condition.type == "Failed" and condition.status == "True": logging.error(f"Job {job_name} failed: {condition.message}") jobs.remove(job_name) + logging.fatal("Exiting due to job failure.") + exit(1) elif ( condition.type == "Complete" and condition.status == "True" ):