From a288d6a80654e3ffaa2c19ed6c51941850f1d7ec Mon Sep 17 00:00:00 2001 From: Piotr Stachyra Date: Fri, 7 Jun 2024 08:27:05 +0200 Subject: [PATCH 1/2] retry and exit using statuses --- uptime_service_validation/coordinator/server.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/uptime_service_validation/coordinator/server.py b/uptime_service_validation/coordinator/server.py index 7bba7a2..52dae3f 100644 --- a/uptime_service_validation/coordinator/server.py +++ b/uptime_service_validation/coordinator/server.py @@ -272,9 +272,16 @@ def setUpValidatorPods(time_intervals, logging, worker_image, worker_tag): if job_status.status.succeeded: logging.info(f"Job {job_name} succeeded.") jobs.remove(job_name) - elif job_status.status.failed: - logging.error(f"Job {job_name} failed.") + elif job_status.status.failed < Config.RETRY_COUNT: + logging.warning( + f"Job {job_name} failed. Retrying attempt {job_status.status.failed}/{Config.RETRY_COUNT}..." + ) + else: + logging.error( + f"Job {job_name} failed. Maximum retries ({Config.RETRY_COUNT}) reached. Exiting the program..." + ) jobs.remove(job_name) + exit(1) except Exception as e: logging.error(f"Error reading job status for {job_name}: {e}") From fb599335cd4da7659ce54fa9aa3c2396125467ff Mon Sep 17 00:00:00 2001 From: Piotr Stachyra Date: Fri, 7 Jun 2024 08:52:18 +0200 Subject: [PATCH 2/2] account for when there is not yet failed status --- .../coordinator/server.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/uptime_service_validation/coordinator/server.py b/uptime_service_validation/coordinator/server.py index 52dae3f..d2a9e77 100644 --- a/uptime_service_validation/coordinator/server.py +++ b/uptime_service_validation/coordinator/server.py @@ -272,16 +272,17 @@ def setUpValidatorPods(time_intervals, logging, worker_image, worker_tag): if job_status.status.succeeded: logging.info(f"Job {job_name} succeeded.") jobs.remove(job_name) - elif job_status.status.failed < Config.RETRY_COUNT: - logging.warning( - f"Job {job_name} failed. Retrying attempt {job_status.status.failed}/{Config.RETRY_COUNT}..." - ) - else: - logging.error( - f"Job {job_name} failed. Maximum retries ({Config.RETRY_COUNT}) reached. Exiting the program..." - ) - jobs.remove(job_name) - exit(1) + elif job_status.status.failed is not None: + if job_status.status.failed < Config.RETRY_COUNT: + logging.warning( + f"Job {job_name} failed. Retrying attempt {job_status.status.failed}/{Config.RETRY_COUNT}..." + ) + else: + logging.error( + f"Job {job_name} failed. Maximum retries ({Config.RETRY_COUNT}) reached. Exiting the program..." + ) + jobs.remove(job_name) + exit(1) except Exception as e: logging.error(f"Error reading job status for {job_name}: {e}")