Skip to content

Commit

Permalink
with rx default as 1, catch and report OOM errors
Browse files Browse the repository at this point in the history
  • Loading branch information
amitabhverma committed Jan 22, 2025
1 parent a80a337 commit b016596
Showing 1 changed file with 46 additions and 8 deletions.
54 changes: 46 additions & 8 deletions recOrder/plugin/tab_recon.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
JOB_COMPLETION_STR = "Job completed successfully"
JOB_RUNNING_STR = "Starting with JobEnvironment"
JOB_TRIGGERED_EXC = "Submitted job triggered an exception"
JOB_OOM_EVENT = "oom_kill event"

_validate_alert = "⚠"
_validate_ok = "✔️"
Expand Down Expand Up @@ -3310,6 +3311,21 @@ def table_update_and_cleaup_thread(
logs_folder_path,
extension="err",
)
if JOB_OOM_EVENT in jobERR:
params["status"] = STATUS_errored_job
_infoBox.setText(
jobERR +
"\n\n"
+ jobTXT
)
self.client_release(
expIdx,
jobIdx,
client_socket,
params,
reason=0,
)
break
_infoBox.setText(
jobIdx
+ "\n"
Expand All @@ -3323,7 +3339,7 @@ def table_update_and_cleaup_thread(
jobIdx,
client_socket,
params,
reason=6,
reason=0,
)
break
elif params["status"] == STATUS_finished_job:
Expand All @@ -3336,7 +3352,7 @@ def table_update_and_cleaup_thread(
jobIdx,
client_socket,
params,
reason=7,
reason=6,
)
break
else:
Expand Down Expand Up @@ -3368,15 +3384,35 @@ def table_update_and_cleaup_thread(
jobIdx,
client_socket,
params,
reason=8,
reason=0,
)
break
elif JOB_RUNNING_STR in jobTXT:
params["status"] = STATUS_running_job
_infoBox.setText(jobTXT)
_tUpdateCount += 1
if _tUpdateCount > 60:
if _lastUpdate_jobTXT != jobTXT:
jobERR = self.JobsMgmt.check_for_jobID_File(
jobIdx,
logs_folder_path,
extension="err",
)
if JOB_OOM_EVENT in jobERR:
params["status"] = STATUS_errored_job
_infoBox.setText(
jobERR +
"\n\n"
+ jobTXT
)
self.client_release(
expIdx,
jobIdx,
client_socket,
params,
reason=0,
)
break
elif _lastUpdate_jobTXT != jobTXT:
# if there is an update reset counter
_tUpdateCount = 0
_lastUpdate_jobTXT = jobTXT
Expand All @@ -3391,7 +3427,7 @@ def table_update_and_cleaup_thread(
jobIdx,
client_socket,
params,
reason=9,
reason=0,
)
break
else:
Expand All @@ -3410,14 +3446,14 @@ def table_update_and_cleaup_thread(
jobIdx,
client_socket,
params,
reason=10,
reason=0,
)
break
except Exception as exc:
print(exc.args)
else:
self.client_release(
expIdx, jobIdx, client_socket, params, reason=11
expIdx, jobIdx, client_socket, params, reason=0
)
break
else:
Expand Down Expand Up @@ -3460,7 +3496,9 @@ def client_release(self, expIdx, jobIdx, client_socket, params, reason=0):
}
json_str = json.dumps(json_obj) + "\n"
client_socket.send(json_str.encode())
ROW_POP_QUEUE.append(expIdx)

if reason != 0: # remove processing entry when exiting without error
ROW_POP_QUEUE.append(expIdx)
# print("FINISHED")

if self.pool is not None:
Expand Down

0 comments on commit b016596

Please sign in to comment.