Skip to content

Commit

Permalink
Add test for the propagation of queue_options from ertconfig to cluster
Browse files Browse the repository at this point in the history
This commit adds tests that verify that the queue_options are all
propagated correctly to the slurm, openpbs, and lsf queue systems, using
our mocked binaries.
This commit also refactors the mocked binaries to have each job use its
own sub-directory instead of prefixing the files with job_id.
  • Loading branch information
jonathan-eq committed Feb 27, 2025
1 parent a29d845 commit 162e24e
Show file tree
Hide file tree
Showing 17 changed files with 305 additions and 77 deletions.
10 changes: 4 additions & 6 deletions tests/ert/unit_tests/scheduler/bin/bhist.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,16 @@ def main() -> None:

jobs_output: list[Job] = []
for job in args.jobs:
job_name: str = read(jobs_path / f"{job}.name") or "_"
job_name: str = read(jobs_path / job / "name") or "_"
assert job_name is not None

submit_time_millis: int = int(
os.path.getctime(jobs_path / f"{job}.name") * 1000
)
submit_time_millis: int = int(os.path.getctime(jobs_path / job / "name") * 1000)
pending_time_millis = int(read(jobs_path / "pendingtimemillis") or 0)
run_start_time_millis: int = submit_time_millis + pending_time_millis
end_time_millis: int = int(time.time() * 1000)
if (jobs_path / f"{job}.returncode").exists():
if (jobs_path / job / "returncode").exists():
end_time_millis = int(
os.path.getctime(jobs_path / f"{job}.returncode") * 1000
os.path.getctime(jobs_path / job / "returncode") * 1000
)
if not args.l:
print("bhist says job is done")
Expand Down
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/bin/bjobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ def main() -> None:

# this is for the bjobs call looking for exit code
if args.o.strip() == "exit_code":
returncode = read(jobs_path / f"{args.jobs[0]}.returncode")
returncode = read(jobs_path / args.jobs[0] / "returncode")
print(returncode)
return

jobs_output: list[Job] = []
for job in args.jobs:
pid = read(jobs_path / f"{job}.pid")
returncode = read(jobs_path / f"{job}.returncode")
pid = read(jobs_path / job / "pid")
returncode = read(jobs_path / job / "returncode")

state: JobState = "PEND"

Expand Down
2 changes: 1 addition & 1 deletion tests/ert/unit_tests/scheduler/bin/bkill.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def main() -> None:
jobdir = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"
killsignal = getattr(signal, args.signal)
for jobid in args.jobids:
pidfile = jobdir / f"{jobid}.pid"
pidfile = jobdir / jobid / "pid"
if not pidfile.exists():
sys.exit(1)
pid = int(pidfile.read_text(encoding="utf-8").strip())
Expand Down
24 changes: 15 additions & 9 deletions tests/ert/unit_tests/scheduler/bin/bsub
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@ set -e

name="STDIN"

while getopts "o:e:J:q:R:n:" opt
jobid="${RANDOM}"
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
mkdir -p "${jobdir}"
command_invocation_file="${jobdir}/complete_command_invocation"
echo "$0 $@" > "$command_invocation_file"

while getopts "o:e:J:q:R:n:P:" opt
do
case "$opt" in
o)
Expand All @@ -24,29 +30,29 @@ do
R)
resource_requirement=$OPTARG
;;
P)
project_code=$OPTARG
;;
*)
echo "Unprocessed option ${opt}"
;;
esac
done
shift $((OPTIND-1))

jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
jobid="${RANDOM}"
job_env_file="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.env"
job_env_file="${jobdir}/env"

mkdir -p "${PYTEST_TMP_PATH:-.}/mock_jobs"
echo $@ > "${jobdir}/${jobid}.script"
echo "$name" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.name"
echo "$resource_requirement" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.resource_requirement"
echo $@ > "${jobdir}/script"
echo "$name" > "${jobdir}/name"
echo "$resource_requirement" > "${jobdir}/resource_requirement"
touch $job_env_file

[ -n $num_cpu ] && echo "export LSB_MAX_NUM_PROCESSORS=$num_cpu" >> $job_env_file

[ -z $stdout ] && stdout="/dev/null"
[ -z $stderr ] && stderr="/dev/null"

bash "$(dirname $0)/lsfrunner" "${jobdir}/${jobid}" >$stdout 2>$stderr &
bash "$(dirname $0)/lsfrunner" "${jobdir}" >$stdout 2>$stderr &
disown

echo "Job <$jobid> is submitted to default queue <normal>."
14 changes: 7 additions & 7 deletions tests/ert/unit_tests/scheduler/bin/lsfrunner
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ job=$1 # NB: Includes full path
function handle_sigterm {
# LSF uses (128 + SIGNAL) as the returncode
# SIGTERM=15
echo "143" > "${job}.returncode"
echo "143" > "${job}/returncode"
for grandchild in $(pgrep -P $child_pid); do
kill -s SIGTERM $grandchild
done
Expand All @@ -14,17 +14,17 @@ function handle_sigterm {

trap handle_sigterm SIGTERM

echo "$$" > "${job}.pid"
source "${job}.env"
bash "${job}.script" > "${job}.stdout" 2> "${job}.stderr" &
echo "$$" > "${job}/pid"
source "${job}/env"
bash "${job}/script" > "${job}/stdout" 2> "${job}/stderr" &
child_pid=$!
wait $child_pid

echo "$?" > "${job}.returncode"
echo "$?" > "${job}/returncode"
echo "Sender: Mocked LSF system <$USER@$(hostname -s)"
echo "Subject: Job $job:"
echo "[..skipped in mock..]"
echo "The output (if any) follows:"
cat ${job}.stdout
cat ${job}/stdout

cat ${job}.stderr >&2
cat ${job}/stderr >&2
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/bin/qdel
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#!/usr/bin/env bash
set -e

jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
jobid=$1
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"

if ! [ -f "${jobdir}/${jobid}.pid" ]
if ! [ -f "${jobdir}/pid" ]
then
echo "No such job ${jobid}" >&2
exit 1
fi

pid=$(cat "${jobdir}/${jobid}.pid")
pid=$(cat "${jobdir}/pid")
kill $pid
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/bin/qstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ def main() -> None:
print(QSTAT_HEADER, end="")

for job in args.jobs:
name = read(jobs_path / f"{job}.name")
name = read(jobs_path / f"{job}/name")
assert name is not None

pid = read(jobs_path / f"{job}.pid")
returncode = read(jobs_path / f"{job}.returncode")
pid = read(jobs_path / f"{job}/pid")
returncode = read(jobs_path / f"{job}/returncode")

state = "Q"
if returncode is not None:
Expand Down
29 changes: 19 additions & 10 deletions tests/ert/unit_tests/scheduler/bin/qsub
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@ set -e

name="STDIN"

while getopts "N:r:l:o:e:" opt
jobid="test${RANDOM}.localhost"
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
mkdir -p "${jobdir}"
command_invocation_file="${jobdir}/complete_command_invocation"
echo "$0 $@" > "$command_invocation_file"

job_env_file="${jobdir}/env"
touch $job_env_file

while getopts "N:r:l:o:e:q:A:" opt
do
case "$opt" in
N)
Expand All @@ -17,6 +26,12 @@ do
;;
l)
resource=$OPTARG
echo $resource >> $job_env_file
echo $resource >> "${jobdir}/resource_requirement"
;;
q)
;;
A)
;;
*)
echo "Unprocessed option ${opt}"
Expand All @@ -25,22 +40,16 @@ do
done
shift $((OPTIND-1))

jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
jobid="test${RANDOM}.localhost"
job_env_file="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.env"

mkdir -p "${PYTEST_TMP_PATH:-.}/mock_jobs"
cat <&0 > "${jobdir}/${jobid}.script"
echo "$name" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.name"
touch $job_env_file
cat <&0 > "${jobdir}/script"
echo "$name" > "${jobdir}/name"

echo $resource >> $job_env_file
num_cpu=$(echo $resource | sed 's/.*ncpus=\([[:digit:]]*\).*/\1/')

[ -n $num_cpu ] && echo "export OMP_NUM_THREADS=$num_cpu" >> $job_env_file
[ -n $num_cpu ] && echo "export NCPUS=$num_cpu" >> $job_env_file

bash "$(dirname $0)/runner" "${jobdir}/${jobid}" >/dev/null 2>/dev/null &
bash "$(dirname $0)/runner" "${jobdir}" >/dev/null 2>/dev/null &
disown

echo "$jobid"
14 changes: 7 additions & 7 deletions tests/ert/unit_tests/scheduler/bin/runner
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ job=$1

function handle_sigterm {
# Torque uses (256 + SIGNAL) as the returncode
echo "271" > "${job}.returncode"
echo "271" > "${job}/returncode"
for grandchild in $(pgrep -P $child_pid); do
kill -s SIGTERM $grandchild
done
Expand All @@ -13,13 +13,13 @@ function handle_sigterm {

trap handle_sigterm SIGTERM

echo "$$" > "${job}.pid"
source "${job}.env"
bash "${job}.script" > "${job}.stdout" 2> "${job}.stderr" &
echo "$$" > "${job}/pid"
source "${job}/env"
bash "${job}/script" > "${job}/stdout" 2> "${job}/stderr" &
child_pid=$!
wait $child_pid
echo $? > "${job}.returncode"
echo $? > "${job}/returncode"

cat ${job}.stdout
cat ${job}/stdout

cat ${job}.stderr >&2
cat ${job}/stderr >&2
12 changes: 6 additions & 6 deletions tests/ert/unit_tests/scheduler/bin/sacct.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""

import argparse
import glob
import os
from pathlib import Path
from typing import Literal
Expand Down Expand Up @@ -35,13 +34,14 @@ def main() -> None:

jobs_path = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"

for pidfile in glob.glob(f"{jobs_path}/*.pid"):
job = pidfile.split("/")[-1].split(".")[0]
for job_dir in jobs_path.iterdir():
pidfile = job_dir / "pid"
job = job_dir.name
if args.j and job != args.j:
continue
pid = read(Path(pidfile))
returncode = read(jobs_path / f"{job}.returncode")
cancelled = read(jobs_path / f"{job}.cancelled", default="no")
pid = read(pidfile)
returncode = read(job_dir / "returncode")
cancelled = read(job_dir / "cancelled", default="no")
state: JobState = "PENDING"

if pid is not None and returncode is None:
Expand Down
22 changes: 15 additions & 7 deletions tests/ert/unit_tests/scheduler/bin/sbatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import argparse
import os
import random
import shlex
import subprocess
import sys
from pathlib import Path


Expand All @@ -20,6 +22,11 @@ def get_parser() -> argparse.ArgumentParser:
parser.add_argument("--parsable", action="store_true")
parser.add_argument("--output", type=str)
parser.add_argument("--error", type=str)
parser.add_argument("--mem", type=str)
parser.add_argument("--nodelist", type=str)
parser.add_argument("--exclude", type=str)
parser.add_argument("--time", type=str)
parser.add_argument("--account", type=str)
parser.add_argument("script", type=str)
return parser

Expand All @@ -28,12 +35,13 @@ def main() -> None:
args = get_parser().parse_args()

jobid = random.randint(1, 2**15)
jobdir = Path(os.getenv("PYTEST_TMP_PATH", "."))
(jobdir / "mock_jobs").mkdir(parents=True, exist_ok=True)
(jobdir / "mock_jobs" / f"{jobid}.script").write_text(args.script, encoding="utf-8")
(jobdir / "mock_jobs" / f"{jobid}.name").write_text(args.job_name, encoding="utf-8")
env_file = jobdir / "mock_jobs" / f"{jobid}.env"

jobsdir = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"
jobdir = jobsdir / str(jobid)
jobdir.mkdir(parents=True, exist_ok=True)
(jobdir / "script").write_text(args.script, encoding="utf-8")
(jobdir / "name").write_text(args.job_name, encoding="utf-8")
env_file = jobdir / "env"
(jobdir / "complete_command_invocation").write_text(shlex.join(sys.argv))
if args.ntasks:
env_file.write_text(
f"export SLURM_JOB_CPUS_PER_NODE={args.ntasks}\n"
Expand All @@ -44,7 +52,7 @@ def main() -> None:
env_file.touch()

subprocess.Popen(
[str(Path(__file__).parent / "runner"), f"{jobdir}/mock_jobs/{jobid}"],
[str(Path(__file__).parent / "runner"), f"{jobdir}"],
start_new_session=True,
stdout=open(args.output, "w", encoding="utf-8"), # noqa: SIM115
stderr=open(args.error, "w", encoding="utf-8"), # noqa: SIM115
Expand Down
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/bin/scancel
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ set -e
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
jobid=$1

if ! [ -f "${jobdir}/${jobid}.pid" ]
if ! [ -f "${jobdir}/${jobid}/pid" ]
then
echo "No such job ${jobid}" >&2
exit 1
fi

pid=$(cat "${jobdir}/${jobid}.pid")
pid=$(cat "${jobdir}/${jobid}/pid")
kill $pid
echo "yes" > "${jobdir}/${jobid}.cancelled"
echo "yes" > "${jobdir}/${jobid}/cancelled"
14 changes: 7 additions & 7 deletions tests/ert/unit_tests/scheduler/bin/scontrol.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""

import argparse
import glob
import os
from pathlib import Path
from typing import Literal
Expand All @@ -31,14 +30,15 @@ def main() -> None:

jobs_path = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"

for pidfile in glob.glob(f"{jobs_path}/*.pid"):
job = pidfile.split("/")[-1].split(".")[0]
for job_dir in jobs_path.iterdir():
pidfile = job_dir / "pid"
job = job_dir.name
if args.jobid and job != args.jobid:
continue
pid = read(Path(pidfile))
returncode = read(jobs_path / f"{job}.returncode")
name = read(jobs_path / f"{job}.name")
cancelled = read(jobs_path / f"{job}.cancelled", default="no")
pid = read(pidfile)
returncode = read(job_dir / "returncode")
name = read(job_dir / "name")
cancelled = read(job_dir / "cancelled", default="no")
state: JobState = "PENDING"

if pid is not None and returncode is None:
Expand Down
Loading

0 comments on commit 162e24e

Please sign in to comment.