Skip to content

Commit

Permalink
Add test for the propagation of queue_options from ertconfig to bsub
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-eq committed Feb 25, 2025
1 parent a29d845 commit 6772ae3
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 25 deletions.
10 changes: 4 additions & 6 deletions tests/ert/unit_tests/scheduler/bin/bhist.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,16 @@ def main() -> None:

jobs_output: list[Job] = []
for job in args.jobs:
job_name: str = read(jobs_path / f"{job}.name") or "_"
job_name: str = read(jobs_path / job / "name") or "_"
assert job_name is not None

submit_time_millis: int = int(
os.path.getctime(jobs_path / f"{job}.name") * 1000
)
submit_time_millis: int = int(os.path.getctime(jobs_path / job / "name") * 1000)
pending_time_millis = int(read(jobs_path / "pendingtimemillis") or 0)
run_start_time_millis: int = submit_time_millis + pending_time_millis
end_time_millis: int = int(time.time() * 1000)
if (jobs_path / f"{job}.returncode").exists():
if (jobs_path / job / "returncode").exists():
end_time_millis = int(
os.path.getctime(jobs_path / f"{job}.returncode") * 1000
os.path.getctime(jobs_path / job / "returncode") * 1000
)
if not args.l:
print("bhist says job is done")
Expand Down
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/bin/bjobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ def main() -> None:

# this is for the bjobs call looking for exit code
if args.o.strip() == "exit_code":
returncode = read(jobs_path / f"{args.jobs[0]}.returncode")
returncode = read(jobs_path / args.jobs[0] / "returncode")
print(returncode)
return

jobs_output: list[Job] = []
for job in args.jobs:
pid = read(jobs_path / f"{job}.pid")
returncode = read(jobs_path / f"{job}.returncode")
pid = read(jobs_path / job / "pid")
returncode = read(jobs_path / job / "returncode")

state: JobState = "PEND"

Expand Down
2 changes: 1 addition & 1 deletion tests/ert/unit_tests/scheduler/bin/bkill.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def main() -> None:
jobdir = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"
killsignal = getattr(signal, args.signal)
for jobid in args.jobids:
pidfile = jobdir / f"{jobid}.pid"
pidfile = jobdir / jobid / "pid"
if not pidfile.exists():
sys.exit(1)
pid = int(pidfile.read_text(encoding="utf-8").strip())
Expand Down
26 changes: 17 additions & 9 deletions tests/ert/unit_tests/scheduler/bin/bsub
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@ set -e

name="STDIN"

while getopts "o:e:J:q:R:n:" opt
jobid="${RANDOM}"
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
mkdir -p "${jobdir}"
command_invocation_file="${jobdir}/complete_command_invocation"
echo "$0 $@" > "$command_invocation_file"

while getopts "o:e:J:q:R:n:P:" opt
do
case "$opt" in
o)
Expand All @@ -24,29 +30,31 @@ do
R)
resource_requirement=$OPTARG
;;
P)
project_code=$OPTARG
;;
*)
echo "Unprocessed option ${opt}"
;;
esac
done
shift $((OPTIND-1))

jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
jobid="${RANDOM}"
job_env_file="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.env"
job_env_file="${jobdir}/env"

mkdir -p "${PYTEST_TMP_PATH:-.}/mock_jobs"
echo $@ > "${jobdir}/${jobid}.script"
echo "$name" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.name"
echo "$resource_requirement" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.resource_requirement"
echo $@ > "${jobdir}/script"
echo "$name" > "${jobdir}/name"
echo "$resource_requirement" > "${jobdir}/resource_requirement"
touch $job_env_file



[ -n $num_cpu ] && echo "export LSB_MAX_NUM_PROCESSORS=$num_cpu" >> $job_env_file

[ -z $stdout ] && stdout="/dev/null"
[ -z $stderr ] && stderr="/dev/null"

bash "$(dirname $0)/lsfrunner" "${jobdir}/${jobid}" >$stdout 2>$stderr &
bash "$(dirname $0)/lsfrunner" "${jobdir}" >$stdout 2>$stderr &
disown

echo "Job <$jobid> is submitted to default queue <normal>."
12 changes: 6 additions & 6 deletions tests/ert/unit_tests/scheduler/bin/lsfrunner
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@ function handle_sigterm {

trap handle_sigterm SIGTERM

echo "$$" > "${job}.pid"
source "${job}.env"
bash "${job}.script" > "${job}.stdout" 2> "${job}.stderr" &
echo "$$" > "${job}/pid"
source "${job}/env"
bash "${job}/script" > "${job}/stdout" 2> "${job}/stderr" &
child_pid=$!
wait $child_pid

echo "$?" > "${job}.returncode"
echo "$?" > "${job}/returncode"
echo "Sender: Mocked LSF system <$USER@$(hostname -s)"
echo "Subject: Job $job:"
echo "[..skipped in mock..]"
echo "The output (if any) follows:"
cat ${job}.stdout
cat ${job}/stdout

cat ${job}.stderr >&2
cat ${job}/stderr >&2
66 changes: 66 additions & 0 deletions tests/ert/unit_tests/scheduler/test_lsf_driver.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import asyncio
import getopt
import json
import logging
import os
import random
import re
import shlex
import stat
import string
import time
Expand All @@ -19,6 +21,8 @@
from hypothesis import strategies as st

from ert.config import QueueConfig
from ert.config.queue_config import _parse_realization_memory_str
from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
from ert.scheduler import LsfDriver, create_driver
from ert.scheduler.driver import SIGNAL_OFFSET
from ert.scheduler.lsf_driver import (
Expand All @@ -39,6 +43,7 @@
parse_bjobs,
parse_bjobs_exec_hosts,
)
from tests.ert.ui_tests.cli.run_cli import run_cli
from tests.ert.utils import poll, wait_until

from .conftest import mock_bin
Expand Down Expand Up @@ -1364,3 +1369,64 @@ async def finished(iens: int, returncode: int):
# a controlled fashion:
if (tmp_path / "trap_handle_installed").exists():
wait_until((tmp_path / "was_killed").exists, timeout=4)


@pytest.mark.integration_test
@pytest.mark.usefixtures("copy_poly_case")
def test_queue_options_are_propagated_from_config_to_bsub():
"""
This end to end test is here to verify that queue_options are correctly propagated all the way from ert config to the cluster.
"""
expected_queue = "foo_bar_queue"
expected_resource_string = "location=foo_bar_location"
expected_realization_memory = "9GB"
expected_project_code = "foo_bar_project"
expected_excluded_hosts = "foo_host,bar_host"
expected_num_cpu = 98

with open("poly.ert", "a", encoding="utf-8") as f:
f.write(
dedent(
f"""\
NUM_CPU {expected_num_cpu}
REALIZATION_MEMORY {expected_realization_memory}
QUEUE_SYSTEM LSF
QUEUE_OPTION LSF LSF_QUEUE {expected_queue}
QUEUE_OPTION LSF LSF_RESOURCE {expected_resource_string}
QUEUE_OPTION LSF PROJECT_CODE {expected_project_code}
QUEUE_OPTION LSF EXCLUDE_HOST {expected_excluded_hosts}
NUM_REALIZATIONS 1
"""
)
)
run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")
mock_jobs_dir = Path(f"{os.environ.get('PYTEST_TMP_PATH')}/mock_jobs")
job_dir = next(
mock_jobs_dir.iterdir()
) # There is only one realization in this test
complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
encoding="utf-8"
)

args = shlex.split(complete_command_invocation)
_ = args.pop(0) # script path
opts, _ = getopt.getopt(args, "o:e:J:q:R:n:P:")
parsed_options = dict(opts) # {opt: arg for opt, arg in opts}

assert parsed_options.get("-q") == expected_queue
assert parsed_options.get("-P") == expected_project_code
assert parsed_options.get("-n") == str(expected_num_cpu)

# -R was not parsed correctly by getopt, so we read it manually instead.
complete_resource_requirement = (job_dir / "resource_requirement").read_text(
encoding="utf-8"
)
assert expected_resource_string in complete_resource_requirement
assert (
f"rusage[mem={_parse_realization_memory_str(expected_realization_memory) // 1024**2}]"
in complete_resource_requirement
)
assert (
f"select[{' && '.join(f"hname!='{host_name}'" for host_name in expected_excluded_hosts.split(','))}]"
in complete_resource_requirement
)

0 comments on commit 6772ae3

Please sign in to comment.