Add test for the propagation of queue_options from ertconfig to bsub

equinor · Feb 25, 2025 · 6772ae3 · 6772ae3
1 parent a29d845
commit 6772ae3
Show file tree

Hide file tree

Showing 6 changed files with 97 additions and 25 deletions.
diff --git a/tests/ert/unit_tests/scheduler/bin/bhist.py b/tests/ert/unit_tests/scheduler/bin/bhist.py
@@ -83,18 +83,16 @@ def main() -> None:
 
     jobs_output: list[Job] = []
     for job in args.jobs:
-        job_name: str = read(jobs_path / f"{job}.name") or "_"
+        job_name: str = read(jobs_path / job / "name") or "_"
         assert job_name is not None
 
-        submit_time_millis: int = int(
-            os.path.getctime(jobs_path / f"{job}.name") * 1000
-        )
+        submit_time_millis: int = int(os.path.getctime(jobs_path / job / "name") * 1000)
         pending_time_millis = int(read(jobs_path / "pendingtimemillis") or 0)
         run_start_time_millis: int = submit_time_millis + pending_time_millis
         end_time_millis: int = int(time.time() * 1000)
-        if (jobs_path / f"{job}.returncode").exists():
+        if (jobs_path / job / "returncode").exists():
             end_time_millis = int(
-                os.path.getctime(jobs_path / f"{job}.returncode") * 1000
+                os.path.getctime(jobs_path / job / "returncode") * 1000
             )
             if not args.l:
                 print("bhist says job is done")

diff --git a/tests/ert/unit_tests/scheduler/bin/bjobs.py b/tests/ert/unit_tests/scheduler/bin/bjobs.py
@@ -41,14 +41,14 @@ def main() -> None:
 
     # this is for the bjobs call looking for exit code
     if args.o.strip() == "exit_code":
-        returncode = read(jobs_path / f"{args.jobs[0]}.returncode")
+        returncode = read(jobs_path / args.jobs[0] / "returncode")
         print(returncode)
         return
 
     jobs_output: list[Job] = []
     for job in args.jobs:
-        pid = read(jobs_path / f"{job}.pid")
-        returncode = read(jobs_path / f"{job}.returncode")
+        pid = read(jobs_path / job / "pid")
+        returncode = read(jobs_path / job / "returncode")
 
         state: JobState = "PEND"
 

diff --git a/tests/ert/unit_tests/scheduler/bin/bkill.py b/tests/ert/unit_tests/scheduler/bin/bkill.py
@@ -20,7 +20,7 @@ def main() -> None:
     jobdir = Path(os.getenv("PYTEST_TMP_PATH", ".")) / "mock_jobs"
     killsignal = getattr(signal, args.signal)
     for jobid in args.jobids:
-        pidfile = jobdir / f"{jobid}.pid"
+        pidfile = jobdir / jobid / "pid"
         if not pidfile.exists():
             sys.exit(1)
         pid = int(pidfile.read_text(encoding="utf-8").strip())

diff --git a/tests/ert/unit_tests/scheduler/bin/bsub b/tests/ert/unit_tests/scheduler/bin/bsub
@@ -3,7 +3,13 @@ set -e
 
 name="STDIN"
 
-while getopts "o:e:J:q:R:n:" opt
+jobid="${RANDOM}"
+jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
+mkdir -p "${jobdir}"
+command_invocation_file="${jobdir}/complete_command_invocation"
+echo "$0 $@" > "$command_invocation_file"
+
+while getopts "o:e:J:q:R:n:P:" opt
 do
     case "$opt" in
         o)
@@ -24,29 +30,31 @@ do
         R)
             resource_requirement=$OPTARG
             ;;
+        P)
+            project_code=$OPTARG
+            ;;
         *)
             echo "Unprocessed option ${opt}"
             ;;
     esac
 done
 shift $((OPTIND-1))
 
-jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs"
-jobid="${RANDOM}"
-job_env_file="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.env"
+job_env_file="${jobdir}/env"
 
-mkdir -p "${PYTEST_TMP_PATH:-.}/mock_jobs"
-echo $@ > "${jobdir}/${jobid}.script"
-echo "$name" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.name"
-echo "$resource_requirement" > "${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}.resource_requirement"
+echo $@ > "${jobdir}/script"
+echo "$name" > "${jobdir}/name"
+echo "$resource_requirement" > "${jobdir}/resource_requirement"
 touch $job_env_file
 
+
+
 [ -n $num_cpu ] && echo "export LSB_MAX_NUM_PROCESSORS=$num_cpu" >> $job_env_file
 
 [ -z $stdout ] && stdout="/dev/null"
 [ -z $stderr ] && stderr="/dev/null"
 
-bash "$(dirname $0)/lsfrunner" "${jobdir}/${jobid}" >$stdout 2>$stderr &
+bash "$(dirname $0)/lsfrunner" "${jobdir}" >$stdout 2>$stderr &
 disown
 
 echo "Job <$jobid> is submitted to default queue <normal>."
diff --git a/tests/ert/unit_tests/scheduler/bin/lsfrunner b/tests/ert/unit_tests/scheduler/bin/lsfrunner
@@ -14,17 +14,17 @@ function handle_sigterm {
 
 trap handle_sigterm SIGTERM
 
-echo "$$" > "${job}.pid"
-source "${job}.env"
-bash "${job}.script" > "${job}.stdout" 2> "${job}.stderr"  &
+echo "$$" > "${job}/pid"
+source "${job}/env"
+bash "${job}/script" > "${job}/stdout" 2> "${job}/stderr"  &
 child_pid=$!
 wait $child_pid
 
-echo "$?" > "${job}.returncode"
+echo "$?" > "${job}/returncode"
 echo "Sender: Mocked LSF system <$USER@$(hostname -s)"
 echo "Subject: Job $job:"
 echo "[..skipped in mock..]"
 echo "The output (if any) follows:"
-cat ${job}.stdout
+cat ${job}/stdout
 
-cat ${job}.stderr >&2
+cat ${job}/stderr >&2
diff --git a/tests/ert/unit_tests/scheduler/test_lsf_driver.py b/tests/ert/unit_tests/scheduler/test_lsf_driver.py
@@ -1,9 +1,11 @@
 import asyncio
+import getopt
 import json
 import logging
 import os
 import random
 import re
+import shlex
 import stat
 import string
 import time
@@ -19,6 +21,8 @@
 from hypothesis import strategies as st
 
 from ert.config import QueueConfig
+from ert.config.queue_config import _parse_realization_memory_str
+from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
 from ert.scheduler import LsfDriver, create_driver
 from ert.scheduler.driver import SIGNAL_OFFSET
 from ert.scheduler.lsf_driver import (
@@ -39,6 +43,7 @@
     parse_bjobs,
     parse_bjobs_exec_hosts,
 )
+from tests.ert.ui_tests.cli.run_cli import run_cli
 from tests.ert.utils import poll, wait_until
 
 from .conftest import mock_bin
@@ -1364,3 +1369,64 @@ async def finished(iens: int, returncode: int):
     # a controlled fashion:
     if (tmp_path / "trap_handle_installed").exists():
         wait_until((tmp_path / "was_killed").exists, timeout=4)
+
+
+@pytest.mark.integration_test
+@pytest.mark.usefixtures("copy_poly_case")
+def test_queue_options_are_propagated_from_config_to_bsub():
+    """
+    This end to end test is here to verify that queue_options are correctly propagated all the way from ert config to the cluster.
+    """
+    expected_queue = "foo_bar_queue"
+    expected_resource_string = "location=foo_bar_location"
+    expected_realization_memory = "9GB"
+    expected_project_code = "foo_bar_project"
+    expected_excluded_hosts = "foo_host,bar_host"
+    expected_num_cpu = 98
+
+    with open("poly.ert", "a", encoding="utf-8") as f:
+        f.write(
+            dedent(
+                f"""\
+                NUM_CPU {expected_num_cpu}
+                REALIZATION_MEMORY {expected_realization_memory}
+                QUEUE_SYSTEM LSF
+                QUEUE_OPTION LSF LSF_QUEUE {expected_queue}
+                QUEUE_OPTION LSF LSF_RESOURCE {expected_resource_string}
+                QUEUE_OPTION LSF PROJECT_CODE {expected_project_code}
+                QUEUE_OPTION LSF EXCLUDE_HOST {expected_excluded_hosts}
+                NUM_REALIZATIONS 1
+                """
+            )
+        )
+    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")
+    mock_jobs_dir = Path(f"{os.environ.get('PYTEST_TMP_PATH')}/mock_jobs")
+    job_dir = next(
+        mock_jobs_dir.iterdir()
+    )  # There is only one realization in this test
+    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
+        encoding="utf-8"
+    )
+
+    args = shlex.split(complete_command_invocation)
+    _ = args.pop(0)  # script path
+    opts, _ = getopt.getopt(args, "o:e:J:q:R:n:P:")
+    parsed_options = dict(opts)  # {opt: arg for opt, arg in opts}
+
+    assert parsed_options.get("-q") == expected_queue
+    assert parsed_options.get("-P") == expected_project_code
+    assert parsed_options.get("-n") == str(expected_num_cpu)
+
+    # -R was not parsed correctly by getopt, so we read it manually instead.
+    complete_resource_requirement = (job_dir / "resource_requirement").read_text(
+        encoding="utf-8"
+    )
+    assert expected_resource_string in complete_resource_requirement
+    assert (
+        f"rusage[mem={_parse_realization_memory_str(expected_realization_memory) // 1024**2}]"
+        in complete_resource_requirement
+    )
+    assert (
+        f"select[{' && '.join(f"hname!='{host_name}'" for host_name in expected_excluded_hosts.split(','))}]"
+        in complete_resource_requirement
+    )