Add test for the propagation of queue_options from ertconfig to cluster

This commit adds tests that verify that all queue_options are propagated correctly to the Slurm, OpenPBS, and LSF queue systems, using our mocked binaries.
equinor · Feb 27, 2025 · a4f8e8c · a4f8e8c
1 parent 46e4d11
commit a4f8e8c
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 0 deletions.
diff --git a/tests/ert/unit_tests/scheduler/bin/bsub b/tests/ert/unit_tests/scheduler/bin/bsub
@@ -6,6 +6,8 @@ name="STDIN"
 jobid="${RANDOM}"
 jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
 mkdir -p "${jobdir}"
+command_invocation_file="${jobdir}/complete_command_invocation"
+echo "$0 $@" > "$command_invocation_file"
 
 while getopts "o:e:J:q:R:n:P:" opt
 do

diff --git a/tests/ert/unit_tests/scheduler/bin/qsub b/tests/ert/unit_tests/scheduler/bin/qsub
@@ -7,6 +7,8 @@ jobid="test${RANDOM}.localhost"
 jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
 mkdir -p "${jobdir}"
 
+command_invocation_file="${jobdir}/complete_command_invocation"
+echo "$0 $@" > "$command_invocation_file"
 job_env_file="${jobdir}/env"
 touch $job_env_file
 

diff --git a/tests/ert/unit_tests/scheduler/bin/sbatch.py b/tests/ert/unit_tests/scheduler/bin/sbatch.py
@@ -40,6 +40,7 @@ def main() -> None:
     jobdir.mkdir(parents=True, exist_ok=True)
     (jobdir / "script").write_text(args.script, encoding="utf-8")
     (jobdir / "name").write_text(args.job_name, encoding="utf-8")
+    (jobdir / "complete_command_invocation").write_text(shlex.join(sys.argv))
     env_file = jobdir / "env"
     if args.ntasks:
         env_file.write_text(

diff --git a/tests/ert/unit_tests/scheduler/test_lsf_driver.py b/tests/ert/unit_tests/scheduler/test_lsf_driver.py
@@ -19,6 +19,8 @@
 from hypothesis import strategies as st
 
 from ert.config import QueueConfig
+from ert.config.queue_config import _parse_realization_memory_str
+from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
 from ert.scheduler import LsfDriver, create_driver
 from ert.scheduler.driver import SIGNAL_OFFSET
 from ert.scheduler.lsf_driver import (
@@ -39,6 +41,7 @@
     parse_bjobs,
     parse_bjobs_exec_hosts,
 )
+from tests.ert.ui_tests.cli.run_cli import run_cli
 from tests.ert.utils import poll, wait_until
 
 from .conftest import mock_bin
@@ -1364,3 +1367,60 @@ async def finished(iens: int, returncode: int):
     # a controlled fashion:
     if (tmp_path / "trap_handle_installed").exists():
         wait_until((tmp_path / "was_killed").exists, timeout=4)
+
+
+@pytest.mark.integration_test
+@pytest.mark.usefixtures("copy_poly_case")
+def test_queue_options_are_propagated_from_config_to_bsub(monkeypatch):
+    """
+    End-to-end check that LSF queue options written to the ert config end up
+    in the submission made through the mocked ``bsub`` binary.
+
+    The test appends LSF settings to ``poly.ert``, runs an ensemble
+    experiment with a single realization, and then inspects two files in the
+    mock job directory: ``complete_command_invocation`` (the recorded bsub
+    command line) and ``resource_requirement``.
+    """
+    mock_bin(monkeypatch, os.getcwd())
+    expected_queue = "foo_bar_queue"
+    expected_resource_string = "location=foo_bar_location"
+    expected_realization_memory = "9GB"
+    expected_project_code = "foo_bar_project"
+    expected_excluded_hosts = "foo_host,bar_host"
+    expected_num_cpu = 98
+
+    with open("poly.ert", "a", encoding="utf-8") as f:
+        f.write(
+            dedent(
+                f"""\
+                NUM_CPU {expected_num_cpu}
+                REALIZATION_MEMORY {expected_realization_memory}
+                QUEUE_SYSTEM LSF
+                QUEUE_OPTION LSF LSF_QUEUE {expected_queue}
+                QUEUE_OPTION LSF LSF_RESOURCE {expected_resource_string}
+                QUEUE_OPTION LSF PROJECT_CODE {expected_project_code}
+                QUEUE_OPTION LSF EXCLUDE_HOST {expected_excluded_hosts}
+                NUM_REALIZATIONS 1
+                """
+            )
+        )
+    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")
+
+    # Mirror the mocked bsub's own lookup (${PYTEST_TMP_PATH:-.}/mock_jobs) so
+    # an unset variable does not silently turn into the path "None/mock_jobs".
+    mock_jobs_dir = Path(os.environ.get("PYTEST_TMP_PATH") or ".") / "mock_jobs"
+    # There is only one realization in this test, so the first entry is used.
+    job_dir = next(mock_jobs_dir.iterdir())
+    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
+        encoding="utf-8"
+    )
+
+    assert f"-q {expected_queue}" in complete_command_invocation
+    assert f"-P {expected_project_code}" in complete_command_invocation
+    assert f"-n {expected_num_cpu}" in complete_command_invocation
+
+    complete_resource_requirement = (job_dir / "resource_requirement").read_text(
+        encoding="utf-8"
+    )
+    # Build the expected fragments up front instead of nesting them inside
+    # f-string replacement fields.
+    expected_rusage_mem = (
+        _parse_realization_memory_str(expected_realization_memory) // 1024**2
+    )
+    expected_host_exclusion = " && ".join(
+        f"hname!='{host_name}'" for host_name in expected_excluded_hosts.split(",")
+    )
+
+    assert expected_resource_string in complete_resource_requirement
+    assert f"rusage[mem={expected_rusage_mem}]" in complete_resource_requirement
+    assert f"select[{expected_host_exclusion}]" in complete_resource_requirement
diff --git a/tests/ert/unit_tests/scheduler/test_openpbs_driver.py b/tests/ert/unit_tests/scheduler/test_openpbs_driver.py
@@ -14,6 +14,7 @@
 from hypothesis import strategies as st
 
 from ert.cli.main import ErtCliError
+from ert.config.queue_config import _parse_realization_memory_str
 from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
 from ert.scheduler.openpbs_driver import (
     JOB_STATES,
@@ -606,3 +607,45 @@ def test_openpbs_driver_with_poly_example_failing_poll_fails_ert_and_propagates_
             "poly.ert",
         )
     assert "RuntimeError: Status polling failed" in caplog.text
+
+
+@pytest.mark.integration_test
+@pytest.mark.usefixtures("copy_poly_case")
+def test_queue_options_are_propagated_from_config_to_qsub(monkeypatch):
+    """
+    End-to-end check that TORQUE queue options written to the ert config end
+    up on the command line of the mocked ``qsub`` binary.
+
+    The test appends TORQUE settings to ``poly.ert``, runs an ensemble
+    experiment with a single realization, and then inspects the
+    ``complete_command_invocation`` file in the mock job directory.
+    """
+    mock_bin(monkeypatch, os.getcwd())
+    expected_queue = "foo_bar_queue"
+    expected_realization_memory = "9GB"
+    expected_project_code = "foo_bar_project"
+    expected_cluster_label = "foo_bar_cluster"
+    expected_num_cpu = 98
+
+    with open("poly.ert", "a", encoding="utf-8") as f:
+        f.write(
+            dedent(
+                f"""\
+                NUM_CPU {expected_num_cpu}
+                REALIZATION_MEMORY {expected_realization_memory}
+                QUEUE_SYSTEM TORQUE
+                QUEUE_OPTION TORQUE QUEUE {expected_queue}
+                QUEUE_OPTION TORQUE CLUSTER_LABEL {expected_cluster_label}
+                QUEUE_OPTION TORQUE PROJECT_CODE {expected_project_code}
+                NUM_REALIZATIONS 1
+                """
+            )
+        )
+    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")
+
+    # Mirror the mocked qsub's own lookup (${PYTEST_TMP_PATH:-.}/mock_jobs)
+    # rather than assuming the job directory is relative to the current
+    # working directory.
+    mock_jobs_dir = Path(os.environ.get("PYTEST_TMP_PATH") or ".") / "mock_jobs"
+    # There is only one realization in this test, so the first entry is used.
+    job_dir = next(mock_jobs_dir.iterdir())
+    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
+        encoding="utf-8"
+    )
+
+    expected_memory_mb = (
+        _parse_realization_memory_str(expected_realization_memory) // 1024**2
+    )
+
+    assert f"-q {expected_queue}" in complete_command_invocation
+    assert f"-A {expected_project_code}" in complete_command_invocation
+    assert (
+        f"-l ncpus={expected_num_cpu}:mem={expected_memory_mb}mb"
+        in complete_command_invocation
+    )
+    assert f"-l {expected_cluster_label}" in complete_command_invocation
diff --git a/tests/ert/unit_tests/scheduler/test_slurm_driver.py b/tests/ert/unit_tests/scheduler/test_slurm_driver.py
@@ -7,13 +7,16 @@
 import sys
 from contextlib import ExitStack as does_not_raise
 from pathlib import Path
+from textwrap import dedent
 
 import pytest
 from hypothesis import given
 from hypothesis import strategies as st
 
+from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
 from ert.scheduler import SlurmDriver
 from ert.scheduler.slurm_driver import _seconds_to_slurm_time_format
+from tests.ert.ui_tests.cli.run_cli import run_cli
 from tests.ert.utils import poll
 
 from .conftest import mock_bin
@@ -457,3 +460,58 @@ async def test_slurm_uses_sacct(
 
     # Make sure sacct was tried:
     assert "scontrol failed, trying sacct" in caplog.text
+
+
+from ert.config.queue_config import _parse_realization_memory_str
+
+
+@pytest.mark.integration_test
+@pytest.mark.usefixtures("copy_poly_case")
+def test_queue_options_are_propagated_from_config_to_sbatch(monkeypatch):
+    """
+    End-to-end check that SLURM queue options written to the ert config end
+    up on the command line of the mocked ``sbatch`` binary.
+
+    The test appends SLURM settings to ``poly.ert``, runs an ensemble
+    experiment with a single realization, and then inspects the
+    ``complete_command_invocation`` file in the mock job directory.
+    """
+    mock_bin(monkeypatch, os.getcwd())
+    expected_partition = "foo_bar_partition"
+    expected_realization_memory = "9GB"
+    expected_project_code = "foo_bar_project"
+    expected_exclude_hosts = "not_foohost,not_barhost"
+    expected_include_hosts = "foohost,barhost"
+    expected_max_runtime = 99
+    expected_num_cpu = 98
+
+    with open("poly.ert", "a", encoding="utf-8") as f:
+        f.write(
+            dedent(
+                f"""\
+                NUM_CPU {expected_num_cpu}
+                REALIZATION_MEMORY {expected_realization_memory}
+                QUEUE_SYSTEM SLURM
+                QUEUE_OPTION SLURM PARTITION {expected_partition}
+                QUEUE_OPTION SLURM INCLUDE_HOST {expected_include_hosts}
+                QUEUE_OPTION SLURM EXCLUDE_HOST {expected_exclude_hosts}
+                QUEUE_OPTION SLURM PROJECT_CODE {expected_project_code}
+                QUEUE_OPTION SLURM MAX_RUNTIME {expected_max_runtime}
+                NUM_REALIZATIONS 1
+                """
+            )
+        )
+    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")
+
+    # Index the environment directly: an unset PYTEST_TMP_PATH then fails with
+    # a clear KeyError instead of silently looking in "None/mock_jobs".
+    mock_jobs_dir = Path(os.environ["PYTEST_TMP_PATH"]) / "mock_jobs"
+    # There is only one realization in this test, so the first entry is used.
+    job_dir = next(mock_jobs_dir.iterdir())
+    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
+        encoding="utf-8"
+    )
+
+    # Computed outside the f-strings: a line break inside the replacement
+    # field of a single-quoted f-string is a SyntaxError before Python 3.12.
+    expected_memory_mb = (
+        _parse_realization_memory_str(expected_realization_memory) // 1024**2
+    )
+    expected_time = _seconds_to_slurm_time_format(expected_max_runtime)
+
+    assert f"--ntasks={expected_num_cpu}" in complete_command_invocation
+    assert f"--mem={expected_memory_mb}M" in complete_command_invocation
+
+    assert f"--nodelist={expected_include_hosts}" in complete_command_invocation
+    assert f"--exclude={expected_exclude_hosts}" in complete_command_invocation
+    assert f"--time={expected_time}" in complete_command_invocation
+
+    assert f"--partition={expected_partition}" in complete_command_invocation
+    assert f"--account={expected_project_code}" in complete_command_invocation