Skip to content

Commit

Permalink
Add test for the propagation of queue_options from ertconfig to cluster
Browse files Browse the repository at this point in the history
This commit adds tests that verify that all queue_options are propagated correctly to the Slurm, OpenPBS, and LSF queue systems, using
our mocked binaries.
  • Loading branch information
jonathan-eq committed Feb 27, 2025
1 parent 46e4d11 commit a4f8e8c
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tests/ert/unit_tests/scheduler/bin/bsub
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ name="STDIN"
jobid="${RANDOM}"
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
mkdir -p "${jobdir}"
# Record the full command line this mock was called with so tests can
# inspect which options were passed. "$*" joins the arguments into a
# single word; "$@" inside a larger string mixes string and array expansion.
command_invocation_file="${jobdir}/complete_command_invocation"
echo "$0 $*" > "$command_invocation_file"

while getopts "o:e:J:q:R:n:P:" opt
do
Expand Down
2 changes: 2 additions & 0 deletions tests/ert/unit_tests/scheduler/bin/qsub
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ jobid="test${RANDOM}.localhost"
jobdir="${PYTEST_TMP_PATH:-.}/mock_jobs/${jobid}"
mkdir -p "${jobdir}"

# Record the full command line this mock was called with so tests can
# inspect which options were passed. "$*" joins the arguments into a
# single word; "$@" inside a larger string mixes string and array expansion.
command_invocation_file="${jobdir}/complete_command_invocation"
echo "$0 $*" > "$command_invocation_file"
job_env_file="${jobdir}/env"
# Quoted so a job directory containing whitespace is not word-split.
touch "$job_env_file"

Expand Down
1 change: 1 addition & 0 deletions tests/ert/unit_tests/scheduler/bin/sbatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def main() -> None:
jobdir.mkdir(parents=True, exist_ok=True)
(jobdir / "script").write_text(args.script, encoding="utf-8")
(jobdir / "name").write_text(args.job_name, encoding="utf-8")
(jobdir / "complete_command_invocation").write_text(shlex.join(sys.argv))
env_file = jobdir / "env"
if args.ntasks:
env_file.write_text(
Expand Down
60 changes: 60 additions & 0 deletions tests/ert/unit_tests/scheduler/test_lsf_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from hypothesis import strategies as st

from ert.config import QueueConfig
from ert.config.queue_config import _parse_realization_memory_str
from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
from ert.scheduler import LsfDriver, create_driver
from ert.scheduler.driver import SIGNAL_OFFSET
from ert.scheduler.lsf_driver import (
Expand All @@ -39,6 +41,7 @@
parse_bjobs,
parse_bjobs_exec_hosts,
)
from tests.ert.ui_tests.cli.run_cli import run_cli
from tests.ert.utils import poll, wait_until

from .conftest import mock_bin
Expand Down Expand Up @@ -1364,3 +1367,60 @@ async def finished(iens: int, returncode: int):
# a controlled fashion:
if (tmp_path / "trap_handle_installed").exists():
wait_until((tmp_path / "was_killed").exists, timeout=4)


@pytest.mark.integration_test
@pytest.mark.usefixtures("copy_poly_case")
def test_queue_options_are_propagated_from_config_to_bsub(monkeypatch):
    """Verify that LSF queue options reach the ``bsub`` command line.

    End-to-end test: queue settings are appended to ``poly.ert``, an
    ensemble experiment is run against the mocked binaries, and the files
    left in the mock job directory are checked for the queue, project code,
    CPU count, resource string, memory reservation and excluded hosts.
    """
    mock_bin(monkeypatch, os.getcwd())
    expected_queue = "foo_bar_queue"
    expected_resource_string = "location=foo_bar_location"
    expected_realization_memory = "9GB"
    expected_project_code = "foo_bar_project"
    expected_excluded_hosts = "foo_host,bar_host"
    expected_num_cpu = 98

    with open("poly.ert", "a", encoding="utf-8") as f:
        f.write(
            dedent(
                f"""\
                NUM_CPU {expected_num_cpu}
                REALIZATION_MEMORY {expected_realization_memory}
                QUEUE_SYSTEM LSF
                QUEUE_OPTION LSF LSF_QUEUE {expected_queue}
                QUEUE_OPTION LSF LSF_RESOURCE {expected_resource_string}
                QUEUE_OPTION LSF PROJECT_CODE {expected_project_code}
                QUEUE_OPTION LSF EXCLUDE_HOST {expected_excluded_hosts}
                NUM_REALIZATIONS 1
                """
            )
        )
    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")

    # Resolve the job directory the same way the mock bsub does
    # (``${PYTEST_TMP_PATH:-.}/mock_jobs``) instead of interpolating a
    # literal "None" into the path when the variable is unset.
    mock_jobs_dir = Path(os.environ.get("PYTEST_TMP_PATH", ".")) / "mock_jobs"
    # There is only one realization in this test
    job_dir = next(mock_jobs_dir.iterdir())
    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
        encoding="utf-8"
    )

    assert f"-q {expected_queue}" in complete_command_invocation
    assert f"-P {expected_project_code}" in complete_command_invocation
    assert f"-n {expected_num_cpu}" in complete_command_invocation

    # NOTE(review): this file is presumably written by the mock bsub while
    # parsing ``-R``; that part of the script is not visible here.
    complete_resource_requirement = (job_dir / "resource_requirement").read_text(
        encoding="utf-8"
    )
    assert expected_resource_string in complete_resource_requirement

    expected_mem = _parse_realization_memory_str(expected_realization_memory) // 1024**2
    assert f"rusage[mem={expected_mem}]" in complete_resource_requirement

    # Every excluded host must appear as its own ``hname!=`` condition.
    expected_select = " && ".join(
        f"hname!='{host_name}'" for host_name in expected_excluded_hosts.split(",")
    )
    assert f"select[{expected_select}]" in complete_resource_requirement
43 changes: 43 additions & 0 deletions tests/ert/unit_tests/scheduler/test_openpbs_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from hypothesis import strategies as st

from ert.cli.main import ErtCliError
from ert.config.queue_config import _parse_realization_memory_str
from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
from ert.scheduler.openpbs_driver import (
JOB_STATES,
Expand Down Expand Up @@ -606,3 +607,45 @@ def test_openpbs_driver_with_poly_example_failing_poll_fails_ert_and_propagates_
"poly.ert",
)
assert "RuntimeError: Status polling failed" in caplog.text


@pytest.mark.integration_test
@pytest.mark.usefixtures("copy_poly_case")
def test_queue_options_are_propagated_from_config_to_qsub(monkeypatch):
    """Verify that TORQUE queue options reach the ``qsub`` command line.

    End-to-end test: queue settings are appended to ``poly.ert``, an
    ensemble experiment is run against the mocked binaries, and the command
    line recorded by the mock ``qsub`` is checked for the queue, project
    code, CPU/memory resource request and cluster label.
    """
    mock_bin(monkeypatch, os.getcwd())
    expected_queue = "foo_bar_queue"
    expected_realization_memory = "9GB"
    expected_project_code = "foo_bar_project"
    expected_cluster_label = "foo_bar_cluster"
    expected_num_cpu = 98

    with open("poly.ert", "a", encoding="utf-8") as f:
        f.write(
            dedent(
                f"""\
                NUM_CPU {expected_num_cpu}
                REALIZATION_MEMORY {expected_realization_memory}
                QUEUE_SYSTEM TORQUE
                QUEUE_OPTION TORQUE QUEUE {expected_queue}
                QUEUE_OPTION TORQUE CLUSTER_LABEL {expected_cluster_label}
                QUEUE_OPTION TORQUE PROJECT_CODE {expected_project_code}
                NUM_REALIZATIONS 1
                """
            )
        )
    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")

    # Resolve the job directory the same way the mock qsub does
    # (``${PYTEST_TMP_PATH:-.}/mock_jobs``) rather than relying on the
    # current working directory alone.
    mock_jobs_dir = Path(os.environ.get("PYTEST_TMP_PATH", ".")) / "mock_jobs"
    # There is only one realization in this test
    job_dir = next(mock_jobs_dir.iterdir())
    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
        encoding="utf-8"
    )

    expected_mem_mb = (
        _parse_realization_memory_str(expected_realization_memory) // 1024**2
    )

    assert f"-q {expected_queue}" in complete_command_invocation
    assert f"-A {expected_project_code}" in complete_command_invocation
    assert (
        f"-l ncpus={expected_num_cpu}:mem={expected_mem_mb}mb"
        in complete_command_invocation
    )
    assert f"-l {expected_cluster_label}" in complete_command_invocation
58 changes: 58 additions & 0 deletions tests/ert/unit_tests/scheduler/test_slurm_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@
import sys
from contextlib import ExitStack as does_not_raise
from pathlib import Path
from textwrap import dedent

import pytest
from hypothesis import given
from hypothesis import strategies as st

from ert.mode_definitions import ENSEMBLE_EXPERIMENT_MODE
from ert.scheduler import SlurmDriver
from ert.scheduler.slurm_driver import _seconds_to_slurm_time_format
from tests.ert.ui_tests.cli.run_cli import run_cli
from tests.ert.utils import poll

from .conftest import mock_bin
Expand Down Expand Up @@ -457,3 +460,58 @@ async def test_slurm_uses_sacct(

# Make sure sacct was tried:
assert "scontrol failed, trying sacct" in caplog.text


from ert.config.queue_config import _parse_realization_memory_str


@pytest.mark.integration_test
@pytest.mark.usefixtures("copy_poly_case")
def test_queue_options_are_propagated_from_config_to_sbatch(monkeypatch):
    """Verify that SLURM queue options reach the ``sbatch`` command line.

    End-to-end test: queue settings are appended to ``poly.ert``, an
    ensemble experiment is run against the mocked binaries, and the command
    line recorded by the mock ``sbatch`` is checked for the task count,
    memory, included/excluded hosts, time limit, partition and account.
    """
    mock_bin(monkeypatch, os.getcwd())
    expected_partition = "foo_bar_partition"
    expected_realization_memory = "9GB"
    expected_project_code = "foo_bar_project"
    expected_exclude_hosts = "not_foohost,not_barhost"
    expected_include_hosts = "foohost,barhost"
    expected_max_runtime = 99
    expected_num_cpu = 98

    with open("poly.ert", "a", encoding="utf-8") as f:
        f.write(
            dedent(
                f"""\
                NUM_CPU {expected_num_cpu}
                REALIZATION_MEMORY {expected_realization_memory}
                QUEUE_SYSTEM SLURM
                QUEUE_OPTION SLURM PARTITION {expected_partition}
                QUEUE_OPTION SLURM INCLUDE_HOST {expected_include_hosts}
                QUEUE_OPTION SLURM EXCLUDE_HOST {expected_exclude_hosts}
                QUEUE_OPTION SLURM PROJECT_CODE {expected_project_code}
                QUEUE_OPTION SLURM MAX_RUNTIME {expected_max_runtime}
                NUM_REALIZATIONS 1
                """
            )
        )
    run_cli(ENSEMBLE_EXPERIMENT_MODE, "--disable-monitoring", "poly.ert")

    mock_jobs_dir = Path(f"{os.environ.get('PYTEST_TMP_PATH')}/mock_jobs")
    # There is only one realization in this test
    job_dir = next(mock_jobs_dir.iterdir())
    complete_command_invocation = (job_dir / "complete_command_invocation").read_text(
        encoding="utf-8"
    )

    # Computed up front: a line break inside the replacement field of a
    # single-quoted f-string is a SyntaxError before Python 3.12.
    expected_mem_mb = (
        _parse_realization_memory_str(expected_realization_memory) // 1024**2
    )
    expected_time = _seconds_to_slurm_time_format(expected_max_runtime)

    assert f"--ntasks={expected_num_cpu}" in complete_command_invocation
    assert f"--mem={expected_mem_mb}M" in complete_command_invocation

    assert f"--nodelist={expected_include_hosts}" in complete_command_invocation
    assert f"--exclude={expected_exclude_hosts}" in complete_command_invocation
    assert f"--time={expected_time}" in complete_command_invocation

    assert f"--partition={expected_partition}" in complete_command_invocation
    assert f"--account={expected_project_code}" in complete_command_invocation

0 comments on commit a4f8e8c

Please sign in to comment.