From 379b5ada9deda73c472324db992fcbbba8f48fa4 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Mon, 8 Apr 2024 09:51:38 +0200
Subject: [PATCH] Explicitly passing visible devices to isolation process
 (#177)

---
 examples/pytorch_bert.yaml                    |  3 ++
 optimum_benchmark/backends/config.py          |  7 ++-
 optimum_benchmark/launchers/config.py         |  8 +++-
 optimum_benchmark/launchers/inline/config.py  |  3 ++
 .../launchers/isolation_utils.py              | 46 ++++++++++++-------
 optimum_benchmark/launchers/process/config.py |  4 +-
 .../launchers/torchrun/config.py              |  4 +-
 7 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml
index 55ef6822..af98cdc8 100644
--- a/examples/pytorch_bert.yaml
+++ b/examples/pytorch_bert.yaml
@@ -9,6 +9,9 @@ defaults:
 
 experiment_name: pytorch_bert
 
+launcher:
+  device_isolation: true
+
 benchmark:
   latency: true
   memory: true
diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py
index c6d36534..72034aa5 100644
--- a/optimum_benchmark/backends/config.py
+++ b/optimum_benchmark/backends/config.py
@@ -26,9 +26,8 @@ class BackendConfig(ABC):
     version: str
     _target_: str
 
-    model: Optional[str] = None
-
     task: Optional[str] = None
+    model: Optional[str] = None
     library: Optional[str] = None
 
     device: Optional[str] = None
@@ -74,11 +73,11 @@ def __post_init__(self):
             if is_nvidia_system():
                 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
                 os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids
-
             elif is_rocm_system():
                 # https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html
-                # ROCR_VISIBLE_DEVICES is better than HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES
                 os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids
+            else:
+                raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")
 
         if self.library not in ["transformers", "diffusers", "timm"]:
             raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}")
diff --git a/optimum_benchmark/launchers/config.py b/optimum_benchmark/launchers/config.py
index 1bed8ae8..2fe852a4 100644
--- a/optimum_benchmark/launchers/config.py
+++ b/optimum_benchmark/launchers/config.py
@@ -16,8 +16,12 @@ class LauncherConfig(ABC):
     device_isolation: bool = False
 
     def __post_init__(self):
-        if not is_nvidia_system() and not is_rocm_system():
-            raise ValueError("Device isolation is not supported on NVIDIA or ROCm systems")
+        if self.device_isolation and not is_nvidia_system() and not is_rocm_system():
+            raise ValueError(
+                "Device isolation is only supported on NVIDIA and ROCm systems. "
+                "Please set `device_isolation` to False or make sure your drivers "
+                "are correctly installed by running `nvidia-smi` or `rocm-smi`."
+            )
 
 
 LauncherConfigT = TypeVar("LauncherConfigT", bound=LauncherConfig)
diff --git a/optimum_benchmark/launchers/inline/config.py b/optimum_benchmark/launchers/inline/config.py
index f0be96e5..1e4ff9c7 100644
--- a/optimum_benchmark/launchers/inline/config.py
+++ b/optimum_benchmark/launchers/inline/config.py
@@ -10,3 +10,6 @@
 class InlineConfig(LauncherConfig):
     name: str = "inline"
     _target_: str = "optimum_benchmark.launchers.inline.launcher.InlineLauncher"
+
+    def __post_init__(self):
+        super().__post_init__()
diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py
index b6d5e0d0..435becfb 100644
--- a/optimum_benchmark/launchers/isolation_utils.py
+++ b/optimum_benchmark/launchers/isolation_utils.py
@@ -31,7 +31,7 @@ def isolation_signal_handler(signum, frame):
 signal.signal(signal.SIGUSR1, isolation_signal_handler)
 
 
-def get_nvidia_devices_pids() -> Set[int]:
+def get_nvidia_devices_pids(device_ids: str) -> Set[int]:
     if not is_pynvml_available():
         raise ValueError(
             "The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. "
@@ -41,7 +41,7 @@ def get_nvidia_devices_pids() -> Set[int]:
     pynvml.nvmlInit()
 
     devices_pids = set()
-    devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
+    devices_ids = list(map(int, device_ids.split(",")))
 
     for device_id in devices_ids:
         device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
@@ -54,7 +54,7 @@ def get_nvidia_devices_pids() -> Set[int]:
     return devices_pids
 
 
-def get_amd_devices_pids() -> Set[int]:
+def get_amd_devices_pids(device_ids: str) -> Set[int]:
     if not is_amdsmi_available():
         raise ValueError(
             "The library amdsmi is required to get the pids running on AMD GPUs, but is not installed. "
@@ -64,7 +64,7 @@ def get_amd_devices_pids() -> Set[int]:
     amdsmi.amdsmi_init()
 
     devices_pids = set()
-    devices_ids = [int(device_id) for device_id in os.environ["ROCR_VISIBLE_DEVICES"].split(",")]
+    devices_ids = list(map(int, device_ids.split(",")))
 
     processor_handles = amdsmi.amdsmi_get_processor_handles()
     for device_id in devices_ids:
@@ -93,24 +93,25 @@ def get_amd_devices_pids() -> Set[int]:
     return devices_pids
 
 
-def get_pids_running_on_system_devices() -> Set[int]:
+def get_pids_running_on_system_devices(device_ids: str) -> Set[int]:
     """Returns the set of pids running on the system device(s)."""
     if is_nvidia_system():
-        devices_pids = get_nvidia_devices_pids()
+        devices_pids = get_nvidia_devices_pids(device_ids)
     elif is_rocm_system():
-        devices_pids = get_amd_devices_pids()
+        devices_pids = get_amd_devices_pids(device_ids)
     else:
         raise ValueError("get_pids_running_on_system_device is only supported on NVIDIA and AMD GPUs")
 
     return devices_pids
 
 
-def assert_system_devices_isolation(isolated_pid: int) -> None:
+def assert_system_devices_isolation(isolated_pid: int, device_ids: str):
     setup_logging("ERROR")
+
     isolation_pid = os.getpid()
 
     while psutil.pid_exists(isolated_pid):
-        devices_pids = get_pids_running_on_system_devices()
+        devices_pids = get_pids_running_on_system_devices(device_ids=device_ids)
         devices_pids = {pid for pid in devices_pids if psutil.pid_exists(pid)}
         isolated_children_pids = {child.pid for child in psutil.Process(isolated_pid).children(recursive=True)}
         isolation_children_pids = {child.pid for child in psutil.Process(isolation_pid).children(recursive=True)}
@@ -141,19 +142,32 @@ def device_isolation(enabled: bool, isolated_pid: int):
         yield
         return
 
+    if is_nvidia_system():
+        device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    elif is_rocm_system():
+        device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None)
+    else:
+        raise ValueError("Device isolation is only supported on NVIDIA and AMD GPUs")
+
+    if device_ids is None:
+        raise ValueError(
+            "Device isolation requires CUDA_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES to be set but none were found."
+        )
+
     isolation_process = Process(
         target=assert_system_devices_isolation,
-        kwargs={"isolated_pid": isolated_pid},
+        kwargs={"isolated_pid": isolated_pid, "device_ids": device_ids},
         daemon=True,
     )
     isolation_process.start()
 
-    LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}.")
+    LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}")
+    LOGGER.info(f"\t+ Isolating device(s) [{device_ids}]")
 
     yield
 
-    LOGGER.info("\t+ Closing device(s) isolation process...")
-
-    isolation_process.kill()
-    isolation_process.join()
-    isolation_process.close()
+    if isolation_process.is_alive():
+        LOGGER.info("\t+ Closing device(s) isolation process...")
+        isolation_process.kill()
+        isolation_process.join()
+        isolation_process.close()
diff --git a/optimum_benchmark/launchers/process/config.py b/optimum_benchmark/launchers/process/config.py
index 7d2a7c3a..9d101051 100644
--- a/optimum_benchmark/launchers/process/config.py
+++ b/optimum_benchmark/launchers/process/config.py
@@ -13,6 +13,8 @@ class ProcessConfig(LauncherConfig):
 
     start_method: str = "spawn"
 
-    def __post_init__(self) -> None:
+    def __post_init__(self):
+        super().__post_init__()
+
         if self.start_method not in ["spawn", "fork"]:
             raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}")
diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py
index c1fbfc38..ff816315 100644
--- a/optimum_benchmark/launchers/torchrun/config.py
+++ b/optimum_benchmark/launchers/torchrun/config.py
@@ -48,7 +48,9 @@ class TorchrunConfig(LauncherConfig):
     # address of the local node if any. If not set, a lookup on the local machine's FQDN will be performed.
     local_addr: Optional[str] = None
 
-    def __post_init__(self) -> None:
+    def __post_init__(self):
+        super().__post_init__()
+
         if self.start_method not in ["spawn", "fork"]:
             raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}")