From 379b5ada9deda73c472324db992fcbbba8f48fa4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:51:38 +0200 Subject: [PATCH] Explicitly passing visible devices to isolation process (#177) --- examples/pytorch_bert.yaml | 3 ++ optimum_benchmark/backends/config.py | 7 ++- optimum_benchmark/launchers/config.py | 8 +++- optimum_benchmark/launchers/inline/config.py | 3 ++ .../launchers/isolation_utils.py | 46 ++++++++++++------- optimum_benchmark/launchers/process/config.py | 4 +- .../launchers/torchrun/config.py | 4 +- 7 files changed, 51 insertions(+), 24 deletions(-) diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml index 55ef6822..af98cdc8 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/pytorch_bert.yaml @@ -9,6 +9,9 @@ defaults: experiment_name: pytorch_bert +launcher: + device_isolation: true + benchmark: latency: true memory: true diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index c6d36534..72034aa5 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -26,9 +26,8 @@ class BackendConfig(ABC): version: str _target_: str - model: Optional[str] = None - task: Optional[str] = None + model: Optional[str] = None library: Optional[str] = None device: Optional[str] = None @@ -74,11 +73,11 @@ def __post_init__(self): if is_nvidia_system(): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids - elif is_rocm_system(): # https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html - # ROCR_VISIBLE_DEVICES is better than HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids + else: + raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.") if self.library not in ["transformers", "diffusers", "timm"]: raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}") diff --git a/optimum_benchmark/launchers/config.py b/optimum_benchmark/launchers/config.py index 1bed8ae8..2fe852a4 100644 --- a/optimum_benchmark/launchers/config.py +++ b/optimum_benchmark/launchers/config.py @@ -16,8 +16,12 @@ class LauncherConfig(ABC): device_isolation: bool = False def __post_init__(self): - if not is_nvidia_system() and not is_rocm_system(): - raise ValueError("Device isolation is not supported on NVIDIA or ROCm systems") + if self.device_isolation and not is_nvidia_system() and not is_rocm_system(): + raise ValueError( + "Device isolation is only supported on NVIDIA and ROCm systems. " + "Please set `device_isolation` to False or make sure your drivers " + "are correctly installed by running `nvidia-smi` or `rocm-smi`." + ) LauncherConfigT = TypeVar("LauncherConfigT", bound=LauncherConfig) diff --git a/optimum_benchmark/launchers/inline/config.py b/optimum_benchmark/launchers/inline/config.py index f0be96e5..1e4ff9c7 100644 --- a/optimum_benchmark/launchers/inline/config.py +++ b/optimum_benchmark/launchers/inline/config.py @@ -10,3 +10,6 @@ class InlineConfig(LauncherConfig): name: str = "inline" _target_: str = "optimum_benchmark.launchers.inline.launcher.InlineLauncher" + + def __post_init__(self): + super().__post_init__() diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py index b6d5e0d0..435becfb 100644 --- a/optimum_benchmark/launchers/isolation_utils.py +++ b/optimum_benchmark/launchers/isolation_utils.py @@ -31,7 +31,7 @@ def isolation_signal_handler(signum, frame): signal.signal(signal.SIGUSR1, isolation_signal_handler) -def get_nvidia_devices_pids() -> Set[int]: +def get_nvidia_devices_pids(device_ids: str) -> Set[int]: if not is_pynvml_available(): raise ValueError( "The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. " @@ -41,7 +41,7 @@ def get_nvidia_devices_pids() -> Set[int]: pynvml.nvmlInit() devices_pids = set() - devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] + devices_ids = list(map(int, device_ids.split(","))) for device_id in devices_ids: device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) @@ -54,7 +54,7 @@ def get_nvidia_devices_pids() -> Set[int]: return devices_pids -def get_amd_devices_pids() -> Set[int]: +def get_amd_devices_pids(device_ids: str) -> Set[int]: if not is_amdsmi_available(): raise ValueError( "The library amdsmi is required to get the pids running on AMD GPUs, but is not installed. " @@ -64,7 +64,7 @@ def get_amd_devices_pids() -> Set[int]: amdsmi.amdsmi_init() devices_pids = set() - devices_ids = [int(device_id) for device_id in os.environ["ROCR_VISIBLE_DEVICES"].split(",")] + devices_ids = list(map(int, device_ids.split(","))) processor_handles = amdsmi.amdsmi_get_processor_handles() for device_id in devices_ids: @@ -93,24 +93,25 @@ def get_amd_devices_pids() -> Set[int]: return devices_pids -def get_pids_running_on_system_devices() -> Set[int]: +def get_pids_running_on_system_devices(device_ids: str) -> Set[int]: """Returns the set of pids running on the system device(s).""" if is_nvidia_system(): - devices_pids = get_nvidia_devices_pids() + devices_pids = get_nvidia_devices_pids(device_ids) elif is_rocm_system(): - devices_pids = get_amd_devices_pids() + devices_pids = get_amd_devices_pids(device_ids) else: raise ValueError("get_pids_running_on_system_device is only supported on NVIDIA and AMD GPUs") return devices_pids -def assert_system_devices_isolation(isolated_pid: int) -> None: +def assert_system_devices_isolation(isolated_pid: int, device_ids: str): setup_logging("ERROR") + isolation_pid = os.getpid() while psutil.pid_exists(isolated_pid): - devices_pids = get_pids_running_on_system_devices() + devices_pids = get_pids_running_on_system_devices(device_ids=device_ids) devices_pids = {pid for pid in devices_pids if psutil.pid_exists(pid)} isolated_children_pids = {child.pid for child in psutil.Process(isolated_pid).children(recursive=True)} isolation_children_pids = {child.pid for child in psutil.Process(isolation_pid).children(recursive=True)} @@ -141,19 +142,32 @@ def device_isolation(enabled: bool, isolated_pid: int): yield return + if is_nvidia_system(): + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None) + elif is_rocm_system(): + device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None) + else: + raise ValueError("Device isolation is only supported on NVIDIA and AMD GPUs") + + if device_ids is None: + raise ValueError( + "Device isolation requires CUDA_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES to be set but none were found." + ) + isolation_process = Process( target=assert_system_devices_isolation, - kwargs={"isolated_pid": isolated_pid}, + kwargs={"isolated_pid": isolated_pid, "device_ids": device_ids}, daemon=True, ) isolation_process.start() - LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}.") + LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}") + LOGGER.info(f"\t+ Isolating device(s) [{device_ids}]") yield - LOGGER.info("\t+ Closing device(s) isolation process...") - - isolation_process.kill() - isolation_process.join() - isolation_process.close() + if isolation_process.is_alive(): + LOGGER.info("\t+ Closing device(s) isolation process...") + isolation_process.kill() + isolation_process.join() + isolation_process.close() diff --git a/optimum_benchmark/launchers/process/config.py b/optimum_benchmark/launchers/process/config.py index 7d2a7c3a..9d101051 100644 --- a/optimum_benchmark/launchers/process/config.py +++ b/optimum_benchmark/launchers/process/config.py @@ -13,6 +13,8 @@ class ProcessConfig(LauncherConfig): start_method: str = "spawn" - def __post_init__(self) -> None: + def __post_init__(self): + super().__post_init__() + if self.start_method not in ["spawn", "fork"]: raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}") diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py index c1fbfc38..ff816315 100644 --- a/optimum_benchmark/launchers/torchrun/config.py +++ b/optimum_benchmark/launchers/torchrun/config.py @@ -48,7 +48,9 @@ class TorchrunConfig(LauncherConfig): # address of the local node if any. If not set, a lookup on the local machine's FQDN will be performed. local_addr: Optional[str] = None - def __post_init__(self) -> None: + def __post_init__(self): + super().__post_init__() + if self.start_method not in ["spawn", "fork"]: raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}")