Skip to content

Commit

Permalink
Explicitly passing visible devices to isolation process (#177)
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyasMoutawwakil authored Apr 8, 2024
1 parent 1f9d645 commit 379b5ad
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 24 deletions.
3 changes: 3 additions & 0 deletions examples/pytorch_bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ defaults:

experiment_name: pytorch_bert

launcher:
device_isolation: true

benchmark:
latency: true
memory: true
Expand Down
7 changes: 3 additions & 4 deletions optimum_benchmark/backends/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ class BackendConfig(ABC):
version: str
_target_: str

model: Optional[str] = None

task: Optional[str] = None
model: Optional[str] = None
library: Optional[str] = None

device: Optional[str] = None
Expand Down Expand Up @@ -74,11 +73,11 @@ def __post_init__(self):
if is_nvidia_system():
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids

elif is_rocm_system():
# https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html
# ROCR_VISIBLE_DEVICES is better than HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES
os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids
else:
raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

if self.library not in ["transformers", "diffusers", "timm"]:
raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}")
Expand Down
8 changes: 6 additions & 2 deletions optimum_benchmark/launchers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ class LauncherConfig(ABC):
device_isolation: bool = False

def __post_init__(self):
    """Validate launcher options shared by all launcher configs.

    Raises:
        ValueError: if `device_isolation` is enabled on a machine that has
            neither NVIDIA nor ROCm drivers — isolation relies on vendor
            tooling (pynvml / amdsmi) to enumerate processes on the GPUs.
    """
    # Only validate when isolation is actually requested; a CPU-only machine
    # with device_isolation=False must pass.
    if self.device_isolation and not is_nvidia_system() and not is_rocm_system():
        raise ValueError(
            "Device isolation is only supported on NVIDIA and ROCm systems. "
            "Please set `device_isolation` to False or make sure your drivers "
            "are correctly installed by running `nvidia-smi` or `rocm-smi`."
        )


LauncherConfigT = TypeVar("LauncherConfigT", bound=LauncherConfig)
3 changes: 3 additions & 0 deletions optimum_benchmark/launchers/inline/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@
class InlineConfig(LauncherConfig):
    """Config for the inline launcher.

    NOTE(review): presumably runs the benchmark in the calling process
    (no isolation subprocess) — confirm against InlineLauncher.
    """

    # Hydra instantiation fields: config name and target launcher class.
    name: str = "inline"
    _target_: str = "optimum_benchmark.launchers.inline.launcher.InlineLauncher"

    def __post_init__(self):
        # No inline-specific validation; run only the base-class checks.
        super().__post_init__()
46 changes: 30 additions & 16 deletions optimum_benchmark/launchers/isolation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def isolation_signal_handler(signum, frame):
signal.signal(signal.SIGUSR1, isolation_signal_handler)


def get_nvidia_devices_pids() -> Set[int]:
def get_nvidia_devices_pids(device_ids: str) -> Set[int]:
if not is_pynvml_available():
raise ValueError(
"The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. "
Expand All @@ -41,7 +41,7 @@ def get_nvidia_devices_pids() -> Set[int]:
pynvml.nvmlInit()

devices_pids = set()
devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
devices_ids = list(map(int, device_ids.split(",")))

for device_id in devices_ids:
device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
Expand All @@ -54,7 +54,7 @@ def get_nvidia_devices_pids() -> Set[int]:
return devices_pids


def get_amd_devices_pids() -> Set[int]:
def get_amd_devices_pids(device_ids: str) -> Set[int]:
if not is_amdsmi_available():
raise ValueError(
"The library amdsmi is required to get the pids running on AMD GPUs, but is not installed. "
Expand All @@ -64,7 +64,7 @@ def get_amd_devices_pids() -> Set[int]:
amdsmi.amdsmi_init()

devices_pids = set()
devices_ids = [int(device_id) for device_id in os.environ["ROCR_VISIBLE_DEVICES"].split(",")]
devices_ids = list(map(int, device_ids.split(",")))

processor_handles = amdsmi.amdsmi_get_processor_handles()
for device_id in devices_ids:
Expand Down Expand Up @@ -93,24 +93,25 @@ def get_amd_devices_pids() -> Set[int]:
return devices_pids


def get_pids_running_on_system_devices(device_ids: str) -> Set[int]:
    """Returns the set of pids running on the system device(s).

    Args:
        device_ids: comma-separated device indices (e.g. "0,2") selecting
            which devices to inspect.

    Raises:
        ValueError: if the system has neither NVIDIA nor ROCm drivers.
    """
    if is_nvidia_system():
        devices_pids = get_nvidia_devices_pids(device_ids)
    elif is_rocm_system():
        devices_pids = get_amd_devices_pids(device_ids)
    else:
        # Fixed: the message used to say "get_pids_running_on_system_device"
        # (missing the trailing "s"), misnaming this function.
        raise ValueError("get_pids_running_on_system_devices is only supported on NVIDIA and AMD GPUs")

    return devices_pids


def assert_system_devices_isolation(isolated_pid: int) -> None:
def assert_system_devices_isolation(isolated_pid: int, device_ids: str):
setup_logging("ERROR")

isolation_pid = os.getpid()

while psutil.pid_exists(isolated_pid):
devices_pids = get_pids_running_on_system_devices()
devices_pids = get_pids_running_on_system_devices(device_ids=device_ids)
devices_pids = {pid for pid in devices_pids if psutil.pid_exists(pid)}
isolated_children_pids = {child.pid for child in psutil.Process(isolated_pid).children(recursive=True)}
isolation_children_pids = {child.pid for child in psutil.Process(isolation_pid).children(recursive=True)}
Expand Down Expand Up @@ -141,19 +142,32 @@ def device_isolation(enabled: bool, isolated_pid: int):
yield
return

if is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)
elif is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None)
else:
raise ValueError("Device isolation is only supported on NVIDIA and AMD GPUs")

if device_ids is None:
raise ValueError(
"Device isolation requires CUDA_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES to be set but none were found."
)

isolation_process = Process(
target=assert_system_devices_isolation,
kwargs={"isolated_pid": isolated_pid},
kwargs={"isolated_pid": isolated_pid, "device_ids": device_ids},
daemon=True,
)
isolation_process.start()

LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}.")
LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}")
LOGGER.info(f"\t+ Isolating device(s) [{device_ids}]")

yield

LOGGER.info("\t+ Closing device(s) isolation process...")

isolation_process.kill()
isolation_process.join()
isolation_process.close()
if isolation_process.is_alive():
LOGGER.info("\t+ Closing device(s) isolation process...")
isolation_process.kill()
isolation_process.join()
isolation_process.close()
4 changes: 3 additions & 1 deletion optimum_benchmark/launchers/process/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ class ProcessConfig(LauncherConfig):

start_method: str = "spawn"

def __post_init__(self):
    """Validate process-launcher options after dataclass initialization.

    Raises:
        ValueError: if `start_method` is not a supported multiprocessing
            start method ("spawn" or "fork").
    """
    # Run base-class validation (e.g. the device-isolation check) first.
    super().__post_init__()

    if self.start_method not in ["spawn", "fork"]:
        raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}")
4 changes: 3 additions & 1 deletion optimum_benchmark/launchers/torchrun/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ class TorchrunConfig(LauncherConfig):
# address of the local node if any. If not set, a lookup on the local machine's FQDN will be performed.
local_addr: Optional[str] = None

def __post_init__(self) -> None:
def __post_init__(self):
super().__post_init__()

if self.start_method not in ["spawn", "fork"]:
raise ValueError(f"start_method must be one of ['spawn', 'fork'], got {self.start_method}")

Expand Down

0 comments on commit 379b5ad

Please sign in to comment.