From 2723460860ff97af1b81d889321a3fd90acec028 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 9 Dec 2024 11:31:43 +0100
Subject: [PATCH] fix

---
 optimum_benchmark/trackers/latency.py | 52 +++++++++++++++++++--------
 tests/test_api.py                     | 31 +++++++++-------
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py
index b25ef53d..29bb1bf9 100644
--- a/optimum_benchmark/trackers/latency.py
+++ b/optimum_benchmark/trackers/latency.py
@@ -234,19 +234,19 @@ def session(self):
         self.start_time = None
 
     def count(self) -> int:
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
         assert len(self.start_events) == len(self.end_events)
 
         return len(self.start_events)
 
     def elapsed(self):
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
 
         return time.perf_counter() - self.start_time
 
     @contextmanager
     def track(self):
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
 
         if self.is_pytorch_cuda:
             start_event = torch.cuda.Event(enable_timing=True)
@@ -321,7 +321,7 @@ def session(self):
         self.start_time = None
 
     def count(self) -> int:
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
         assert (
             len(self.prefill_start_events)
             == len(self.prefill_end_events)
@@ -332,12 +332,14 @@ def count(self) -> int:
         return len(self.prefill_start_events)
 
     def elapsed(self):
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
 
         return time.perf_counter() - self.start_time
 
     @contextmanager
     def track(self):
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
+
         if self.is_pytorch_cuda:
             start_event = torch.cuda.Event(enable_timing=True)
             end_event = torch.cuda.Event(enable_timing=True)
@@ -357,6 +359,8 @@ def track(self):
         self.per_token_end_events.extend(self.per_token_events[1:])
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
+
         if self.is_pytorch_cuda:
             event = torch.cuda.Event(enable_timing=True)
             event.record()
@@ -364,7 +368,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
             event = time.perf_counter()
 
         if len(self.prefill_start_events) == len(self.prefill_end_events):
-            # on the first call, there will be the same number of prefill/decode start/end events
+            # on the first call (prefill), there will be the same number of prefill/decode start/end events
             self.prefill_end_events.append(event)
             self.decode_start_events.append(event)
 
@@ -373,6 +377,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
         return scores
 
     def get_prefill_latency(self) -> Latency:
+        assert len(self.prefill_start_events) == len(self.prefill_end_events) > 0
+
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
@@ -391,6 +397,8 @@ def get_prefill_latency(self) -> Latency:
         return Latency.from_values(latencies, unit=LATENCY_UNIT)
 
     def get_decode_latency(self) -> Latency:
+        assert len(self.decode_start_events) == len(self.decode_end_events) > 0
+
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
@@ -409,6 +417,8 @@ def get_decode_latency(self) -> Latency:
         return Latency.from_values(latencies, unit=LATENCY_UNIT)
 
     def get_per_token_latency(self) -> Latency:
+        assert len(self.per_token_start_events) == len(self.per_token_end_events) > 0
+
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
@@ -464,18 +474,20 @@ def session(self):
         self.start_time = None
 
     def count(self) -> int:
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
         assert len(self.call_start_events) == len(self.call_start_events)
 
         return len(self.call_start_events)
 
     def elapsed(self):
-        assert self.start_time is not None
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
 
         return time.perf_counter() - self.start_time
 
     @contextmanager
     def track(self):
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
+
         if self.is_pytorch_cuda:
             start_event = torch.cuda.Event(enable_timing=True)
             end_event = torch.cuda.Event(enable_timing=True)
@@ -495,6 +507,8 @@ def track(self):
         self.per_step_end_events.extend(self.per_step_events[1:])
 
     def __call__(self, pipeline, step_index, timestep, callback_kwargs):
+        assert self.start_time is not None, "This method can only be called inside of a '.session()' context"
+
         if self.is_pytorch_cuda:
             event = torch.cuda.Event(enable_timing=True)
             event.record()
@@ -506,6 +520,8 @@ def __call__(self, pipeline, step_index, timestep, callback_kwargs):
         return callback_kwargs
 
     def get_step_latency(self) -> Latency:
+        assert len(self.per_step_start_events) == len(self.per_step_end_events) > 0
+
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
@@ -524,6 +540,8 @@ def get_step_latency(self) -> Latency:
         return Latency.from_values(latencies, unit=LATENCY_UNIT)
 
     def get_call_latency(self) -> Latency:
+        assert len(self.call_start_events) == len(self.call_end_events) > 0
+
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
@@ -559,20 +577,24 @@ def __init__(self, device: str, backend: str) -> None:
 
     def on_step_begin(self, *args, **kwargs):
         if self.is_pytorch_cuda:
-            self.start_events.append(torch.cuda.Event(enable_timing=True))
-            self.end_events.append(torch.cuda.Event(enable_timing=True))
-            self.start_events[-1].record()
+            event = torch.cuda.Event(enable_timing=True)
+            event.record()
         else:
-            self.start_events.append(time.perf_counter())
+            event = time.perf_counter()
+
+        self.start_events.append(event)
 
     def on_step_end(self, *args, **kwargs):
         if self.is_pytorch_cuda:
-            self.end_events[-1].record()
+            event = torch.cuda.Event(enable_timing=True)
+            event.record()
         else:
-            self.end_events.append(time.perf_counter())
+            event = time.perf_counter()
+
+        self.end_events.append(event)
 
     def get_latency(self) -> Latency:
-        assert len(self.start_events) == len(self.end_events) >= 0
+        assert len(self.start_events) == len(self.end_events) > 0
 
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
diff --git a/tests/test_api.py b/tests/test_api.py
index 9122eb49..a0bb4754 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -23,7 +23,7 @@
 from optimum_benchmark.generators.input_generator import InputGenerator
 from optimum_benchmark.import_utils import get_git_revision_hash
 from optimum_benchmark.system_utils import is_nvidia_system, is_rocm_system
-from optimum_benchmark.trackers import LatencyTracker, MemoryTracker
+from optimum_benchmark.trackers import LatencySessionTracker, MemoryTracker
 
 PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", "optimum-benchmark/local")
 
@@ -55,6 +55,9 @@
 @pytest.mark.parametrize("scenario", ["training", "inference"])
 @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS)
 def test_api_launch(device, scenario, library, task, model):
+    if scenario == "training" and library != "transformers":
+        pytest.skip("Training is only supported with transformers library models")
+
     benchmark_name = f"{device}_{scenario}_{library}_{task}_{model}"
 
     if device == "cuda":
@@ -65,24 +68,26 @@ def test_api_launch(device, scenario, library, task, model):
         elif is_nvidia_system():
             device_isolation_action = "error"
             device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+        else:
+            raise RuntimeError("Using CUDA device on a machine that is neither NVIDIA nor ROCM.")
     else:
         device_isolation_action = None
         device_isolation = False
         device_ids = None
 
-    launcher_config = ProcessConfig(device_isolation=device_isolation, device_isolation_action=device_isolation_action)
+    launcher_config = ProcessConfig(
+        device_isolation=device_isolation,
+        device_isolation_action=device_isolation_action,
+    )
 
     if scenario == "training":
-        if library == "transformers":
-            scenario_config = TrainingConfig(
-                memory=True,
-                latency=True,
-                energy=not is_rocm_system(),
-                warmup_steps=2,
-                max_steps=5,
-            )
-        else:
-            pytest.skip("Training scenario is only available for Transformers library")
+        scenario_config = TrainingConfig(
+            memory=True,
+            latency=True,
+            energy=not is_rocm_system(),
+            warmup_steps=2,
+            max_steps=5,
+        )
 
     elif scenario == "inference":
         scenario_config = InferenceConfig(
@@ -227,7 +232,7 @@ def test_api_dataset_generator(library, task, model):
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
 @pytest.mark.parametrize("backend", ["pytorch", "other"])
 def test_api_latency_tracker(device, backend):
-    tracker = LatencyTracker(device=device, backend=backend)
+    tracker = LatencySessionTracker(device=device, backend=backend)
     with tracker.session():
         while tracker.elapsed() < 2:
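
Note (reviewer sketch, not part of the patch): the assertions added above make the session contract explicit — count(), elapsed(), track() and the tracker callbacks are only valid inside a '.session()' context, and the latency getters require at least one recorded start/end pair. A minimal usage sketch of the renamed LatencySessionTracker, assuming the constructor arguments used in test_api.py; the method names come from the diff, and the two-second loop mirrors test_api_latency_tracker:

    import time

    from optimum_benchmark.trackers import LatencySessionTracker

    tracker = LatencySessionTracker(device="cpu", backend="other")

    with tracker.session():              # resets state and sets start_time
        while tracker.elapsed() < 2:     # valid only inside the session
            with tracker.track():        # records one start/end event pair
                time.sleep(0.1)          # stand-in for a model forward pass

        assert tracker.count() > 0       # start/end event lists stay paired

    # Outside the session, the same calls now fail fast instead of
    # misbehaving silently:
    # tracker.elapsed()
    # -> AssertionError: "This method can only be called inside of a '.session()' context"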