Commit

logging and fixes
IlyasMoutawwakil committed Apr 14, 2024
1 parent 4e2035d commit b2b67fd
Showing 8 changed files with 71 additions and 69 deletions.
1 change: 0 additions & 1 deletion optimum_benchmark/backends/llm_swarm/backend.py
@@ -1,5 +1,4 @@
import asyncio
from collections import OrderedDict
import gc
from logging import getLogger
from typing import Any, Dict, List
7 changes: 3 additions & 4 deletions optimum_benchmark/backends/neural_compressor/backend.py
@@ -1,6 +1,6 @@
from collections import OrderedDict
import gc
import os
from collections import OrderedDict
from logging import getLogger
from tempfile import TemporaryDirectory
from typing import Any, Dict
@@ -9,7 +9,6 @@
from hydra.utils import get_class
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion
from optimum.intel.neural_compressor.quantization import INCQuantizer
from transformers.utils import ModelOutput

from ...generators.dataset_generator import DatasetGenerator
from ..base import Backend
@@ -159,15 +158,15 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@torch.inference_mode()
def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model(**input, **kwargs)

@torch.inference_mode()
def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**inputs, **kwargs)

@torch.inference_mode()
def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**input, **kwargs)

def clean(self) -> None:
3 changes: 1 addition & 2 deletions optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -6,7 +6,6 @@
import torch
from hydra.utils import get_class
from safetensors.torch import save_file
from transformers.utils import ModelOutput

from ..base import Backend
from .config import TRTLLMConfig
@@ -80,7 +79,7 @@ def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
seed=kwargs.get("seed", 42),
)

def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(
input_ids=inputs.get("input_ids"),
attention_mask=inputs.get("attention_mask"),
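Both backend files above swap the transformers-specific ModelOutput return annotation for the standard-library OrderedDict, which is what lets the "from transformers.utils import ModelOutput" line be deleted. A minimal sketch of why the broader annotation stays accurate (the assert is illustrative, not part of the commit, and assumes a transformers installation is available):

    # ModelOutput is itself an OrderedDict subclass in transformers, so annotating
    # forward/prefill/generate with OrderedDict still describes the values these
    # methods return, without pulling in a transformers import for typing only.
    from collections import OrderedDict
    from transformers.utils import ModelOutput

    assert issubclass(ModelOutput, OrderedDict)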
15 changes: 0 additions & 15 deletions optimum_benchmark/benchmarks/inference/benchmark.py
@@ -231,9 +231,6 @@ def run_per_token_text_generation_latency_tracking(self, backend: Backend[Backen
prefill_volume = self.atomic_prefill_volume
decode_volume = self.atomic_decode_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.per_token.latency = per_token_latency
self.report.prefill.latency = prefill_latency
self.report.decode.latency = decode_latency
@@ -259,9 +256,6 @@ def run_text_generation_latency_tracking(self, backend: Backend[BackendConfigT])
prefill_latency = latency_tracker.get_latency()
prefill_volume = self.atomic_prefill_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.prefill.latency = prefill_latency
self.report.prefill.throughput = Throughput.from_latency(
prefill_latency, prefill_volume, unit=TEXT_GENERATION_THROUGHPUT_UNIT
@@ -275,9 +269,6 @@ def run_text_generation_latency_tracking(self, backend: Backend[BackendConfigT])
decode_latency = generate_latency - prefill_latency
decode_volume = self.atomic_decode_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.decode.latency = decode_latency
self.report.decode.throughput = Throughput.from_latency(
decode_latency, decode_volume, unit=TEXT_GENERATION_THROUGHPUT_UNIT
@@ -293,9 +284,6 @@ def run_image_diffusion_latency_tracking(self, backend: Backend[BackendConfigT])
call_latency = latency_tracker.get_latency()
call_volume = self.atomic_call_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.call.latency = call_latency
self.report.call.throughput = Throughput.from_latency(
call_latency, call_volume, unit=IMAGE_DIFFUSION_THROUGHPUT_UNIT
@@ -311,9 +299,6 @@ def run_latency_inference_tracking(self, backend: Backend[BackendConfigT]):
forward_latency = latency_tracker.get_latency()
forward_volume = self.atomic_forward_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.forward.latency = forward_latency
self.report.forward.throughput = Throughput.from_latency(
forward_latency, forward_volume, unit=INFERENCE_THROUGHPUT_UNIT
15 changes: 12 additions & 3 deletions optimum_benchmark/benchmarks/inference/config.py
@@ -16,9 +16,18 @@ class InferenceConfig(BenchmarkConfig):
_target_: str = "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark"

# benchmark options
duration: int = field(default=10, metadata={"help": "Minimum duration of the benchmark in seconds"})
iterations: int = field(default=10, metadata={"help": "Minimum number of iterations to run the benchmark"})
warmup_runs: int = field(default=10, metadata={"help": "Number of warmup runs to perform before benchmarking"})
iterations: int = field(
default=10,
metadata={"help": "Minimum number of iterations to run the benchmark, set to 0 to disable this constraint"},
)
duration: int = field(
default=10,
metadata={"help": "Minimum duration of the benchmark in seconds, set to 0 to disable this constraint"},
)
warmup_runs: int = field(
default=10,
metadata={"help": "Number of warmup runs to perform before benchmarking, set to 0 to disable warmup"},
)

# input/output config
input_shapes: Dict[str, Any] = field(
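The expanded help strings above also clarify the stopping semantics: iterations and duration are both minimum constraints, and either can be disabled by setting it to 0. A hedged usage sketch, assuming the remaining InferenceConfig fields keep their defaults; the values are illustrative, and the reading that measurement continues until every non-zero minimum is satisfied is inferred from the help strings rather than from code shown here:

    from optimum_benchmark.benchmarks.inference.config import InferenceConfig

    # Run at least 50 measured iterations, with no minimum wall-clock duration,
    # preceded by 2 warmup runs performed before benchmarking starts.
    config = InferenceConfig(iterations=50, duration=0, warmup_runs=2)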
15 changes: 8 additions & 7 deletions optimum_benchmark/trackers/energy.py
@@ -47,10 +47,11 @@ def aggregate(energies: List["Energy"]) -> "Energy":
return Energy(cpu=cpu, gpu=gpu, ram=ram, total=total, unit=ENERGY_UNIT)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} CPU energy: {self.cpu:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} GPU energy: {self.gpu:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} RAM energy: {self.ram:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} total energy: {self.total:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} energy consumption:")
LOGGER.info(f"\t\t\t+ CPU: {self.cpu:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ GPU: {self.gpu:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ RAM: {self.ram:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ total: {self.total:f} ({self.unit})")

def __sub__(self, other: "Energy") -> "Energy":
"""Enables subtraction of two Energy instances using the '-' operator."""
@@ -89,8 +90,8 @@ def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency":
def from_energy(energy: "Energy", volume: int, unit: str) -> "Efficiency":
return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} efficiency: {self.value:f} ({self.unit})")
def log(self, prefix: str = "method"):
LOGGER.info(f"\t\t+ {prefix} energy efficiency: {self.value:f} ({self.unit})")


class EnergyTracker:
@@ -115,7 +116,7 @@ def __init__(self, backend: str, device: str, device_ids: Optional[str] = None):
self.total_energy = None

@contextmanager
def track(self, interval=1, file_prefix="method"):
def track(self, interval: int = 1, file_prefix: str = "method"):
if not is_codecarbon_available():
raise ValueError(
"The library codecarbon is required to run energy benchmark, but is not installed. "
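The reworked Energy.log groups the per-component readings under a single header line instead of repeating the prefix on every line. A hedged sketch of the resulting output, derived from the f-strings above; the numeric values and the kWh unit string are illustrative only, and tab indentation is abbreviated as spaces in the comments:

    from optimum_benchmark.trackers.energy import Energy

    energy = Energy(cpu=0.010, gpu=0.050, ram=0.002, total=0.062, unit="kWh")
    energy.log(prefix="forward")
    # expected log lines:
    #   + forward energy consumption:
    #       + CPU: 0.010000 (kWh)
    #       + GPU: 0.050000 (kWh)
    #       + RAM: 0.002000 (kWh)
    #       + total: 0.062000 (kWh)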
44 changes: 31 additions & 13 deletions optimum_benchmark/trackers/latency.py
@@ -9,6 +9,7 @@
if is_torch_distributed_available():
import torch.distributed

import numpy as np
import torch
from transformers import LogitsProcessor, TrainerCallback

@@ -24,9 +25,14 @@ class Latency:
unit: Latency_Unit_Literal

count: int
total: float
mean: float
stdev: float
total: float
p50: float
p90: float
p95: float
p99: float

values: List[float]

def __getitem__(self, index) -> float:
@@ -60,18 +66,30 @@ def aggregate(latencies: List["Latency"]) -> "Latency":

@staticmethod
def from_values(values: List[float], unit: str) -> "Latency":
count = len(values)
total = sum(values)
mean = total / count if count > 0 else 0
stdev = (sum((val - mean) ** 2 for val in values) / count) ** 0.5 if count > 1 else 0
return Latency(count=count, mean=mean, stdev=stdev, values=values, total=total, unit=unit)

def log(self, prefix: str = "forward"):
return Latency(
unit=unit,
count=len(values),
total=sum(values),
mean=np.mean(values),
stdev=np.std(values),
p50=np.percentile(values, 50),
p90=np.percentile(values, 90),
p95=np.percentile(values, 95),
p99=np.percentile(values, 99),
values=values,
)

def log(self, prefix: str = "method"):
stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0
LOGGER.info(f"\t\t+ {prefix} latency:")
LOGGER.info(f"\t\t\t- count: {self.count}")
LOGGER.info(f"\t\t\t- total: {self.total:f} {self.unit}")
LOGGER.info(f"\t\t\t- mean: {self.mean:f} {self.unit}")
LOGGER.info(f"\t\t\t- stdev: {self.stdev:f} {self.unit}")
LOGGER.info(f"\t\t\t+ count: {self.count}")
LOGGER.info(f"\t\t\t+ total: {self.total:f} {self.unit}")
LOGGER.info(f"\t\t\t+ mean: {self.mean:f} {self.unit}")
LOGGER.info(f"\t\t\t+ stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)")
LOGGER.info(f"\t\t\t+ p50: {self.p50:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p90: {self.p90:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p95: {self.p95:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p99: {self.p99:f} {self.unit}")


@dataclass
@@ -97,7 +115,7 @@ def from_latency(latency: Latency, volume: int, unit: str) -> "Throughput":
value = volume / latency.mean if latency.mean > 0 else 0
return Throughput(value=value, unit=unit)

def log(self, prefix: str = "forward"):
def log(self, prefix: str = "method"):
LOGGER.info(f"\t\t+ {prefix} throughput: {self.value:f} {self.unit}")


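The Latency report now carries percentile statistics (p50/p90/p95/p99) computed with numpy alongside the existing count/total/mean/stdev, and stdev is additionally logged as a percentage of the mean. A hedged sketch of the enriched statistics on a small sample; the values and the unit string are illustrative only:

    from optimum_benchmark.trackers.latency import Latency

    values = [0.9, 1.0, 1.1, 1.2, 5.0]  # one straggler among five measurements
    latency = Latency.from_values(values, unit="s")

    # mean is ~1.84 s while p50 stays at 1.1 s, so the new percentiles expose
    # tail latency that mean and stdev alone would blur together.
    latency.log(prefix="forward")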
40 changes: 16 additions & 24 deletions optimum_benchmark/trackers/memory.py
@@ -81,15 +81,17 @@ def aggregate(memories: List["Memory"]) -> "Memory":
)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} max RAM memory: {self.max_ram:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} memory:")
if self.max_ram is not None:
LOGGER.info(f"\t\t\t- max RAM: {self.max_ram:f} ({self.unit})")
if self.max_global_vram is not None:
LOGGER.info(f"\t\t+ {prefix} max global VRAM memory: {self.max_global_vram:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max global VRAM: {self.max_global_vram:f} ({self.unit})")
if self.max_process_vram is not None:
LOGGER.info(f"\t\t+ {prefix} max process VRAM memory: {self.max_process_vram:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max process VRAM: {self.max_process_vram:f} ({self.unit})")
if self.max_reserved is not None:
LOGGER.info(f"\t\t+ {prefix} max reserved memory: {self.max_reserved:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max reserved memory: {self.max_reserved:f} ({self.unit})")
if self.max_allocated is not None:
LOGGER.info(f"\t\t+ {prefix} max allocated memory: {self.max_allocated:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max allocated memory: {self.max_allocated:f} ({self.unit})")


class MemoryTracker:
@@ -105,7 +107,7 @@ def __init__(self, device: str, backend: str, device_ids: Optional[str] = None):

if self.device == "cuda":
self.device_ids = list(map(int, self.device_ids.split(",")))
LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}")
LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices {self.device_ids}")

if self.track_cuda_pytorch_memory:
self.num_pytorch_devices = torch.cuda.device_count()
@@ -195,24 +197,14 @@ def _cpu_memory(self):
self.max_ram_memory = parent_connection.recv()

def get_max_memory(self):
if self.track_cuda_pytorch_memory:
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
max_reserved=self.max_reserved_memory,
max_allocated=self.max_allocated_memory,
)
elif self.device == "cuda":
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
)
else:
return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory)
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
max_reserved=self.max_reserved_memory,
max_allocated=self.max_allocated_memory,
)


def monitor_cpu_ram_memory(monitored_pid: int, connection: Connection, interval: float = 0.001):
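get_max_memory now always builds a full Memory record and leaves whatever was not tracked as None, while the reworked Memory.log above only prints the fields that are populated. A hedged sketch of that reporting path, assuming Memory and MEMORY_UNIT are importable from the same module; the field values are illustrative:

    from optimum_benchmark.trackers.memory import MEMORY_UNIT, Memory

    # CPU-only run: only max_ram was tracked, the VRAM and PyTorch maxima stay None.
    cpu_only = Memory(
        unit=MEMORY_UNIT,
        max_ram=1024.0,
        max_global_vram=None,
        max_process_vram=None,
        max_reserved=None,
        max_allocated=None,
    )
    cpu_only.log(prefix="forward")  # prints the header and the max RAM line only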
