Commit

logging and fixes
IlyasMoutawwakil committed Apr 14, 2024
1 parent 4e2035d commit b2b67fd
Showing 8 changed files with 71 additions and 69 deletions.
1 change: 0 additions & 1 deletion optimum_benchmark/backends/llm_swarm/backend.py
@@ -1,5 +1,4 @@
import asyncio
from collections import OrderedDict
import gc
from logging import getLogger
from typing import Any, Dict, List
7 changes: 3 additions & 4 deletions optimum_benchmark/backends/neural_compressor/backend.py
@@ -1,6 +1,6 @@
from collections import OrderedDict
import gc
import os
from collections import OrderedDict
from logging import getLogger
from tempfile import TemporaryDirectory
from typing import Any, Dict
@@ -9,7 +9,6 @@
from hydra.utils import get_class
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion
from optimum.intel.neural_compressor.quantization import INCQuantizer
from transformers.utils import ModelOutput

from ...generators.dataset_generator import DatasetGenerator
from ..base import Backend
@@ -159,15 +158,15 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@torch.inference_mode()
def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model(**input, **kwargs)

@torch.inference_mode()
def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**inputs, **kwargs)

@torch.inference_mode()
def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**input, **kwargs)

def clean(self) -> None:
3 changes: 1 addition & 2 deletions optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -6,7 +6,6 @@
import torch
from hydra.utils import get_class
from safetensors.torch import save_file
from transformers.utils import ModelOutput

from ..base import Backend
from .config import TRTLLMConfig
@@ -80,7 +79,7 @@ def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
seed=kwargs.get("seed", 42),
)

def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(
input_ids=inputs.get("input_ids"),
attention_mask=inputs.get("attention_mask"),
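Both backend files above swap the transformers-specific ModelOutput return annotation for the standard-library OrderedDict, which is what lets the "from transformers.utils import ModelOutput" line be deleted. A minimal sketch of why the broader annotation stays accurate (the assert is illustrative, not part of the commit, and assumes a transformers installation is available):

    # ModelOutput is itself an OrderedDict subclass in transformers, so annotating
    # forward/prefill/generate with OrderedDict still describes the values these
    # methods return, without pulling in a transformers import for typing only.
    from collections import OrderedDict
    from transformers.utils import ModelOutput

    assert issubclass(ModelOutput, OrderedDict)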
15 changes: 0 additions & 15 deletions optimum_benchmark/benchmarks/inference/benchmark.py
@@ -231,9 +231,6 @@ def run_per_token_text_generation_latency_tracking(self, backend: Backend[Backen
prefill_volume = self.atomic_prefill_volume
decode_volume = self.atomic_decode_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.per_token.latency = per_token_latency
self.report.prefill.latency = prefill_latency
self.report.decode.latency = decode_latency
@@ -259,9 +256,6 @@ def run_text_generation_latency_tracking(self, backend: Backend[BackendConfigT])
prefill_latency = latency_tracker.get_latency()
prefill_volume = self.atomic_prefill_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.prefill.latency = prefill_latency
self.report.prefill.throughput = Throughput.from_latency(
prefill_latency, prefill_volume, unit=TEXT_GENERATION_THROUGHPUT_UNIT
@@ -275,9 +269,6 @@ def run_text_generation_latency_tracking(self, backend: Backend[BackendConfigT])
decode_latency = generate_latency - prefill_latency
decode_volume = self.atomic_decode_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.decode.latency = decode_latency
self.report.decode.throughput = Throughput.from_latency(
decode_latency, decode_volume, unit=TEXT_GENERATION_THROUGHPUT_UNIT
@@ -293,9 +284,6 @@ def run_image_diffusion_latency_tracking(self, backend: Backend[BackendConfigT])
call_latency = latency_tracker.get_latency()
call_volume = self.atomic_call_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.call.latency = call_latency
self.report.call.throughput = Throughput.from_latency(
call_latency, call_volume, unit=IMAGE_DIFFUSION_THROUGHPUT_UNIT
@@ -311,9 +299,6 @@ def run_latency_inference_tracking(self, backend: Backend[BackendConfigT]):
forward_latency = latency_tracker.get_latency()
forward_volume = self.atomic_forward_volume

LOGGER.info(f"count: {latency_tracker.count()}")
LOGGER.info(f"elapsed: {latency_tracker.elapsed()}")

self.report.forward.latency = forward_latency
self.report.forward.throughput = Throughput.from_latency(
forward_latency, forward_volume, unit=INFERENCE_THROUGHPUT_UNIT
15 changes: 12 additions & 3 deletions optimum_benchmark/benchmarks/inference/config.py
@@ -16,9 +16,18 @@ class InferenceConfig(BenchmarkConfig):
_target_: str = "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark"

# benchmark options
duration: int = field(default=10, metadata={"help": "Minimum duration of the benchmark in seconds"})
iterations: int = field(default=10, metadata={"help": "Minimum number of iterations to run the benchmark"})
warmup_runs: int = field(default=10, metadata={"help": "Number of warmup runs to perform before benchmarking"})
iterations: int = field(
default=10,
metadata={"help": "Minimum number of iterations to run the benchmark, set to 0 to disable this constraint"},
)
duration: int = field(
default=10,
metadata={"help": "Minimum duration of the benchmark in seconds, set to 0 to disable this constraint"},
)
warmup_runs: int = field(
default=10,
metadata={"help": "Number of warmup runs to perform before benchmarking, set to 0 to disable warmup"},
)

# input/output config
input_shapes: Dict[str, Any] = field(
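The expanded help strings above also clarify the stopping semantics: iterations and duration are both minimum constraints, and either can be disabled by setting it to 0. A hedged usage sketch, assuming the remaining InferenceConfig fields keep their defaults; the values are illustrative, and the reading that measurement continues until every non-zero minimum is satisfied is inferred from the help strings rather than from code shown here:

    from optimum_benchmark.benchmarks.inference.config import InferenceConfig

    # Run at least 50 measured iterations, with no minimum wall-clock duration,
    # preceded by 2 warmup runs performed before benchmarking starts.
    config = InferenceConfig(iterations=50, duration=0, warmup_runs=2)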
15 changes: 8 additions & 7 deletions optimum_benchmark/trackers/energy.py
@@ -47,10 +47,11 @@ def aggregate(energies: List["Energy"]) -> "Energy":
return Energy(cpu=cpu, gpu=gpu, ram=ram, total=total, unit=ENERGY_UNIT)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} CPU energy: {self.cpu:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} GPU energy: {self.gpu:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} RAM energy: {self.ram:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} total energy: {self.total:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} energy consumption:")
LOGGER.info(f"\t\t\t+ CPU: {self.cpu:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ GPU: {self.gpu:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ RAM: {self.ram:f} ({self.unit})")
LOGGER.info(f"\t\t\t+ total: {self.total:f} ({self.unit})")

def __sub__(self, other: "Energy") -> "Energy":
"""Enables subtraction of two Energy instances using the '-' operator."""
@@ -89,8 +90,8 @@ def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency":
def from_energy(energy: "Energy", volume: int, unit: str) -> "Efficiency":
return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} efficiency: {self.value:f} ({self.unit})")
def log(self, prefix: str = "method"):
LOGGER.info(f"\t\t+ {prefix} energy efficiency: {self.value:f} ({self.unit})")


class EnergyTracker:
@@ -115,7 +116,7 @@ def __init__(self, backend: str, device: str, device_ids: Optional[str] = None):
self.total_energy = None

@contextmanager
def track(self, interval=1, file_prefix="method"):
def track(self, interval: int = 1, file_prefix: str = "method"):
if not is_codecarbon_available():
raise ValueError(
"The library codecarbon is required to run energy benchmark, but is not installed. "
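The reworked Energy.log groups the per-component readings under a single header line instead of repeating the prefix on every line. A hedged sketch of the resulting output, derived from the f-strings above; the numeric values and the kWh unit string are illustrative only, and tab indentation is abbreviated as spaces in the comments:

    from optimum_benchmark.trackers.energy import Energy

    energy = Energy(cpu=0.010, gpu=0.050, ram=0.002, total=0.062, unit="kWh")
    energy.log(prefix="forward")
    # expected log lines:
    #   + forward energy consumption:
    #       + CPU: 0.010000 (kWh)
    #       + GPU: 0.050000 (kWh)
    #       + RAM: 0.002000 (kWh)
    #       + total: 0.062000 (kWh)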
44 changes: 31 additions & 13 deletions optimum_benchmark/trackers/latency.py
@@ -9,6 +9,7 @@
if is_torch_distributed_available():
import torch.distributed

import numpy as np
import torch
from transformers import LogitsProcessor, TrainerCallback

@@ -24,9 +25,14 @@ class Latency:
unit: Latency_Unit_Literal

count: int
total: float
mean: float
stdev: float
total: float
p50: float
p90: float
p95: float
p99: float

values: List[float]

def __getitem__(self, index) -> float:
@@ -60,18 +66,30 @@ def aggregate(latencies: List["Latency"]) -> "Latency":

@staticmethod
def from_values(values: List[float], unit: str) -> "Latency":
count = len(values)
total = sum(values)
mean = total / count if count > 0 else 0
stdev = (sum((val - mean) ** 2 for val in values) / count) ** 0.5 if count > 1 else 0
return Latency(count=count, mean=mean, stdev=stdev, values=values, total=total, unit=unit)

def log(self, prefix: str = "forward"):
return Latency(
unit=unit,
count=len(values),
total=sum(values),
mean=np.mean(values),
stdev=np.std(values),
p50=np.percentile(values, 50),
p90=np.percentile(values, 90),
p95=np.percentile(values, 95),
p99=np.percentile(values, 99),
values=values,
)

def log(self, prefix: str = "method"):
stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0
LOGGER.info(f"\t\t+ {prefix} latency:")
LOGGER.info(f"\t\t\t- count: {self.count}")
LOGGER.info(f"\t\t\t- total: {self.total:f} {self.unit}")
LOGGER.info(f"\t\t\t- mean: {self.mean:f} {self.unit}")
LOGGER.info(f"\t\t\t- stdev: {self.stdev:f} {self.unit}")
LOGGER.info(f"\t\t\t+ count: {self.count}")
LOGGER.info(f"\t\t\t+ total: {self.total:f} {self.unit}")
LOGGER.info(f"\t\t\t+ mean: {self.mean:f} {self.unit}")
LOGGER.info(f"\t\t\t+ stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)")
LOGGER.info(f"\t\t\t+ p50: {self.p50:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p90: {self.p90:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p95: {self.p95:f} {self.unit}")
LOGGER.info(f"\t\t\t+ p99: {self.p99:f} {self.unit}")


@dataclass
@@ -97,7 +115,7 @@ def from_latency(latency: Latency, volume: int, unit: str) -> "Throughput":
value = volume / latency.mean if latency.mean > 0 else 0
return Throughput(value=value, unit=unit)

def log(self, prefix: str = "forward"):
def log(self, prefix: str = "method"):
LOGGER.info(f"\t\t+ {prefix} throughput: {self.value:f} {self.unit}")


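The Latency report now carries percentile statistics (p50/p90/p95/p99) computed with numpy alongside the existing count/total/mean/stdev, and stdev is additionally logged as a percentage of the mean. A hedged sketch of the enriched statistics on a small sample; the values and the unit string are illustrative only:

    from optimum_benchmark.trackers.latency import Latency

    values = [0.9, 1.0, 1.1, 1.2, 5.0]  # one straggler among five measurements
    latency = Latency.from_values(values, unit="s")

    # mean is ~1.84 s while p50 stays at 1.1 s, so the new percentiles expose
    # tail latency that mean and stdev alone would blur together.
    latency.log(prefix="forward")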
40 changes: 16 additions & 24 deletions optimum_benchmark/trackers/memory.py
@@ -81,15 +81,17 @@ def aggregate(memories: List["Memory"]) -> "Memory":
)

def log(self, prefix: str = "forward"):
LOGGER.info(f"\t\t+ {prefix} max RAM memory: {self.max_ram:f} ({self.unit})")
LOGGER.info(f"\t\t+ {prefix} memory:")
if self.max_ram is not None:
LOGGER.info(f"\t\t\t- max RAM: {self.max_ram:f} ({self.unit})")
if self.max_global_vram is not None:
LOGGER.info(f"\t\t+ {prefix} max global VRAM memory: {self.max_global_vram:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max global VRAM: {self.max_global_vram:f} ({self.unit})")
if self.max_process_vram is not None:
LOGGER.info(f"\t\t+ {prefix} max process VRAM memory: {self.max_process_vram:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max process VRAM: {self.max_process_vram:f} ({self.unit})")
if self.max_reserved is not None:
LOGGER.info(f"\t\t+ {prefix} max reserved memory: {self.max_reserved:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max reserved memory: {self.max_reserved:f} ({self.unit})")
if self.max_allocated is not None:
LOGGER.info(f"\t\t+ {prefix} max allocated memory: {self.max_allocated:f} ({self.unit})")
LOGGER.info(f"\t\t\t- max allocated memory: {self.max_allocated:f} ({self.unit})")


class MemoryTracker:
@@ -105,7 +107,7 @@ def __init__(self, device: str, backend: str, device_ids: Optional[str] = None):

if self.device == "cuda":
self.device_ids = list(map(int, self.device_ids.split(",")))
LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}")
LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices {self.device_ids}")

if self.track_cuda_pytorch_memory:
self.num_pytorch_devices = torch.cuda.device_count()
@@ -195,24 +197,14 @@ def _cpu_memory(self):
self.max_ram_memory = parent_connection.recv()

def get_max_memory(self):
if self.track_cuda_pytorch_memory:
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
max_reserved=self.max_reserved_memory,
max_allocated=self.max_allocated_memory,
)
elif self.device == "cuda":
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
)
else:
return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory)
return Memory(
unit=MEMORY_UNIT,
max_ram=self.max_ram_memory,
max_global_vram=self.max_global_vram_memory,
max_process_vram=self.max_process_vram_memory,
max_reserved=self.max_reserved_memory,
max_allocated=self.max_allocated_memory,
)


def monitor_cpu_ram_memory(monitored_pid: int, connection: Connection, interval: float = 0.001):
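get_max_memory now always builds a full Memory record and leaves whatever was not tracked as None, while the reworked Memory.log above only prints the fields that are populated. A hedged sketch of that reporting path, assuming Memory and MEMORY_UNIT are importable from the same module; the field values are illustrative:

    from optimum_benchmark.trackers.memory import MEMORY_UNIT, Memory

    # CPU-only run: only max_ram was tracked, the VRAM and PyTorch maxima stay None.
    cpu_only = Memory(
        unit=MEMORY_UNIT,
        max_ram=1024.0,
        max_global_vram=None,
        max_process_vram=None,
        max_reserved=None,
        max_allocated=None,
    )
    cpu_only.log(prefix="forward")  # prints the header and the max RAM line only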
