Support of optimum-nvidia's trt-llm (#98)
IlyasMoutawwakil authored Jan 2, 2024
1 parent 487583f commit 9b3fc4d
Showing 13 changed files with 350 additions and 42 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/test_tensorrt_llm.yaml
@@ -0,0 +1,38 @@
name: TensorRT-LLM Tests

on:
  workflow_dispatch:
  push:
    branches: [main]
  pull_request:
    types: [opened, reopened, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  pull_image_and_run_gpu_tests:
    runs-on: hf-dgx-01
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Pull image
        run: docker pull huggingface/optimum-nvidia:latest

      - name: Run tests
        run: docker run
          --rm
          --net host
          --pid host
          --shm-size 64G
          --env USE_CUDA="1"
          --env USER_ID=$(id -u)
          --env GROUP_ID=$(id -g)
          --volume $(pwd):/workspace/optimum-benchmark
          --workdir /workspace/optimum-benchmark
          --gpus '"device=0,1"'
          --entrypoint /bin/bash
          huggingface/optimum-nvidia:latest
          -c "pip install -e .[test] && pytest -k 'tensorrt_llm' -x && chown -R $USER_ID:$GROUP_ID ."
3 changes: 2 additions & 1 deletion .gitignore
@@ -168,4 +168,5 @@ data/
version.txt

actions-runner/
experiments/
experiments/
.engine/
33 changes: 33 additions & 0 deletions examples/trt_llama.yaml
@@ -0,0 +1,33 @@
defaults:
  - backend: tensorrt # default backend
  - launcher: process # default launcher
  - benchmark: inference # default benchmark
  - experiment # inheriting experiment schema
  - _self_ # for hydra 1.1 compatibility
  - override hydra/job_logging: colorlog # colorful logging
  - override hydra/hydra_logging: colorlog # colorful logging

experiment_name: trt_llama
model: NousResearch/Llama-2-7b-hf
device: cuda

backend:
  continuous_isolation: false

benchmark:
  input_shapes:
    batch_size: 1
    sequence_length: 64
  new_tokens: 128

hydra:
  run:
    dir: runs/${experiment_name}
  sweep:
    dir: sweeps/${experiment_name}
  job:
    chdir: true
    env_set:
      OVERRIDE_BENCHMARKS: 1
      CUDA_VISIBLE_DEVICES: 0
      CUDA_DEVICE_ORDER: PCI_BUS_ID
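Like the other example configs, this file is meant to be launched through the optimum-benchmark CLI (optimum-benchmark --config-dir examples --config-name trt_llama). The snippet below is a rough sketch, not part of the commit, of an equivalent programmatic launch via Hydra's compose API; it assumes the script sits at the repository root and that CLI-only settings such as the hydra run directory are not needed.

# Hypothetical programmatic launch of examples/trt_llama.yaml (sketch, not from the commit).
from hydra import compose, initialize

from optimum_benchmark.experiment import run

with initialize(config_path="examples"):
    config = compose(config_name="trt_llama")
    run(config)  # instantiates TRTBackend and runs the inference benchmark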
9 changes: 6 additions & 3 deletions optimum_benchmark/backends/base.py
@@ -8,7 +8,6 @@
from typing import Any, Callable, ClassVar, Dict, Generic, Optional, Union

import numpy as np
from optimum.exporters import TasksManager
from transformers import (
    AutoConfig,
    AutoProcessor,
@@ -20,7 +19,11 @@
)
from transformers.utils import ModelOutput

from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS
from ..task_utils import (
    DIFFUSION_TASKS,
    TEXT_GENERATION_TASKS,
    get_model_class_for_task,
)
from .config import BackendConfigT
from .isolation_utils import check_cuda_continuous_isolation
from .utils import (
@@ -85,7 +88,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any
        else:
            self.pretrained_generation_config = None

        self.automodel_class = TasksManager.get_model_class_for_task(
        self.automodel_class = get_model_class_for_task(
            framework="pt", # TODO: make this configurable to add support for other frameworks
            task=self.task,
            library=self.library,
Empty file.
77 changes: 77 additions & 0 deletions optimum_benchmark/backends/tensorrt/backend.py
@@ -0,0 +1,77 @@
from logging import getLogger
from typing import Any, Dict

from hydra.utils import get_class
from transformers.utils import ModelOutput

from ..base import Backend
from .config import TRTConfig
from .utils import MODEL_TYPE_TO_TRTMODEL

LOGGER = getLogger("tensorrt")


class TRTBackend(Backend):
    NAME: str = "tensorrt"

    def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None:
        super().__init__(model, task, device, hub_kwargs)
        self.validate_device()
        self.validate_model_type()

    def validate_model_type(self) -> None:
        if self.model_type not in MODEL_TYPE_TO_TRTMODEL:
            raise NotImplementedError(f"TRTBackend does not support model_type {self.model_type}")

    def validate_device(self) -> None:
        if self.device != "cuda":
            raise NotImplementedError(f"TRTBackend only supports device cuda, got {self.device}")

    def configure(self, config: TRTConfig) -> None:
        super().configure(config)

        self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTMODEL[self.model_type])
        trtmodel_name = self.trtmodel_class.__name__
        LOGGER.info(
            f"\t+ Inferred TRTModel class {trtmodel_name} for task {self.task} and model_type {self.model_type}"
        )

        # TODO: save engine path for reuse, then maybe rebuild with max_prompt_size
        self.load_trtmodel_from_pretrained()

    @property
    def trtmodel_kwargs(self) -> Dict[str, Any]:
        return {}

    def load_trtmodel_from_pretrained(self) -> None:
        self.pretrained_model = self.trtmodel_class.from_pretrained(
            self.model,
            **self.trtmodel_kwargs,
            **self.hub_kwargs,
        )

    def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
        return self.pretrained_model.generate(
            input_ids=input.get("input_ids", None),
            attention_mask=input.get("attention_mask", None),
            max_new_tokens=1,
        )

    def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
        return self.pretrained_model.generate(
            # kwargs are spelled out explicitly to avoid naming conflicts
            input_ids=input.get("inputs", None),  # different API: expects `inputs` instead of `input_ids`
            attention_mask=input.get("attention_mask", None),
            max_new_tokens=kwargs.get("max_new_tokens", -1),
            min_length=kwargs.get("min_new_tokens", -1),  # different API: `min_length` instead of `min_new_tokens`
            num_beams=kwargs.get("num_beams", 1),
            temperature=kwargs.get("temperature", 1.0),
            top_k=kwargs.get("top_k", 50),
            top_p=kwargs.get("top_p", 1.0),
            repetition_penalty=kwargs.get("repetition_penalty", 1.0),
            length_penalty=kwargs.get("length_penalty", 1.0),
            seed=kwargs.get("seed", 42),
            pad_token_id=kwargs.get("pad_token_id", 0),
            bos_token_id=kwargs.get("bos_token_id", 1),
            eos_token_id=kwargs.get("eos_token_id", 2),
        )
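Normally the benchmark layer constructs and drives this backend, but a standalone smoke test can help when debugging. The sketch below is not from the commit; it assumes a CUDA GPU, optimum-nvidia installed, access to the Llama-2 checkpoint, and that the base BackendConfig fields (such as continuous_isolation) behave as in the example config above.

# Hypothetical standalone usage of TRTBackend; names mirror the diff above.
import torch

from optimum_benchmark.backends.tensorrt.backend import TRTBackend
from optimum_benchmark.backends.tensorrt.config import TRTConfig

backend = TRTBackend(
    model="NousResearch/Llama-2-7b-hf",
    task="text-generation",
    device="cuda",
    hub_kwargs={"revision": "main"},
)
backend.configure(TRTConfig(continuous_isolation=False))  # loads/builds the TRT-LLM engine

input_ids = torch.ones((1, 64), dtype=torch.int64, device="cuda")
outputs = backend.generate(
    {"inputs": input_ids, "attention_mask": torch.ones_like(input_ids)},
    {"max_new_tokens": 128, "num_beams": 1},
)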
18 changes: 18 additions & 0 deletions optimum_benchmark/backends/tensorrt/config.py
@@ -0,0 +1,18 @@
from dataclasses import dataclass
from logging import getLogger

from omegaconf import OmegaConf

from ...import_utils import tesnorrt_version
from ..config import BackendConfig

LOGGER = getLogger("tensorrt")

OmegaConf.register_new_resolver("tensorrt_version", tesnorrt_version)


@dataclass
class TRTConfig(BackendConfig):
    name: str = "tensorrt"
    version: str = "${tensorrt_version:}"
    _target_: str = "optimum_benchmark.backends.tensorrt.backend.TRTBackend"
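The version field above is filled in by the OmegaConf resolver registered in this file. The toy snippet below is not from the commit and uses a placeholder resolver name and value; the real resolver returns importlib.metadata.version("tensorrt") when TensorRT is installed.

# Minimal illustration of OmegaConf resolver interpolation (placeholder name and value).
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("trt_version_demo", lambda: "9.2.0")  # placeholder value

config = OmegaConf.create({"name": "tensorrt", "version": "${trt_version_demo:}"})
print(OmegaConf.to_container(config, resolve=True))
# {'name': 'tensorrt', 'version': '9.2.0'}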
1 change: 1 addition & 0 deletions optimum_benchmark/backends/tensorrt/utils.py
@@ -0,0 +1 @@
MODEL_TYPE_TO_TRTMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"}
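This mapping is consumed in TRTBackend.configure (see backend.py above): the dotted path is resolved lazily, so optimum-nvidia is only imported when the tensorrt backend is actually selected. A minimal sketch of that resolution, not from the commit and assuming optimum-nvidia is installed:

# How the string in MODEL_TYPE_TO_TRTMODEL becomes a class.
from hydra.utils import get_class

MODEL_TYPE_TO_TRTMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"}

trtmodel_class = get_class(MODEL_TYPE_TO_TRTMODEL["llama"])  # imports optimum-nvidia here
print(trtmodel_class.__name__)  # LlamaForCausalLM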
16 changes: 9 additions & 7 deletions optimum_benchmark/experiment.py
@@ -12,6 +12,7 @@
from .backends.onnxruntime.config import ORTConfig
from .backends.openvino.config import OVConfig
from .backends.pytorch.config import PyTorchConfig
from .backends.tensorrt.config import TRTConfig
from .backends.text_generation_inference.config import TGIConfig
from .benchmarks.inference.config import InferenceConfig
from .benchmarks.training.config import TrainingConfig
@@ -129,6 +130,7 @@ def __post_init__(self) -> None:
cs.store(name="experiment", node=ExperimentConfig)
#
cs.store(group="backend", name="openvino", node=OVConfig)
cs.store(group="backend", name="tensorrt", node=TRTConfig)
cs.store(group="backend", name="pytorch", node=PyTorchConfig)
cs.store(group="backend", name="onnxruntime", node=ORTConfig)
cs.store(group="backend", name="neural-compressor", node=INCConfig)
@@ -142,14 +144,14 @@ def __post_init__(self) -> None:
cs.store(group="launcher", name="torchrun", node=TorchrunConfig)


def run(experiment: "ExperimentConfig") -> "Benchmark":
def run(experiment: ExperimentConfig):
    # Instantiate the experiment config to trigger __post_init__
    experiment: ExperimentConfig = OmegaConf.to_object(experiment)
    OmegaConf.save(experiment, "hydra_config.yaml", resolve=True)

    # Allocate requested backend
    backend_factory: Type["Backend"] = get_class(experiment.backend._target_)
    backend: "Backend" = backend_factory(
    backend_factory: Type[Backend] = get_class(experiment.backend._target_)
    backend: Backend = backend_factory(
        task=experiment.task,
        model=experiment.model,
        device=experiment.device,
@@ -165,8 +167,8 @@
        raise e

    # Allocate requested benchmark
    benchmark_factory: Type["Benchmark"] = get_class(experiment.benchmark._target_)
    benchmark: "Benchmark" = benchmark_factory()
    benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_)
    benchmark: Benchmark = benchmark_factory()

    try:
        # Configure the benchmark
@@ -195,8 +197,8 @@ def run_with_launcher(experiment: DictConfig):
    # Instantiate the experiment config to trigger __post_init__
    experiment.launcher = OmegaConf.to_object(experiment.launcher)

    launcher_factory: Type["Launcher"] = get_class(experiment.launcher._target_)
    launcher: "Launcher" = launcher_factory()
    launcher_factory: Type[Launcher] = get_class(experiment.launcher._target_)
    launcher: Launcher = launcher_factory()

    try:
        launcher.configure(experiment.launcher)
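The one-line ConfigStore registration added to experiment.py is what lets `- backend: tensorrt` in examples/trt_llama.yaml resolve to the TRTConfig schema. A minimal sketch of that mechanism, not from the commit:

# Registering a structured config under the "backend" group, as the diff above does.
from hydra.core.config_store import ConfigStore

from optimum_benchmark.backends.tensorrt.config import TRTConfig

cs = ConfigStore.instance()
cs.store(group="backend", name="tensorrt", node=TRTConfig)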
10 changes: 10 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -7,6 +7,7 @@
_optimum_available = importlib.util.find_spec("optimum") is not None
_torch_available = importlib.util.find_spec("torch") is not None
_onnx_available = importlib.util.find_spec("onnx") is not None
_tensorrt_available = importlib.util.find_spec("tensorrt") is not None
_peft_available = importlib.util.find_spec("peft") is not None
_py3nvml_available = importlib.util.find_spec("py3nvml") is not None
_torch_distributed_available = importlib.util.find_spec("torch.distributed") is not None
@@ -18,6 +19,10 @@
_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


def is_tensorrt_available():
    return _tensorrt_available


def is_peft_available():
    return _peft_available

@@ -63,6 +68,11 @@ def torch_version():
    return importlib.metadata.version("torch")


def tesnorrt_version():
    if is_tensorrt_available():
        return importlib.metadata.version("tensorrt")


def onnxruntime_version():
    try:
        return "ort:" + importlib.metadata.version("onnxruntime")