Support of optimum-nvidia's trt-llm (#98)
IlyasMoutawwakil authored Jan 2, 2024
1 parent 487583f commit 9b3fc4d
Showing 13 changed files with 350 additions and 42 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/test_tensorrt_llm.yaml
@@ -0,0 +1,38 @@
name: TensorRT-LLM Tests

on:
  workflow_dispatch:
  push:
    branches: [main]
  pull_request:
    types: [opened, reopened, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  pull_image_and_run_gpu_tests:
    runs-on: hf-dgx-01
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Pull image
        run: docker pull huggingface/optimum-nvidia:latest

      - name: Run tests
        run: docker run
          --rm
          --net host
          --pid host
          --shm-size 64G
          --env USE_CUDA="1"
          --env USER_ID=$(id -u)
          --env GROUP_ID=$(id -g)
          --volume $(pwd):/workspace/optimum-benchmark
          --workdir /workspace/optimum-benchmark
          --gpus '"device=0,1"'
          --entrypoint /bin/bash
          huggingface/optimum-nvidia:latest
          -c "pip install -e .[test] && pytest -k 'tensorrt_llm' -x && chown -R $USER_ID:$GROUP_ID ."
3 changes: 2 additions & 1 deletion .gitignore
@@ -168,4 +168,5 @@ data/
version.txt

actions-runner/
experiments/
experiments/
.engine/
33 changes: 33 additions & 0 deletions examples/trt_llama.yaml
@@ -0,0 +1,33 @@
defaults:
  - backend: tensorrt # default backend
  - launcher: process # default launcher
  - benchmark: inference # default benchmark
  - experiment # inheriting experiment schema
  - _self_ # for hydra 1.1 compatibility
  - override hydra/job_logging: colorlog # colorful logging
  - override hydra/hydra_logging: colorlog # colorful logging

experiment_name: trt_llama
model: NousResearch/Llama-2-7b-hf
device: cuda

backend:
  continuous_isolation: false

benchmark:
  input_shapes:
    batch_size: 1
    sequence_length: 64
  new_tokens: 128

hydra:
  run:
    dir: runs/${experiment_name}
  sweep:
    dir: sweeps/${experiment_name}
  job:
    chdir: true
    env_set:
      OVERRIDE_BENCHMARKS: 1
      CUDA_VISIBLE_DEVICES: 0
      CUDA_DEVICE_ORDER: PCI_BUS_ID
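Like the other example configs, this file is meant to be launched through the optimum-benchmark CLI (optimum-benchmark --config-dir examples --config-name trt_llama). The snippet below is a rough sketch, not part of the commit, of an equivalent programmatic launch via Hydra's compose API; it assumes the script sits at the repository root and that CLI-only settings such as the hydra run directory are not needed.

# Hypothetical programmatic launch of examples/trt_llama.yaml (sketch, not from the commit).
from hydra import compose, initialize

from optimum_benchmark.experiment import run

with initialize(config_path="examples"):
    config = compose(config_name="trt_llama")
    run(config)  # instantiates TRTBackend and runs the inference benchmark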
9 changes: 6 additions & 3 deletions optimum_benchmark/backends/base.py
@@ -8,7 +8,6 @@
from typing import Any, Callable, ClassVar, Dict, Generic, Optional, Union

import numpy as np
from optimum.exporters import TasksManager
from transformers import (
    AutoConfig,
    AutoProcessor,
@@ -20,7 +19,11 @@
)
from transformers.utils import ModelOutput

from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS
from ..task_utils import (
    DIFFUSION_TASKS,
    TEXT_GENERATION_TASKS,
    get_model_class_for_task,
)
from .config import BackendConfigT
from .isolation_utils import check_cuda_continuous_isolation
from .utils import (
@@ -85,7 +88,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any
        else:
            self.pretrained_generation_config = None

        self.automodel_class = TasksManager.get_model_class_for_task(
        self.automodel_class = get_model_class_for_task(
            framework="pt", # TODO: make this configurable to add support for other frameworks
            task=self.task,
            library=self.library,
Empty file.
77 changes: 77 additions & 0 deletions optimum_benchmark/backends/tensorrt/backend.py
@@ -0,0 +1,77 @@
from logging import getLogger
from typing import Any, Dict

from hydra.utils import get_class
from transformers.utils import ModelOutput

from ..base import Backend
from .config import TRTConfig
from .utils import MODEL_TYPE_TO_TRTMODEL

LOGGER = getLogger("tensorrt")


class TRTBackend(Backend):
    NAME: str = "tensorrt"

    def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None:
        super().__init__(model, task, device, hub_kwargs)
        self.validate_device()
        self.validate_model_type()

    def validate_model_type(self) -> None:
        if self.model_type not in MODEL_TYPE_TO_TRTMODEL:
            raise NotImplementedError(f"TRTBackend does not support model_type {self.model_type}")

    def validate_device(self) -> None:
        if self.device != "cuda":
            raise NotImplementedError(f"TRTBackend only supports device cuda, got {self.device}")

    def configure(self, config: TRTConfig) -> None:
        super().configure(config)

        self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTMODEL[self.model_type])
        trtmodel_name = self.trtmodel_class.__name__
        LOGGER.info(
            f"\t+ Inferred TRTModel class {trtmodel_name} for task {self.task} and model_type {self.model_type}"
        )

        # TODO: save engine path for reuse, then maybe rebuild with max_prompt_size
        self.load_trtmodel_from_pretrained()

    @property
    def trtmodel_kwargs(self) -> Dict[str, Any]:
        return {}

    def load_trtmodel_from_pretrained(self) -> None:
        self.pretrained_model = self.trtmodel_class.from_pretrained(
            self.model,
            **self.trtmodel_kwargs,
            **self.hub_kwargs,
        )

    def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
        return self.pretrained_model.generate(
            input_ids=input.get("input_ids", None),
            attention_mask=input.get("attention_mask", None),
            max_new_tokens=1,
        )

    def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
        return self.pretrained_model.generate(
            # kwargs are spelled out explicitly to avoid naming conflicts
            input_ids=input.get("inputs", None),  # different API: expects `inputs` instead of `input_ids`
            attention_mask=input.get("attention_mask", None),
            max_new_tokens=kwargs.get("max_new_tokens", -1),
            min_length=kwargs.get("min_new_tokens", -1),  # different API: `min_length` instead of `min_new_tokens`
            num_beams=kwargs.get("num_beams", 1),
            temperature=kwargs.get("temperature", 1.0),
            top_k=kwargs.get("top_k", 50),
            top_p=kwargs.get("top_p", 1.0),
            repetition_penalty=kwargs.get("repetition_penalty", 1.0),
            length_penalty=kwargs.get("length_penalty", 1.0),
            seed=kwargs.get("seed", 42),
            pad_token_id=kwargs.get("pad_token_id", 0),
            bos_token_id=kwargs.get("bos_token_id", 1),
            eos_token_id=kwargs.get("eos_token_id", 2),
        )
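Normally the benchmark layer constructs and drives this backend, but a standalone smoke test can help when debugging. The sketch below is not from the commit; it assumes a CUDA GPU, optimum-nvidia installed, access to the Llama-2 checkpoint, and that the base BackendConfig fields (such as continuous_isolation) behave as in the example config above.

# Hypothetical standalone usage of TRTBackend; names mirror the diff above.
import torch

from optimum_benchmark.backends.tensorrt.backend import TRTBackend
from optimum_benchmark.backends.tensorrt.config import TRTConfig

backend = TRTBackend(
    model="NousResearch/Llama-2-7b-hf",
    task="text-generation",
    device="cuda",
    hub_kwargs={"revision": "main"},
)
backend.configure(TRTConfig(continuous_isolation=False))  # loads/builds the TRT-LLM engine

input_ids = torch.ones((1, 64), dtype=torch.int64, device="cuda")
outputs = backend.generate(
    {"inputs": input_ids, "attention_mask": torch.ones_like(input_ids)},
    {"max_new_tokens": 128, "num_beams": 1},
)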
18 changes: 18 additions & 0 deletions optimum_benchmark/backends/tensorrt/config.py
@@ -0,0 +1,18 @@
from dataclasses import dataclass
from logging import getLogger

from omegaconf import OmegaConf

from ...import_utils import tesnorrt_version
from ..config import BackendConfig

LOGGER = getLogger("tensorrt")

OmegaConf.register_new_resolver("tensorrt_version", tesnorrt_version)


@dataclass
class TRTConfig(BackendConfig):
    name: str = "tensorrt"
    version: str = "${tensorrt_version:}"
    _target_: str = "optimum_benchmark.backends.tensorrt.backend.TRTBackend"
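The version field above is filled in by the OmegaConf resolver registered in this file. The toy snippet below is not from the commit and uses a placeholder resolver name and value; the real resolver returns importlib.metadata.version("tensorrt") when TensorRT is installed.

# Minimal illustration of OmegaConf resolver interpolation (placeholder name and value).
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("trt_version_demo", lambda: "9.2.0")  # placeholder value

config = OmegaConf.create({"name": "tensorrt", "version": "${trt_version_demo:}"})
print(OmegaConf.to_container(config, resolve=True))
# {'name': 'tensorrt', 'version': '9.2.0'}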
1 change: 1 addition & 0 deletions optimum_benchmark/backends/tensorrt/utils.py
@@ -0,0 +1 @@
MODEL_TYPE_TO_TRTMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"}
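This mapping is consumed in TRTBackend.configure (see backend.py above): the dotted path is resolved lazily, so optimum-nvidia is only imported when the tensorrt backend is actually selected. A minimal sketch of that resolution, not from the commit and assuming optimum-nvidia is installed:

# How the string in MODEL_TYPE_TO_TRTMODEL becomes a class.
from hydra.utils import get_class

MODEL_TYPE_TO_TRTMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"}

trtmodel_class = get_class(MODEL_TYPE_TO_TRTMODEL["llama"])  # imports optimum-nvidia here
print(trtmodel_class.__name__)  # LlamaForCausalLM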
16 changes: 9 additions & 7 deletions optimum_benchmark/experiment.py
@@ -12,6 +12,7 @@
from .backends.onnxruntime.config import ORTConfig
from .backends.openvino.config import OVConfig
from .backends.pytorch.config import PyTorchConfig
from .backends.tensorrt.config import TRTConfig
from .backends.text_generation_inference.config import TGIConfig
from .benchmarks.inference.config import InferenceConfig
from .benchmarks.training.config import TrainingConfig
@@ -129,6 +130,7 @@ def __post_init__(self) -> None:
cs.store(name="experiment", node=ExperimentConfig)
#
cs.store(group="backend", name="openvino", node=OVConfig)
cs.store(group="backend", name="tensorrt", node=TRTConfig)
cs.store(group="backend", name="pytorch", node=PyTorchConfig)
cs.store(group="backend", name="onnxruntime", node=ORTConfig)
cs.store(group="backend", name="neural-compressor", node=INCConfig)
@@ -142,14 +144,14 @@ def __post_init__(self) -> None:
cs.store(group="launcher", name="torchrun", node=TorchrunConfig)


def run(experiment: "ExperimentConfig") -> "Benchmark":
def run(experiment: ExperimentConfig):
    # Instantiate the experiment config to trigger __post_init__
    experiment: ExperimentConfig = OmegaConf.to_object(experiment)
    OmegaConf.save(experiment, "hydra_config.yaml", resolve=True)

    # Allocate requested backend
    backend_factory: Type["Backend"] = get_class(experiment.backend._target_)
    backend: "Backend" = backend_factory(
    backend_factory: Type[Backend] = get_class(experiment.backend._target_)
    backend: Backend = backend_factory(
        task=experiment.task,
        model=experiment.model,
        device=experiment.device,
@@ -165,8 +167,8 @@
        raise e

    # Allocate requested benchmark
    benchmark_factory: Type["Benchmark"] = get_class(experiment.benchmark._target_)
    benchmark: "Benchmark" = benchmark_factory()
    benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_)
    benchmark: Benchmark = benchmark_factory()

    try:
        # Configure the benchmark
@@ -195,8 +197,8 @@ def run_with_launcher(experiment: DictConfig):
    # Instantiate the experiment config to trigger __post_init__
    experiment.launcher = OmegaConf.to_object(experiment.launcher)

    launcher_factory: Type["Launcher"] = get_class(experiment.launcher._target_)
    launcher: "Launcher" = launcher_factory()
    launcher_factory: Type[Launcher] = get_class(experiment.launcher._target_)
    launcher: Launcher = launcher_factory()

    try:
        launcher.configure(experiment.launcher)
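The one-line ConfigStore registration added to experiment.py is what lets `- backend: tensorrt` in examples/trt_llama.yaml resolve to the TRTConfig schema. A minimal sketch of that mechanism, not from the commit:

# Registering a structured config under the "backend" group, as the diff above does.
from hydra.core.config_store import ConfigStore

from optimum_benchmark.backends.tensorrt.config import TRTConfig

cs = ConfigStore.instance()
cs.store(group="backend", name="tensorrt", node=TRTConfig)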
10 changes: 10 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -7,6 +7,7 @@
_optimum_available = importlib.util.find_spec("optimum") is not None
_torch_available = importlib.util.find_spec("torch") is not None
_onnx_available = importlib.util.find_spec("onnx") is not None
_tensorrt_available = importlib.util.find_spec("tensorrt") is not None
_peft_available = importlib.util.find_spec("peft") is not None
_py3nvml_available = importlib.util.find_spec("py3nvml") is not None
_torch_distributed_available = importlib.util.find_spec("torch.distributed") is not None
@@ -18,6 +19,10 @@
_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


def is_tensorrt_available():
    return _tensorrt_available


def is_peft_available():
    return _peft_available

@@ -63,6 +68,11 @@ def torch_version():
    return importlib.metadata.version("torch")


def tesnorrt_version():
    if is_tensorrt_available():
        return importlib.metadata.version("tensorrt")


def onnxruntime_version():
    try:
        return "ort:" + importlib.metadata.version("onnxruntime")