From a38800198210c4295bed1109ab3678bf4339061d Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 11 Dec 2024 18:30:36 +0100
Subject: [PATCH] max batch size

---
 .../backends/tensorrt_llm/backend.py | 16 ++++++++--------
 .../backends/tensorrt_llm/config.py  |  9 ++++-----
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py
index 8cd046eb..60b82675 100644
--- a/optimum_benchmark/backends/tensorrt_llm/backend.py
+++ b/optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -128,20 +128,20 @@ def trtllm_kwargs(self):
         if self.config.gpus_per_node is not None:
             kwargs["gpus_per_node"] = self.config.gpus_per_node

-        if self.config.use_cuda_graph is not None:
-            kwargs["use_cuda_graph"] = self.config.use_cuda_graph
+        if self.config.max_batch_size is not None:
+            kwargs["max_batch_size"] = self.config.max_batch_size

-        if self.config.optimization_level is not None:
-            kwargs["optimization_level"] = self.config.optimization_level
+        if self.config.max_new_tokens is not None:
+            kwargs["max_new_tokens"] = self.config.max_new_tokens

         if self.config.max_prompt_length is not None:
             kwargs["max_prompt_length"] = self.config.max_prompt_length

-        if self.config.max_new_tokens is not None:
-            kwargs["max_new_tokens"] = self.config.max_new_tokens
+        if self.config.optimization_level is not None:
+            kwargs["optimization_level"] = self.config.optimization_level

-        if self.config.max_beam_width is not None:
-            kwargs["max_beam_width"] = self.config.max_beam_width
+        if self.config.use_cuda_graph is not None:
+            kwargs["use_cuda_graph"] = self.config.use_cuda_graph

         return kwargs

diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py
index 84d119af..2497d5d4 100644
--- a/optimum_benchmark/backends/tensorrt_llm/config.py
+++ b/optimum_benchmark/backends/tensorrt_llm/config.py
@@ -22,12 +22,11 @@ class TRTLLMConfig(BackendConfig):
     use_fp8: Optional[bool] = None
     world_size: Optional[int] = None
     gpus_per_node: Optional[int] = None
-    use_cuda_graph: Optional[bool] = None
-    optimization_level: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_new_tokens: Optional[int] = None
     max_batch_size: Optional[int] = None
-    max_beam_width: Optional[int] = None
+    max_new_tokens: Optional[int] = None
+    max_prompt_length: Optional[int] = None
+    optimization_level: Optional[int] = None
+    use_cuda_graph: Optional[bool] = None

     def __post_init__(self) -> None:
         super().__post_init__()
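Editor's note: for context, a minimal standalone sketch of the kwargs-forwarding pattern this patch touches. It is not the project's actual code: the TRTLLMConfig fields mirror the (reordered) optionals from config.py after this patch, the filtering is condensed into a dict comprehension instead of the explicit if-chain in backend.py, and the demo values are hypothetical.

    from dataclasses import dataclass, asdict
    from typing import Any, Dict, Optional


    @dataclass
    class TRTLLMConfig:
        # Mirrors the reordered optional fields from config.py after this patch;
        # max_beam_width is gone and max_batch_size is now forwarded as a kwarg.
        use_fp8: Optional[bool] = None
        world_size: Optional[int] = None
        gpus_per_node: Optional[int] = None
        max_batch_size: Optional[int] = None
        max_new_tokens: Optional[int] = None
        max_prompt_length: Optional[int] = None
        optimization_level: Optional[int] = None
        use_cuda_graph: Optional[bool] = None


    def trtllm_kwargs(config: TRTLLMConfig) -> Dict[str, Any]:
        # Same pattern as backend.py: forward only the fields the user actually
        # set, so backend defaults apply for everything left as None.
        return {name: value for name, value in asdict(config).items() if value is not None}


    # Hypothetical values, for illustration only.
    config = TRTLLMConfig(max_batch_size=8, max_new_tokens=128)
    print(trtllm_kwargs(config))  # {'max_batch_size': 8, 'max_new_tokens': 128}

Since asdict() preserves dataclass field order, the emitted kwargs follow the alphabetical ordering this patch establishes in both files.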