Fix trt llm #308

Merged 3 commits on Dec 12, 2024
5 changes: 2 additions & 3 deletions .github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -44,7 +44,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,tesnsorrt-llm]
pip install -e .[testing]

- name: Run tests
run: |
@@ -57,7 +57,6 @@ jobs:
}}
name: Run examples
run: |
rm -rf /root/.cache/huggingface
pytest tests/test_examples.py -x -s -k "cli and cuda and trt"

cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +83,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,tesnsorrt-llm]
pip install -e .[testing]

- name: Run tests (sequential)
run: |
4 changes: 2 additions & 2 deletions examples/cuda_tgi_llama.yaml
@@ -16,12 +16,12 @@ backend:
device: cuda
device_ids: 0
cuda_graphs: 0 # remove for better perf but bigger memory footprint
no_weights: false # investigate later
no_weights: true
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
2 changes: 1 addition & 1 deletion examples/cuda_trt_llama.yaml
@@ -23,7 +23,7 @@ backend:

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
2 changes: 1 addition & 1 deletion examples/cuda_vllm_llama.yaml
@@ -23,7 +23,7 @@ backend:

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
8 changes: 6 additions & 2 deletions optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -55,7 +55,6 @@ def download_pretrained_model(self) -> None:
def prepare_generation_config(self) -> None:
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None

model_cache_folder = f"models/{self.config.model}".replace("/", "--")
model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}"
snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}"
@@ -91,7 +90,6 @@ def create_no_weights_model(self) -> None:
self.logger.info("\t+ Modifying generation config for fixed length generation")
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None
self.logger.info("\t+ Saving new pretrained generation config")
self.generation_config.save_pretrained(save_directory=self.no_weights_model)

def load_trtllm_with_no_weights(self) -> None:
@@ -128,6 +126,12 @@ def trtllm_kwargs(self):
if self.config.gpus_per_node is not None:
kwargs["gpus_per_node"] = self.config.gpus_per_node

if self.config.max_input_len is not None:
kwargs["max_input_len"] = self.config.max_input_len

if self.config.max_output_len is not None:
kwargs["max_output_len"] = self.config.max_output_len

if self.config.max_batch_size is not None:
kwargs["max_batch_size"] = self.config.max_batch_size

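For context, the new max_input_len / max_output_len options follow the same guard pattern as the surrounding kwargs: a field left at None is simply not forwarded, so TensorRT-LLM keeps its own defaults. Below is a minimal runnable sketch of that pattern; the _Cfg dataclass and build_kwargs helper are illustrative stand-ins, not the backend's real classes.

from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class _Cfg:  # hypothetical stand-in for TRTLLMConfig
    max_input_len: Optional[int] = None
    max_output_len: Optional[int] = None
    max_batch_size: Optional[int] = None

def build_kwargs(cfg: _Cfg) -> Dict[str, Any]:
    kwargs: Dict[str, Any] = {}
    for name in ("max_input_len", "max_output_len", "max_batch_size"):
        value = getattr(cfg, name)
        if value is not None:  # unset fields are skipped, never passed as None
            kwargs[name] = value
    return kwargs

print(build_kwargs(_Cfg(max_input_len=64)))  # {'max_input_len': 64}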
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/tensorrt_llm/config.py
@@ -22,6 +22,8 @@ class TRTLLMConfig(BackendConfig):
use_fp8: Optional[bool] = None
world_size: Optional[int] = None
gpus_per_node: Optional[int] = None
max_input_len: Optional[int] = None
max_output_len: Optional[int] = None
max_batch_size: Optional[int] = None
max_new_tokens: Optional[int] = None
max_prompt_length: Optional[int] = None
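Because both new fields default to None, existing configs keep working unchanged; a benchmark only sets them when it wants to bound the engine's input and output lengths. A hypothetical illustration of how they might be set, expressed as a plain Python dict that mirrors the backend: section of the example YAML configs in this PR (values chosen for the example, not taken from the repository):

# Hypothetical backend overrides; keys map onto TRTLLMConfig fields.
backend_overrides = {
    "device": "cuda",
    "max_batch_size": 1,   # existing optional field
    "max_input_len": 64,   # new optional field added in this PR
    "max_output_len": 64,  # new optional field added in this PR
}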
34 changes: 17 additions & 17 deletions optimum_benchmark/task_utils.py
@@ -242,13 +242,13 @@ def infer_task_from_model_name_or_path(
elif library_name == "timm":
inferred_task_name = "image-classification"

elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]
elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
target_class_name = transformers_config["architectures"][0]

for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for _, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name:
for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
for _, model_class_name in model_mapping.items():
if target_class_name == model_class_name:
inferred_task_name = task_name
break
if inferred_task_name is not None:
@@ -257,13 +257,13 @@
if inferred_task_name is None:
raise KeyError(f"Could not find the proper task name for target class name {target_class_name}.")

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
target_class_name = transformers_config["architectures"][0]
elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]

for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
for _, model_class_name in model_mapping.items():
if target_class_name == model_class_name:
for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for _, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
inferred_task_name = task_name
break
if inferred_task_name is not None:
@@ -293,13 +293,17 @@ def infer_model_type_from_model_name_or_path(
timm_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = timm_config["architecture"]

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = transformers_config["model_type"]

elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]

for _, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for pipeline_type, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name:
if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
inferred_model_type = pipeline_type
break
if inferred_model_type is not None:
@@ -308,8 +312,4 @@
if inferred_model_type is None:
raise KeyError(f"Could not find the proper model type for target class name {target_class_name}.")

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = transformers_config["model_type"]

return inferred_model_type
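In both helpers the transformers branch is now checked before the diffusers branch, and the diffusers branch additionally accepts a substring match between the mapped pipeline class and the _class_name found in model_index.json. A small illustration of the relaxed check; the class names below are hypothetical examples, not read from any actual task mapping:

# Illustration only: names are hypothetical.
target_class_name = "StableDiffusionPipeline"  # e.g. _class_name from model_index.json
pipeline_class_name = "DiffusionPipeline"      # e.g. a class name from the mapping

old_match = target_class_name == pipeline_class_name
new_match = target_class_name == pipeline_class_name or pipeline_class_name in target_class_name
print(old_match, new_match)  # False True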