Fix VLLM data-parallel (#541)
* make bleurt lazy

* make tokenizer lazy too

* fix_ray

* fix tensor_parallel > 1

* remove debug statements

* bump vllm

---------

Co-authored-by: Hynek Kydlicek <kydlicek.hynek@huggingface.co>
hynky1999 and Hynek Kydlicek authored Feb 6, 2025
1 parent 3c9b0c9 commit 86f6225
Showing 2 changed files with 6 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -92,7 +92,7 @@ nanotron = [
     "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm", "ray", "more_itertools"]
+vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
 quality = ["ruff==v0.2.2","pre-commit"]
 tests = ["pytest==7.4.0"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math]"]
7 changes: 5 additions & 2 deletions src/lighteval/models/vllm/vllm_model.py
@@ -111,6 +111,7 @@ def __init__(
         self._config = config
         self.use_chat_template = config.use_chat_template
         self.data_parallel_size = int(config.data_parallel_size)
+        self.tensor_parallel_size = int(config.tensor_parallel_size)
 
         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
@@ -184,7 +185,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "seed": 1234,
         }
         if int(config.data_parallel_size) > 1:
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
             return None
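
For context, a minimal sketch (not lighteval's actual code; the checkpoint name is illustrative) of what the model_args on the data-parallel path amount to: on vLLM >= 0.7 the Ray executor is requested through distributed_executor_backend="ray" rather than the old worker_use_ray flag, and since _create_auto_model returns None here, each Ray task later builds its own engine from the same args.

```python
# Minimal sketch, assuming an illustrative checkpoint; lighteval fills these
# values from its own VLLMModelConfig.
from vllm import LLM

model_args = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # illustrative, not from the diff
    "tensor_parallel_size": 1,
    "seed": 1234,
    "distributed_executor_backend": "ray",  # replaces worker_use_ray=True on newer vLLM
}

# With data parallelism, the engine is not built in _create_auto_model;
# each Ray worker constructs its own instance from these args instead.
llm = LLM(**model_args)
```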

@@ -331,7 +332,9 @@ def _generate(
             # see https://github.com/vllm-project/vllm/issues/973
             # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
             # but then tensor_parallel breaks
-            @ray.remote
+            # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
+            # as VLLM complains about no GPUs available.
+            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
                 return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
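
As a usage note, the sketch below shows one way a Ray-decorated function like run_inference_one_model can be fanned out across data-parallel replicas and its per-shard outputs gathered back. The dispatch_data_parallel helper and its signature are hypothetical, for illustration only, not lighteval's actual dispatch code.

```python
# Hypothetical helper sketching the data-parallel fan-out around the
# @ray.remote function above; not lighteval's actual implementation.
import itertools

import ray


def dispatch_data_parallel(run_inference_one_model, model_args, sampling_params, requests, data_parallel_size):
    # Split the tokenized requests round-robin into one shard per replica.
    shards = [requests[i::data_parallel_size] for i in range(data_parallel_size)]
    # Launch one remote vLLM engine per shard; Ray places each on its own GPU(s).
    futures = [run_inference_one_model.remote(model_args, sampling_params, shard) for shard in shards]
    # Wait for every replica, then flatten the per-shard outputs in shard order.
    return list(itertools.chain.from_iterable(ray.get(futures)))
```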