[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (vllm-project#5347)
youkaichao authored Jun 8, 2024
1 parent c96fc06 commit 9fb900f
Showing 14 changed files with 48 additions and 61 deletions.
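Every test file below applies the same pattern: instead of building the HuggingFace reference model, generating, and then relying on del hf_model (and the runner's __del__) to release GPU memory, the runner is now used as a context manager so cleanup runs deterministically even when generation or a later assertion raises. A condensed before/after sketch of that pattern, reusing the fixture names from the tests in this diff:

# Before: cleanup depends on del plus garbage collection; if generate_greedy
# (or a later assertion) raises, the HF model can keep holding GPU memory
# and destabilize the tests that follow.
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model

# After: __exit__ runs when the block ends, even on an exception, so the
# model is dropped and cleanup() is called before the vLLM half of the test.
with hf_runner(model, dtype=dtype) as hf_model:
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)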
5 changes: 2 additions & 3 deletions tests/basic_correctness/test_basic_correctness.py
@@ -43,9 +43,8 @@ def test_models(
if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
pytest.skip("Skipping non-eager test for FlashInferBackend.")

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(model,
dtype=dtype,
5 changes: 2 additions & 3 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -40,9 +40,8 @@ def test_models(
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(
model,
17 changes: 7 additions & 10 deletions tests/basic_correctness/test_preemption.py
@@ -43,9 +43,8 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(
model,
@@ -82,9 +81,8 @@ def test_preemption(
) -> None:
"""By default, recompute preemption is enabled"""

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(
model,
@@ -137,10 +135,9 @@ def test_swap(
) -> None:
"""Use beam search enables swapping."""
example_prompts = example_prompts[:1]
- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                            max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                max_tokens)

vllm_model = vllm_runner(
model,
5 changes: 4 additions & 1 deletion tests/conftest.py
@@ -354,7 +354,10 @@ def generate_greedy_logprobs_limit(
def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
return self.model.encode(prompts)

- def __del__(self):
+ def __enter__(self):
+     return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
del self.model
cleanup()

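For illustration only, a minimal standalone sketch of why the __enter__/__exit__ hooks added above are more robust than __del__: __exit__ fires as soon as the with block is left, including when the body raises, whereas __del__ only runs whenever the garbage collector happens to finalize the object. ToyRunner is a hypothetical stand-in, not part of the vLLM test suite:

class ToyRunner:
    """Toy stand-in for the HfRunner fixture patched above."""

    def __init__(self):
        self.model = object()  # stands in for the loaded HF model

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Executed as soon as the with block exits, even on an exception;
        # the real fixture also calls cleanup() here to free GPU memory.
        del self.model


try:
    with ToyRunner() as runner:
        assert runner.model is not None
        raise RuntimeError("simulated test failure")  # __exit__ still runs
except RuntimeError:
    pass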
5 changes: 2 additions & 3 deletions tests/distributed/test_basic_distributed_correctness.py
@@ -42,9 +42,8 @@ def test_models(
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
enforce_eager = backend_by_env_var == "FLASHINFER"

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(
model,
5 changes: 2 additions & 3 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -45,9 +45,8 @@ def test_models(
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(
model,
5 changes: 2 additions & 3 deletions tests/models/test_big_models.py
@@ -34,9 +34,8 @@ def test_models(
dtype: str,
max_tokens: int,
) -> None:
- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
5 changes: 2 additions & 3 deletions tests/models/test_embedding.py
@@ -28,9 +28,8 @@ def test_models(
model: str,
dtype: str,
) -> None:
- hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True)
- hf_outputs = hf_model.encode(example_prompts)
- del hf_model
+ with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+     hf_outputs = hf_model.encode(example_prompts)

vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.encode(example_prompts)
9 changes: 4 additions & 5 deletions tests/models/test_llava.py
@@ -84,11 +84,10 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
"""
model_id, vlm_config = model_and_config

- hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
- hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                       max_tokens,
-                                       images=hf_images)
- del hf_model
+ with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+     hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                           max_tokens,
+                                           images=hf_images)

vllm_image_prompts = [
p.replace("<image>", "<image>" * vlm_config.image_feature_size)
7 changes: 3 additions & 4 deletions tests/models/test_mistral.py
@@ -26,10 +26,9 @@ def test_models(
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy_logprobs_limit(
-     example_prompts, max_tokens, num_logprobs)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy_logprobs_limit(
+         example_prompts, max_tokens, num_logprobs)

vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
5 changes: 2 additions & 3 deletions tests/models/test_models.py
@@ -34,9 +34,8 @@ def test_models(
# To pass the small model tests, we need full precision.
assert dtype == "float"

- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
7 changes: 3 additions & 4 deletions tests/samplers/test_beam_search.py
@@ -30,10 +30,9 @@ def test_beam_search_single_input(
beam_width: int,
) -> None:
example_prompts = example_prompts[:1]
- hf_model = hf_runner(model, dtype=dtype)
- hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                            max_tokens)
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                max_tokens)

vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
11 changes: 5 additions & 6 deletions tests/samplers/test_logprobs.py
@@ -32,12 +32,11 @@ def test_get_prompt_logprobs(
max_num_batched_tokens = chunked_prefill_token_size

max_tokens = 5
- hf_model = hf_runner(model, dtype=dtype)
- hf_logprobs = hf_model.generate_greedy_logprobs(
-     example_prompts,
-     max_tokens=max_tokens,
- )
- del hf_model
+ with hf_runner(model, dtype=dtype) as hf_model:
+     hf_logprobs = hf_model.generate_greedy_logprobs(
+         example_prompts,
+         max_tokens=max_tokens,
+     )

vllm_model = vllm_runner(
model,
18 changes: 8 additions & 10 deletions tests/tensorizer_loader/test_tensorizer.py
@@ -116,16 +116,14 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(

def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
tmp_path):
- hf_model = hf_runner(model_ref)
- model_path = tmp_path / (model_ref + ".tensors")
- max_tokens = 50
- outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
- with open_stream(model_path, "wb+") as stream:
-     serializer = TensorSerializer(stream)
-     serializer.write_module(hf_model.model)
- del hf_model
- gc.collect()
- torch.cuda.empty_cache()
+ with hf_runner(model_ref) as hf_model:
+     model_path = tmp_path / (model_ref + ".tensors")
+     max_tokens = 50
+     outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+     with open_stream(model_path, "wb+") as stream:
+         serializer = TensorSerializer(stream)
+         serializer.write_module(hf_model.model)

loaded_hf_model = vllm_runner(model_ref,
load_format="tensorizer",
model_loader_extra_config=TensorizerConfig(
