pass big modeling tests with bigger atol/rtol for accelerators
IlyasMoutawwakil committed Feb 6, 2025
1 parent d3e24c5 commit 00cc283
Showing 1 changed file with 27 additions and 24 deletions.
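The diff below replaces bare torch.allclose asserts with torch.testing.assert_close, using tolerances that widen on accelerator devices. A minimal sketch of the pattern, assuming a simplified device check (the real test module derives torch_device from its own utilities); note that in the old calls the positional 1e-4 and 1e-5 bound to allclose's rtol and atol parameters, in that order:

import torch

# Illustrative stand-in for the test suite's device detection.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# Tight tolerances on CPU, looser ones on accelerators (the pattern this commit adds).
ATOL = 1e-5 if torch_device == "cpu" else 1e-4
RTOL = 1e-5 if torch_device == "cpu" else 1e-4

expected = torch.randn(4)
output = expected.clone()

# Old style: torch.allclose(input, other, rtol, atol) returns a bool, so a
# hand-rolled failure message had to be attached to the assert.
assert torch.allclose(expected, output, 1e-4, 1e-5), f"Expected: {expected}, Actual: {output}"

# New style: assert_close raises an AssertionError that already reports the
# mismatched elements and the greatest absolute/relative differences.
torch.testing.assert_close(expected, output, atol=ATOL, rtol=RTOL)

One detail worth noting: assert_close treats atol and rtol as keyword-only arguments and expects them to be passed together, which avoids the positional-order pitfall of allclose.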
51 changes: 27 additions & 24 deletions tests/test_big_modeling.py
@@ -50,6 +50,9 @@
logger = logging.getLogger(__name__)
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"

+ ATOL = 1e-5 if torch_device == "cpu" else 1e-4
+ RTOL = 1e-5 if torch_device == "cpu" else 1e-4


class ModelForTest(nn.Module):
def __init__(self):
@@ -199,14 +202,14 @@ def test_cpu_offload(self):

cpu_offload(model, execution_device=device)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

# Clean up for next test.
remove_hook_from_submodules(model)

cpu_offload(model, execution_device=device, offload_buffers=True)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

def test_cpu_offload_with_unused_submodules(self):
model = ModelWithUnusedSubModulesForTest()
@@ -217,7 +220,7 @@ def test_cpu_offload_with_unused_submodules(self):

cpu_offload(model, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"])
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

# Clean up for next test.
remove_hook_from_submodules(model)
@@ -229,7 +232,7 @@ def test_cpu_offload_with_unused_submodules(self):
preload_module_classes=["ModuleWithUnusedSubModules"],
)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@slow
@require_non_cpu
@@ -252,15 +255,15 @@ def test_disk_offload(self):
with TemporaryDirectory() as tmp_dir:
disk_offload(model, tmp_dir, execution_device=device)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

# Clean up for next test.
remove_hook_from_submodules(model)

with TemporaryDirectory() as tmp_dir:
disk_offload(model, tmp_dir, execution_device=device, offload_buffers=True)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

def test_disk_offload_with_unused_submodules(self):
model = ModelWithUnusedSubModulesForTest()
@@ -274,7 +277,7 @@ def test_disk_offload_with_unused_submodules(self):
model, tmp_dir, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"]
)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

# Clean up for next test.
remove_hook_from_submodules(model)
@@ -288,7 +291,7 @@ def test_disk_offload_with_unused_submodules(self):
preload_module_classes=["ModuleWithUnusedSubModules"],
)
output = model(x)
- assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@slow
@require_non_cpu
@@ -325,8 +328,8 @@ def test_dispatch_model_and_remove_hook(self):
cm.records[0].message,
)
output_bis = model(x.to(torch_device))
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
- assert torch.allclose(expected, output_bis.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)
+ torch.testing.assert_close(expected, output_bis.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model(self):
@@ -339,7 +342,7 @@ def test_dispatch_model(self):
with TemporaryDirectory() as tmp_dir:
dispatch_model(model, device_map, offload_dir=tmp_dir)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model_with_non_persistent_buffers(self):
@@ -351,7 +354,7 @@ def test_dispatch_model_with_non_persistent_buffers(self):
with TemporaryDirectory() as tmp_dir:
dispatch_model(model, device_map, offload_dir=tmp_dir, offload_buffers=True)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model_tied_weights(self):
@@ -412,7 +415,7 @@ def test_dispatch_model_tied_weights_memory(self):

with torch.no_grad():
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_cuda
def test_dispatch_model_tied_weights_memory_with_nested_offload_cpu(self):
@@ -491,7 +494,7 @@ def forward(self, x):
except Exception as e:
raise e

- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

torch.cuda.empty_cache()

@@ -587,7 +590,7 @@ def forward(self, x):
except Exception as e:
raise e

- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

torch.cuda.empty_cache()

@@ -619,7 +622,7 @@ def test_dispatch_model_multi_devices(self):
with TemporaryDirectory() as tmp_dir:
dispatch_model(model, device_map, offload_dir=tmp_dir)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model_copy(self):
@@ -638,7 +641,7 @@ def test_dispatch_model_copy(self):
assert original_model.id == original_output_id
assert copied_model.id == copied_output_id
assert copied_model.linear1.forward is not original_model.linear1.forward
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model_move_offloaded_model(self):
@@ -718,7 +721,7 @@ def test_dispatch_model_with_unused_submodules(self):
model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_multi_device
def test_dispatch_model_with_unused_submodules_multi_device(self):
@@ -733,7 +736,7 @@ def test_dispatch_model_with_unused_submodules_multi_device(self):
model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_dispatch_model_force_hooks(self):
@@ -745,7 +748,7 @@ def test_dispatch_model_force_hooks(self):

dispatch_model(model, device_map, force_hooks=True)
output = model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_load_checkpoint_and_dispatch(self):
@@ -767,7 +770,7 @@ def test_load_checkpoint_and_dispatch(self):
assert new_model.linear2.weight.device == torch.device(torch_device)

output = new_model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_multi_device
def test_load_checkpoint_and_dispatch_multi_device(self):
@@ -791,7 +794,7 @@ def test_load_checkpoint_and_dispatch_multi_device(self):
assert new_model.linear4.weight.device == torch.device(torch_device.replace(":0", ":1"))

output = new_model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
@@ -817,7 +820,7 @@ def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
assert new_model.linear4.linear.weight.device == torch.device(torch_device)

output = new_model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_multi_device
def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
@@ -843,7 +846,7 @@ def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
assert new_model.linear4.linear.weight.device == torch.device(torch_device.replace(":0", ":1"))

output = new_model(x)
- assert torch.allclose(expected, output.cpu(), atol=1e-5)
+ torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

@require_non_cpu
def test_cpu_offload_with_hook(self):
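For reference, a sketch of the failure path (values illustrative): assert_close raises with diagnostics that the removed f-string messages used to provide by hand.

import torch

try:
    torch.testing.assert_close(
        torch.tensor([1.0, 2.0, 3.0]),
        torch.tensor([1.0, 2.0, 3.5]),
        atol=1e-4,
        rtol=1e-4,
    )
except AssertionError as err:
    # The message reports the number of mismatched elements and the greatest
    # absolute and relative differences.
    print(err)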
