Fdy/fix ascendspeed #744

Merged 6 commits on Mar 25, 2024
25 changes: 22 additions & 3 deletions dipu/tests/python/individual_scripts/test_rt_ddp.py
@@ -289,11 +289,28 @@ def demo_allgather_gloo(rank, world_size, port):
    cleanup()


def test_special_group_stuck(rank, world_size):
    import torch_dipu

    print(f"test special group stuck on rank {rank} ")

    setup(rank, world_size)

    # the ranks check requires len(ranks) <= world_size
    if world_size >= 2:
        # The torch 2.0 gloo process group has this limitation: passing in a
        # duplicated rank list hangs. But Huawei (AscendSpeed) does pass such lists.
        ranks_dup = [rank, rank]
        group = torch.distributed.new_group(ranks_dup)
        print(group)
        dist.destroy_process_group(group)

    cleanup()


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    # world_size = 1
    # demo_allreduce(0, world_size)
    # demo_basic_ddp(0, world_size)

    port = random.randint(10000, 60000)

    world_size = 1
@@ -311,3 +328,5 @@ def demo_allgather_gloo(rank, world_size, port):
    # run_demo(demo_bcast, world_size, port)

    # run_demo(demo_model_parallel, world_size)

    # run_demo(test_special_group_stuck, world_size)
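
For reference, here is a minimal standalone sketch (hypothetical, not part of the PR) of the duplicated-rank scenario the new test guards against; the port, process count, and gloo backend choice are assumptions:

# dup_rank_repro.py -- hypothetical sketch; on plain torch 2.0 + gloo the duplicated
# rank list below can hang during group creation, which is what the dipu patch dedups away.
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29513"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    ranks_dup = [rank, rank]           # the invalid list AscendSpeed-style callers pass
    group = dist.new_group(ranks_dup)  # may hang on torch 2.0 gloo unless deduplicated
    dist.destroy_process_group(group)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)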
3 changes: 3 additions & 0 deletions dipu/tests/python/individual_scripts/test_rt_tensor.py
@@ -152,6 +152,9 @@ def test_type():
    res = isinstance(s4, torch.cuda.FloatTensor)
    assert res == True

    assert dev1 in s1.type()
    assert s1.device.type == dev1


def test_device_copy():
    import torch_dipu
33 changes: 33 additions & 0 deletions dipu/torch_dipu/dipu/distributed.py
@@ -86,7 +86,40 @@ def _wrap_get_backend(group: Optional[ProcessGroup] = None) -> str:
    return ret


# DICL does not support coalescing yet, so torch 2.1's batch_isend_irecv crashes.
# TODO: remove this wrapper once coalescing is supported.
def _wrap_batch_isend_irecv(p2p_op_list):
    dist.distributed_c10d._check_p2p_op_list(p2p_op_list)
    reqs = []
    for p2p_op in p2p_op_list:
        work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
        if work:
            reqs.append(work)
    return reqs


# Huawei AscendSpeed passes rank lists like [0, 0], which makes gloo process group
# creation fail in torch 2.0. It is really Huawei's problem (such a list is not
# valid), but there is nothing else we can do.
# torch 2.1 does not create a gloo sub-device process group when creating a DICL pg,
# so pg creation does not hang there; we keep that behavior. But even once created,
# the group still hangs when trying to do any real communication.
_raw_new_group = dist.new_group


def _wrap_new_group(
    ranks=None, timeout=default_pg_timeout, backend=None, pg_options=None
):
    ranks = list(set(ranks))  # dedup
    return _raw_new_group(ranks, timeout, backend, pg_options)

Collaborator: Could we add typing here? The same goes for other places.

Collaborator (Author): This is kept consistent with existing PyTorch: whatever it already has we add; for what it doesn't have, let's just follow it for now.


def apply_dist_patch():
    dist.get_backend = _wrap_get_backend
    dist.init_process_group = _wrap_init_process_groups
    dist.ProcessGroup._register_backend = _wrapped_register_backend
    # remove the batch_isend_irecv wrapper once coalescing is ready
    if dipu.get_dipu_torch_version() != dipu.torch_ver_200:
        dist.batch_isend_irecv = _wrap_batch_isend_irecv

    if dipu.get_dipu_torch_version() == dipu.torch_ver_200:
        dist.new_group = _wrap_new_group
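
As a usage note, here is a hedged sketch of the call path served by the non-coalescing batch_isend_irecv wrapper (a hypothetical ring exchange; tensor shape, device, and rank layout are assumptions, not part of this PR):

# Hypothetical ring exchange; with the dipu patch on torch != 2.0 each P2P op is
# issued individually instead of inside a coalescing context, since DICL does not
# support coalescing yet.
import torch
import torch.distributed as dist


def ring_exchange(rank, world_size):
    send_buf = torch.full((4,), float(rank), device="cuda")
    recv_buf = torch.empty(4, device="cuda")
    ops = [
        dist.P2POp(dist.isend, send_buf, (rank + 1) % world_size),
        dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
    ]
    reqs = dist.batch_isend_irecv(ops)
    for req in reqs:
        req.wait()
    return recv_buf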
14 changes: 13 additions & 1 deletion dipu/torch_dipu/dipu/tensor.py
@@ -1,7 +1,7 @@
# Copyright (c) 2023, DeepLink.
import torch

from .device import __diputype__
from .device import __diputype__, __dipu_device_type__
from torch_dipu import _C, mockcuda


@@ -16,8 +16,20 @@ def __set_default_tensor_type(type=torch.FloatTensor):
    _default_tensor_type = type


_raw_tensor_type = torch.Tensor.type


def _wrap_tensor_type(self, *args, **kwargs):
    ret = _raw_tensor_type(self, *args, **kwargs)
    if isinstance(ret, str):
        return ret.replace(__dipu_device_type__, "cuda")
    else:
        return ret


# Needs enhancement; it seems the tensor definition itself needs to be changed.
def apply_tensor_type_patch():
    torch.set_default_tensor_type = __set_default_tensor_type
    if mockcuda:
        _C._mockCudaTensor()
        torch.Tensor.type = _wrap_tensor_type
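
For illustration, a hedged sketch of the behavior the Tensor.type wrapper is meant to provide under mockcuda (the tensor shape and the dtype conversion are assumptions mirroring the updated test):

# Hypothetical usage: with mockcuda on, the wrapped Tensor.type() reports the device
# as "cuda" in the returned type string instead of the native dipu device type.
import torch
import torch_dipu

t = torch.randn(2, 3).cuda()         # routed to the dipu device when mockcuda is enabled
print(t.type())                      # e.g. "torch.cuda.FloatTensor"
assert "cuda" in t.type()
s = t.type(torch.cuda.DoubleTensor)  # passing a type converts and returns a tensor, not a str
assert isinstance(s, torch.cuda.DoubleTensor)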