Fdy/fix ascendspeed #744

Merged 6 commits on Mar 25, 2024
25 changes: 22 additions & 3 deletions dipu/tests/python/individual_scripts/test_rt_ddp.py
@@ -289,11 +289,28 @@ def demo_allgather_gloo(rank, world_size, port):
    cleanup()


def test_special_group_stuck(rank, world_size):
    import torch_dipu

    print(f"test special group stuck on rank {rank} ")

    setup(rank, world_size)

    # the ranks check requires len(ranks) <= world_size
    if world_size >= 2:
        # The torch 2.0 gloo process group has this limitation: passing in a
        # duplicated rank list hangs. But Huawei (AscendSpeed) does pass such lists.
        ranks_dup = [rank, rank]
        group = torch.distributed.new_group(ranks_dup)
        print(group)
        dist.destroy_process_group(group)

    cleanup()


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    # world_size = 1
    # demo_allreduce(0, world_size)
    # demo_basic_ddp(0, world_size)

    port = random.randint(10000, 60000)

    world_size = 1
@@ -311,3 +328,5 @@ def demo_allgather_gloo(rank, world_size, port):
    # run_demo(demo_bcast, world_size, port)

    # run_demo(demo_model_parallel, world_size)

    # run_demo(test_special_group_stuck, world_size)
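
For reference, here is a minimal standalone sketch (hypothetical, not part of the PR) of the duplicated-rank scenario the new test guards against; the port, process count, and gloo backend choice are assumptions:

# dup_rank_repro.py -- hypothetical sketch; on plain torch 2.0 + gloo the duplicated
# rank list below can hang during group creation, which is what the dipu patch dedups away.
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29513"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    ranks_dup = [rank, rank]           # the invalid list AscendSpeed-style callers pass
    group = dist.new_group(ranks_dup)  # may hang on torch 2.0 gloo unless deduplicated
    dist.destroy_process_group(group)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)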
3 changes: 3 additions & 0 deletions dipu/tests/python/individual_scripts/test_rt_tensor.py
@@ -152,6 +152,9 @@ def test_type():
    res = isinstance(s4, torch.cuda.FloatTensor)
    assert res == True

    assert dev1 in s1.type()
    assert s1.device.type == dev1


def test_device_copy():
    import torch_dipu
33 changes: 33 additions & 0 deletions dipu/torch_dipu/dipu/distributed.py
@@ -86,7 +86,40 @@ def _wrap_get_backend(group: Optional[ProcessGroup] = None) -> str:
    return ret


# DICL does not support coalescing yet, so torch 2.1's batch_isend_irecv crashes.
# TODO: remove this wrapper once coalescing is supported.
def _wrap_batch_isend_irecv(p2p_op_list):
    dist.distributed_c10d._check_p2p_op_list(p2p_op_list)
    reqs = []
    for p2p_op in p2p_op_list:
        work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
        if work:
            reqs.append(work)
    return reqs


# Huawei AscendSpeed passes rank lists like [0, 0], which makes gloo process group
# creation fail in torch 2.0. It is really Huawei's problem (such a list is not
# valid), but there is nothing else we can do.
# torch 2.1 does not create a gloo sub-device process group when creating a DICL pg,
# so pg creation does not hang there; we keep that behavior. But even once created,
# the group still hangs when trying to do any real communication.
_raw_new_group = dist.new_group


def _wrap_new_group(
    ranks=None, timeout=default_pg_timeout, backend=None, pg_options=None
):
    ranks = list(set(ranks))  # dedup
    return _raw_new_group(ranks, timeout, backend, pg_options)

Collaborator: Could we add typing here? The same goes for other places.

Collaborator (Author): This is kept consistent with existing PyTorch: whatever it already has we add; for what it doesn't have, let's just follow it for now.


def apply_dist_patch():
    dist.get_backend = _wrap_get_backend
    dist.init_process_group = _wrap_init_process_groups
    dist.ProcessGroup._register_backend = _wrapped_register_backend
    # remove the batch_isend_irecv wrapper once coalescing is ready
    if dipu.get_dipu_torch_version() != dipu.torch_ver_200:
        dist.batch_isend_irecv = _wrap_batch_isend_irecv

    if dipu.get_dipu_torch_version() == dipu.torch_ver_200:
        dist.new_group = _wrap_new_group
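
As a usage note, here is a hedged sketch of the call path served by the non-coalescing batch_isend_irecv wrapper (a hypothetical ring exchange; tensor shape, device, and rank layout are assumptions, not part of this PR):

# Hypothetical ring exchange; with the dipu patch on torch != 2.0 each P2P op is
# issued individually instead of inside a coalescing context, since DICL does not
# support coalescing yet.
import torch
import torch.distributed as dist


def ring_exchange(rank, world_size):
    send_buf = torch.full((4,), float(rank), device="cuda")
    recv_buf = torch.empty(4, device="cuda")
    ops = [
        dist.P2POp(dist.isend, send_buf, (rank + 1) % world_size),
        dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
    ]
    reqs = dist.batch_isend_irecv(ops)
    for req in reqs:
        req.wait()
    return recv_buf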
14 changes: 13 additions & 1 deletion dipu/torch_dipu/dipu/tensor.py
@@ -1,7 +1,7 @@
# Copyright (c) 2023, DeepLink.
import torch

from .device import __diputype__
from .device import __diputype__, __dipu_device_type__
from torch_dipu import _C, mockcuda


@@ -16,8 +16,20 @@ def __set_default_tensor_type(type=torch.FloatTensor):
    _default_tensor_type = type


_raw_tensor_type = torch.Tensor.type


def _wrap_tensor_type(self, *args, **kwargs):
    ret = _raw_tensor_type(self, *args, **kwargs)
    if isinstance(ret, str):
        return ret.replace(__dipu_device_type__, "cuda")
    else:
        return ret


# Needs enhancement; it seems the tensor definition itself needs to be changed.
def apply_tensor_type_patch():
    torch.set_default_tensor_type = __set_default_tensor_type
    if mockcuda:
        _C._mockCudaTensor()
        torch.Tensor.type = _wrap_tensor_type
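
For illustration, a hedged sketch of the behavior the Tensor.type wrapper is meant to provide under mockcuda (the tensor shape and the dtype conversion are assumptions mirroring the updated test):

# Hypothetical usage: with mockcuda on, the wrapped Tensor.type() reports the device
# as "cuda" in the returned type string instead of the native dipu device type.
import torch
import torch_dipu

t = torch.randn(2, 3).cuda()         # routed to the dipu device when mockcuda is enabled
print(t.type())                      # e.g. "torch.cuda.FloatTensor"
assert "cuda" in t.type()
s = t.type(torch.cuda.DoubleTensor)  # passing a type converts and returns a tensor, not a str
assert isinstance(s, torch.cuda.DoubleTensor)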