feat(DIPU): add ExpandableSegment for vendor CUDA (#922)
* feat(DIPU): add ExpandableSegment for vendor CUDA
* clang-format
* fit AscendExpandableSegment for peers
* del addPeerAccess and add copy code comment
* change file name
Showing 6 changed files with 237 additions and 8 deletions.
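At its core, the new vendor file uses the CUDA virtual memory management driver API: it reserves a large virtual address range once, then maps and unmaps physical allocation handles into that range on demand, so a segment can grow or shrink without its base pointer ever changing. Below is a minimal, self-contained sketch of that driver-API sequence, independent of the DIPU classes in the diff; the CU_CHECK macro, the function name growOnce, and the reservation size are illustrative assumptions, not code from the commit.

// Standalone sketch of the CUDA VMM calls an expandable segment relies on.
// Assumes a current CUDA context on `device`; CU_CHECK is a hypothetical
// error-checking macro, not part of this commit.
#include <cstddef>
#include <cuda.h>

#define CU_CHECK(expr)                  \
  do {                                  \
    CUresult r_ = (expr);               \
    if (r_ != CUDA_SUCCESS) return r_;  \
  } while (0)

CUresult growOnce(int device, size_t segment_size) {
  // segment_size must be a multiple of the allocation granularity,
  // see cuMemGetAllocationGranularity.
  // 1. Reserve virtual address space only; no physical memory is used yet.
  CUdeviceptr base = 0;
  CU_CHECK(cuMemAddressReserve(&base, segment_size * 8, 0, 0, 0));

  // 2. Create one physical allocation handle of segment_size bytes.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;
  CUmemGenericAllocationHandle handle = 0;
  CU_CHECK(cuMemCreate(&handle, segment_size, &prop, 0));

  // 3. Map it at the start of the reserved range and enable read/write access.
  CU_CHECK(cuMemMap(base, segment_size, 0, handle, 0));
  CUmemAccessDesc desc = {};
  desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc.location.id = device;
  desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CU_CHECK(cuMemSetAccess(base, segment_size, &desc, 1));

  // ... the range [base, base + segment_size) is now usable ...

  // 4. Tear down in reverse order.
  CU_CHECK(cuMemUnmap(base, segment_size));
  CU_CHECK(cuMemRelease(handle));
  CU_CHECK(cuMemAddressFree(base, segment_size * 8));
  return CUDA_SUCCESS;
}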
dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDAExpandableSegment.cpp (new file, 189 additions, 0 deletions)
// Copyright (c) 2024, DeepLink.
#include <functional>  // std::function, used by forEachAllocatedRange below
#include <iostream>
#include <vector>

#include "csrc_dipu/runtime/core/allocator/ExpandableSegment.h"
#include "csrc_dipu/runtime/devproxy/deviceproxy.h"
#include "csrc_dipu/vendor/cuda/basecuda.hpp"

namespace dipu {

// ----------------------------------------------------------------------------
// Code from pytorch2.1.1 c10/cuda/CUDACachingAllocator.cpp
// ----------------------------------------------------------------------------

class CUDAExpandableSegment : public ExpandableSegment {
 public:
  CUDAExpandableSegment(int device, deviceStream_t stream, size_t size,
                        std::vector<int> peers)
      : device_(device),
        stream_(stream),
        // 2MB for small pool, 20MB for large pool
        segment_size_(size),
        peers_(std::move(peers)) {
    devapis::DIPUDeviceProperties prop = devproxy::getDeviceProperties(device_);
    // we allocate enough address space for 1 1/8 the total memory on the GPU.
    // This allows for some cases where we have to unmap pages earlier in the
    // segment to put them at the end.
    max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
    DIPU_DRIVER_CHECK(cuMemAddressReserve(&ptr_, segment_size_ * max_handles_,
                                          0ULL, 0, 0ULL));
  }
  // begin must be aligned to segment_size_.
  // returns the actual range mapped, which may be
  // greater than requested if size is not aligned to segment_size_.
  // return size of 0 indicates OOM
  SegmentRange map(SegmentRange range) override {
    auto begin = segmentLeft(range.ptr);
    auto end = segmentRight(range.ptr + range.size);
    TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr);
    if (begin == end) {
      return rangeFromHandles(begin, end);
    }
    while (end > handles_.size()) {
      handles_.emplace_back(c10::nullopt);
    }
    for (auto i : c10::irange(begin, end)) {
      TORCH_INTERNAL_ASSERT(!handles_.at(i));
      CUmemGenericAllocationHandle handle = 0;
      CUmemAllocationProp prop = {};
      prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
      prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      prop.location.id = device_;
      auto status = cuMemCreate(&handle, segment_size_, &prop, 0);
      if (status == CUDA_ERROR_OUT_OF_MEMORY) {
        for (auto j : c10::irange(begin, i)) {
          auto h = handles_.at(j).value();
          handles_.at(j) = c10::nullopt;
          DIPU_DRIVER_CHECK(cuMemRelease(h));
        }
        trimHandles();
        return rangeFromHandles(begin, begin);
      }
      DIPU_DRIVER_CHECK(status);
      handles_.at(i) = handle;
    }

    for (auto i : c10::irange(begin, end)) {
      DIPU_DRIVER_CHECK(cuMemMap(ptr_ + i * segment_size_, segment_size_, 0,
                                 handles_.at(i).value(), 0ULL));
    }
    setAccess(device_, begin, end);
    for (auto p : peers_) {
      setAccess(p, begin, end);
    }
    return rangeFromHandles(begin, end);
  }

  // unmaps all the completely empty segment_size_ segments between
  // [begin, begin + size), returns the offset where the range begins,
  // and the actual size unmapped (multiple of segment_size_)
  SegmentRange unmap(SegmentRange range) override {
    auto begin = segmentRight(range.ptr);
    auto end = segmentLeft(range.ptr + range.size);
    if (begin >= end) {
      return SegmentRange{range.ptr, 0};
    }
    unmapHandles(begin, end);
    return rangeFromHandles(begin, end);
  }

  char* ptr() const { return (char*)ptr_; }
  size_t size() const override { return max_handles_ * segment_size_; }
  void addPeer(int device) override {
    peers_.push_back(device);
    forEachAllocatedRange(
        [&](size_t begin, size_t end) { setAccess(device, begin, end); });
  }

  ~CUDAExpandableSegment() noexcept override {
    forEachAllocatedRange(
        [&](size_t begin, size_t end) { unmapHandles(begin, end); });
    DIPU_DRIVER_CHECK(cuMemAddressFree(ptr_, segment_size_ * max_handles_));
  }

 private:
  void setAccess(int device, size_t begin, size_t end) {
    CUmemAccessDesc desc;
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = device;
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    DIPU_DRIVER_CHECK(cuMemSetAccess(ptr_ + begin * segment_size_,
                                     (end - begin) * segment_size_, &desc, 1));
  }

  void unmapHandles(size_t begin, size_t end) {
    // note: unlike cudaFree, MemUnmap and MemRelease do
    // not appear to synchronize in all cases, so we have to wait for the
    // stream to finish before this memory is truly free.

    // cannot call c10::cuda::stream_synchronize because
    // it might grab the GIL which can lead to a deadlock
    // Locking order must be GIL -> Allocator Lock
    devproxy::syncStream(stream_);
    for (auto i : c10::irange(begin, end)) {
      CUmemGenericAllocationHandle h = handles_.at(i).value();
      handles_.at(i) = c10::nullopt;
      DIPU_DRIVER_CHECK(cuMemUnmap(ptr_ + segment_size_ * i, segment_size_));
      DIPU_DRIVER_CHECK(cuMemRelease(h));
    }
    trimHandles();
  }

  void trimHandles() {
    while (!handles_.empty() && !handles_.back()) {
      handles_.pop_back();
    }
  }

  void forEachAllocatedRange(std::function<void(size_t, size_t)> fn) {
    size_t start = 0;
    for (auto i : c10::irange(handles_.size())) {
      if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) {
        start = i;
      }
      if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) {
        fn(start, i + 1);
      }
    }
  }

  size_t numSegments(size_t size) const {
    return (size + segment_size_ - 1) / segment_size_;
  }

  // index of the segment containing p (rounds down)
  size_t segmentLeft(const char* p) const {
    auto size = p - ptr();
    return size / segment_size_;
  }

  // index of the first segment whose start is at or after p (rounds up)
  size_t segmentRight(const char* p) const {
    auto size = p - ptr();
    return numSegments(size);
  }

  SegmentRange rangeFromHandles(size_t begin, size_t end) const {
    return {ptr() + segment_size_ * begin, segment_size_ * (end - begin)};
  }

  int device_;
  deviceStream_t stream_;
  CUdeviceptr ptr_{};
  size_t max_handles_;
  size_t segment_size_;
  std::vector<c10::optional<CUmemGenericAllocationHandle>> handles_;
  // devices on which this memory should be mapped in addition
  // to the device where the physical memory lives (device_).
  std::vector<int> peers_;
};

ExpandableSegment* vendorCreateExpandableSegment(int device,
                                                 deviceStream_t stream,
                                                 size_t size,
                                                 std::vector<int> peers) {
  return new CUDAExpandableSegment(device, stream, size, peers);
}

}  // namespace dipu
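For orientation, here is a hedged sketch of how allocator code might drive this class through the factory function above. It is not part of the commit: the 20 MiB segment size follows the "large pool" comment in the constructor, `stream` is assumed to be a valid deviceStream_t, and it assumes the ExpandableSegment base interface exposes ptr(), map(), unmap(), and a virtual destructor as used here.

// Hypothetical call pattern, not from the diff above.
constexpr size_t kSegmentSize = 20 << 20;  // 20 MiB, the large-pool granularity noted above
std::vector<int> peers;                    // no peer devices in this example

ExpandableSegment* seg =
    vendorCreateExpandableSegment(/*device=*/0, stream, kSegmentSize, peers);

// map() rounds the request outward to whole kSegmentSize chunks and returns the
// range actually mapped; a returned size of 0 means cuMemCreate reported
// CUDA_ERROR_OUT_OF_MEMORY and the partially created handles were released.
SegmentRange mapped = seg->map(SegmentRange{seg->ptr(), 3 * kSegmentSize});

// unmap() releases only the chunks that lie entirely inside the given range,
// synchronizing the stream first because cuMemUnmap/cuMemRelease may not.
seg->unmap(mapped);

delete seg;  // unmaps any remaining chunks and frees the reserved address range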
A second new file in this commit (7 additions, 0 deletions):
#pragma once
#include <cuda.h>
#define NVML_NO_UNVERSIONED_FUNC_DEFS
#include <nvml.h>

#include <csrc_dipu/common.h>
#include <csrc_dipu/runtime/device/diclapis.h>