Skip to content

Commit

Permalink
feat(DIPU): add ExpandableSegment for vendor CUDA (#922)
Browse files Browse the repository at this point in the history
* feat(DIPU): add ExpandableSegment for vendor CUDA

* clang-format

* adapt AscendExpandableSegment to the peers parameter

* delete addPeerAccess and add a comment marking the copied code

* change file name
  • Loading branch information
Gong-air authored Aug 8, 2024
1 parent c24c3a3 commit f059b4a
Show file tree
Hide file tree
Showing 6 changed files with 237 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ class DeviceCachingAllocator {

// all live expandable segments
std::vector<ExpandableSegment*> expandable_segments_;
std::vector<int> devices_with_peer_access_;

bool set_fraction = false;

Expand Down Expand Up @@ -1112,8 +1113,8 @@ class DeviceCachingAllocator {
}
}
auto segment_size = pool->is_small ? kSmallBuffer : kLargeBuffer;
expandable_segments_.emplace_back(
createExpandableSegment(device, stream, segment_size));
expandable_segments_.emplace_back(createExpandableSegment(
device, stream, segment_size, devices_with_peer_access_));

ExpandableSegment* es = expandable_segments_.back();
Block* candidate = new Block(device, stream, es->size(), pool, es->ptr());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cstddef>
#include <cstdint>
#include <vector>

#include <c10/util/Exception.h>

Expand Down Expand Up @@ -115,18 +116,21 @@ class ExpandableSegment {
virtual ~ExpandableSegment() = default;
virtual SegmentRange map(SegmentRange range) = 0;
virtual SegmentRange unmap(SegmentRange range) = 0;
virtual void addPeer(int) = 0;
virtual char* ptr() const = 0;
virtual size_t size() const = 0;
};

DIPU_WEAK ExpandableSegment* vendorCreateExpandableSegment(
int device, deviceStream_t stream, size_t size);
int device, deviceStream_t stream, size_t size, std::vector<int> peers);

inline ExpandableSegment* createExpandableSegment(int device,
deviceStream_t stream,
size_t size) {
size_t size,
std::vector<int> peers) {
if (vendorCreateExpandableSegment) {
return vendorCreateExpandableSegment(device, stream, size);
return vendorCreateExpandableSegment(device, stream, size,
std::move(peers));
}
TORCH_CHECK(false, "not support expandable segment");
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Copyright (c) 2024, DeepLink.

#include <vector>

#include "csrc_dipu/runtime/core/allocator/ExpandableSegment.h"
#include "csrc_dipu/runtime/devproxy/deviceproxy.h"
#include "csrc_dipu/vendor/ascend/basecommimpl.hpp"
Expand All @@ -8,7 +10,8 @@ namespace dipu {

class AscendExpandableSegment : public ExpandableSegment {
public:
AscendExpandableSegment(int device, deviceStream_t stream, size_t size)
AscendExpandableSegment(int device, deviceStream_t stream, size_t size,
std::vector<int> peers)
: device_(device),
stream_(stream),
// 2MB for small pool, 20MB for large pool
Expand Down Expand Up @@ -81,6 +84,7 @@ class AscendExpandableSegment : public ExpandableSegment {

char* ptr() const override { return static_cast<char*>(ptr_); }
size_t size() const override { return max_handles_ * segment_size_; }
void addPeer(int device) override {}

public:
~AscendExpandableSegment() noexcept override {
Expand Down Expand Up @@ -154,8 +158,9 @@ class AscendExpandableSegment : public ExpandableSegment {

ExpandableSegment* vendorCreateExpandableSegment(int device,
deviceStream_t stream,
size_t size) {
return new AscendExpandableSegment(device, stream, size);
size_t size,
std::vector<int> peers) {
return new AscendExpandableSegment(device, stream, size, std::move(peers));
}

} // namespace dipu
189 changes: 189 additions & 0 deletions dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDAExpandableSegment.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright (c) 2024, DeepLink.
#include <iostream>
#include <utility>
#include <vector>

#include "csrc_dipu/runtime/core/allocator/ExpandableSegment.h"
#include "csrc_dipu/runtime/devproxy/deviceproxy.h"
#include "csrc_dipu/vendor/cuda/basecuda.hpp"

namespace dipu {

// ----------------------------------------------------------------------------
// Code from pytorch2.1.1 c10/cuda/CUDACachingAllocator.cpp
// ----------------------------------------------------------------------------

class CUDAExpandableSegment : public ExpandableSegment {
public:
CUDAExpandableSegment(int device, deviceStream_t stream, size_t size,
std::vector<int> peers)
: device_(device),
stream_(stream),
// 2MB for small pool, 20MB for large pool
segment_size_(size),
peers_(std::move(peers)) {
devapis::DIPUDeviceProperties prop = devproxy::getDeviceProperties(device_);
// we allocate enough address space for 1 1/8 the total memory on the GPU.
// This allows for some cases where we have to unmap pages earlier in the
// segment to put them at the end.
max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
DIPU_DRIVER_CHECK(cuMemAddressReserve(&ptr_, segment_size_ * max_handles_,
0ULL, 0, 0ULL));
}
// begin must be aligned to segment_size_.
// returns the actual range mapped, which may be
// greater than requested if size is not aligned to segment_size_.
// return size of 0 indicates OOM
SegmentRange map(SegmentRange range) override {
auto begin = segmentLeft(range.ptr);
auto end = segmentRight(range.ptr + range.size);
TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr);
if (begin == end) {
return rangeFromHandles(begin, end);
}
while (end > handles_.size()) {
handles_.emplace_back(c10::nullopt);
}
for (auto i : c10::irange(begin, end)) {
TORCH_INTERNAL_ASSERT(!handles_.at(i));
CUmemGenericAllocationHandle handle = 0;
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = device_;
auto status = cuMemCreate(&handle, segment_size_, &prop, 0);
if (status == CUDA_ERROR_OUT_OF_MEMORY) {
for (auto j : c10::irange(begin, i)) {
auto h = handles_.at(j).value();
handles_.at(j) = c10::nullopt;
DIPU_DRIVER_CHECK(cuMemRelease(h));
}
trimHandles();
return rangeFromHandles(begin, begin);
}
DIPU_DRIVER_CHECK(status);
handles_.at(i) = handle;
}

for (auto i : c10::irange(begin, end)) {
DIPU_DRIVER_CHECK(cuMemMap(ptr_ + i * segment_size_, segment_size_, 0,
handles_.at(i).value(), 0ULL));
}
setAccess(device_, begin, end);
for (auto p : peers_) {
setAccess(p, begin, end);
}
return rangeFromHandles(begin, end);
}

// unmaps all the completely empty segment_size_ segments between
// [begin, begin + size), returns the offset where the range begin,
// and the actual size unmapped (multiple of segment_size_)
SegmentRange unmap(SegmentRange range) override {
auto begin = segmentRight(range.ptr);
auto end = segmentLeft(range.ptr + range.size);
if (begin >= end) {
return SegmentRange{range.ptr, 0};
}
unmapHandles(begin, end);
return rangeFromHandles(begin, end);
}

char* ptr() const { return (char*)ptr_; }
size_t size() const override { return max_handles_ * segment_size_; }
void addPeer(int device) override {
peers_.push_back(device);
forEachAllocatedRange(
[&](size_t begin, size_t end) { setAccess(device, begin, end); });
}

public:
~CUDAExpandableSegment() noexcept override {
forEachAllocatedRange(
[&](size_t begin, size_t end) { unmapHandles(begin, end); });
DIPU_DRIVER_CHECK(cuMemAddressFree(ptr_, segment_size_ * max_handles_));
}

private:
void setAccess(int device, size_t begin, size_t end) {
CUmemAccessDesc desc;
desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
desc.location.id = device;
desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
DIPU_DRIVER_CHECK(cuMemSetAccess(ptr_ + begin * segment_size_,
(end - begin) * segment_size_, &desc, 1));
}

void unmapHandles(size_t begin, size_t end) {
// note: unlike cudaFree, MemUnmap and MemRelease do
// not appear to synchronize in all cases, so we have to wait for the
// stream to finish before this memory is truly free.

// cannot call c10::cuda::stream_synchronize because
// it might grab the GIL which can lead to a deadlock
// Locking order must be GIL -> Allocator Lock
devproxy::syncStream(stream_);
for (auto i : c10::irange(begin, end)) {
// aclrtDrvMemHandle h = handles_.at(i).value();
CUmemGenericAllocationHandle h = handles_.at(i).value();
handles_.at(i) = c10::nullopt;
DIPU_DRIVER_CHECK(cuMemUnmap(ptr_ + segment_size_ * i, segment_size_));
DIPU_DRIVER_CHECK(cuMemRelease(h));
}
trimHandles();
}

void trimHandles() {
while (!handles_.empty() && !handles_.back()) {
handles_.pop_back();
}
}

void forEachAllocatedRange(std::function<void(size_t, size_t)> fn) {
size_t start = 0;
for (auto i : c10::irange(handles_.size())) {
if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) {
start = i;
}
if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) {
fn(start, i + 1);
}
}
}

size_t numSegments(size_t size) const {
return (size + segment_size_ - 1) / segment_size_;
}

size_t segmentLeft(const char* p) const {
auto size = p - ptr();
return size / segment_size_;
}

size_t segmentRight(const char* p) const {
auto size = p - ptr();
return numSegments(size);
}

SegmentRange rangeFromHandles(size_t begin, size_t end) const {
return {ptr() + segment_size_ * begin, segment_size_ * (end - begin)};
}

int device_;
deviceStream_t stream_;
CUdeviceptr ptr_{};
size_t max_handles_;
size_t segment_size_;
std::vector<c10::optional<CUmemGenericAllocationHandle>> handles_;
// devices on which this memory should be mapped in addition
// to the device where the physical memory lives (device_).
std::vector<int> peers_;
};

// Vendor hook resolved weakly (DIPU_WEAK) by createExpandableSegment();
// the caller takes ownership of the returned object.
ExpandableSegment* vendorCreateExpandableSegment(int device,
                                                 deviceStream_t stream,
                                                 size_t size,
                                                 std::vector<int> peers) {
  // std::move avoids copying the peer list into the constructor
  // (matches the Ascend implementation of this hook).
  return new CUDAExpandableSegment(device, stream, size, std::move(peers));
}

} // namespace dipu
7 changes: 7 additions & 0 deletions dipu/torch_dipu/csrc_dipu/vendor/cuda/basecuda.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once
#include <cuda.h>
#define NVML_NO_UNVERSIONED_FUNC_DEFS
#include <nvml.h>

#include <csrc_dipu/common.h>
#include <csrc_dipu/runtime/device/diclapis.h>
23 changes: 23 additions & 0 deletions dipu/torch_dipu/csrc_dipu/vendor/cuda/vendorapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,29 @@

namespace dipu {

// ----------------------------------------------------------------------------
// Code from pytorch2.1.1 c10/cuda/driver_api.h begin
// ----------------------------------------------------------------------------

// Checks a CUDA driver-API call: if EXPR does not return CUDA_SUCCESS,
// fetches the driver's error string (falling back to "unknown error" if
// that lookup itself fails) and raises via AT_ERROR.
#define DIPU_DRIVER_CHECK(EXPR) \
do { \
CUresult __err = EXPR; \
if (__err != ::CUDA_SUCCESS) { \
const char* err_str; \
CUresult get_error_str_err C10_UNUSED = \
cuGetErrorString(__err, &err_str); \
if (get_error_str_err != ::CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: unknown error"); \
} else { \
AT_ERROR("CUDA driver error: ", err_str); \
} \
} \
} while (0)

// ----------------------------------------------------------------------------
// Code from pytorch2.1.1 c10/cuda/driver_api.h end
// ----------------------------------------------------------------------------

#define DIPU_CALLCUDA(Expr) \
{ \
cudaError_t ret = Expr; \
Expand Down

0 comments on commit f059b4a

Please sign in to comment.