Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

try fix tgs agitate #751

Merged
merged 23 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

namespace dipu {

constexpr size_t kMaxAsyncResourcePoolLength = 3;

template <class T>
class AsyncResourcePool {
public:
Expand All @@ -31,7 +29,11 @@ class AsyncResourcePoolImpl : public AsyncResourcePool<T> {
public:
// Stores `t` in the pool together with the events attached to it.
//
// An entry that arrives with no events is placed at the front of the list;
// an entry that still carries events is appended at the back.
// NOTE(review): presumably this lets event-free entries be handed out first;
// confirm against get()/ready().
//
// `events` is moved from, so the caller's deque is left in a moved-from state.
void add(const T& t, std::deque<DIPUEvent>& events) override {
  std::lock_guard<mutex_t> lk(mutex_);
  // The emptiness check has to happen before `events` is moved, and the entry
  // must be inserted exactly once.
  if (events.empty()) {
    list_.emplace_front(t, std::move(events));
  } else {
    list_.emplace_back(t, std::move(events));
  }
}

T get() override {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@
#include <utility>
#include <vector>

#include "csrc_dipu/utils/env.hpp"

#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"

namespace dipu {

// Maximum extend size in bytes. The value of the DIPU_MAX_EXTEND_SIZE
// environment variable (default 1024) is shifted left by 20 bits, i.e. it is
// multiplied by 2^20.
//
// The default is passed as size_t so that both the parsed value and the shift
// are evaluated in size_t. With a plain `1024` the template deduces `int`, and
// any setting of 2048 or more would overflow the signed shift.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const size_t kMaxExtendSize =
    get_env_or_default("DIPU_MAX_EXTEND_SIZE", static_cast<size_t>(1024))
    << 20U;

class BFCachingAllocatorImpl {
public:
using allocate_fn_t = std::function<void*(size_t)>;
Expand All @@ -29,7 +35,6 @@ class BFCachingAllocatorImpl {
static constexpr size_t kMinAllocationSize = 512;
static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB
static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB
static constexpr size_t kMaxExtendSize = 1U << 30U; // 1GB

size_t cachedBytes = 0;
size_t allocatedBytes = 0;
Expand Down Expand Up @@ -441,6 +446,33 @@ class BFCachingAllocator : public CacheAllocator {
}
}

// Releases entries from the async memory pool for a bounded amount of time.
//
// resource_pool_mutex_ is held for the whole call. Each entry obtained from
// the pool is passed to impl->releaseRaw(). While the pool is non-empty but
// reports that it is not ready, the thread yields and retries until 32
// microseconds have passed since the call started.
//
// Returns true once the pool is empty, false if the time budget ran out while
// the pool was still not ready.
bool try_empty_resource_pool() const {
  using namespace std::chrono_literals;
  std::lock_guard<mutex_t> guard(resource_pool_mutex_);
  // One deadline for the whole call, not one per entry.
  const auto deadline = std::chrono::steady_clock::now() + 32us;
  for (;;) {
    if (async_mem_pool()->empty()) {
      return true;
    }
    if (async_mem_pool()->ready()) {
      const auto entry = async_mem_pool()->get();
      void* raw = std::get<0>(entry);
      const int block_id = static_cast<int>(std::get<1>(entry));
      DIPU_DEBUG_ALLOCATOR(
          8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << raw
                                    << " ,id:" << block_id
                                    << " ,allocator:" << this
                                    << ", device:" << device());
      impl->releaseRaw(raw, block_id);
      continue;
    }
    // Not ready yet: give up once the budget is spent, otherwise yield.
    if (std::chrono::steady_clock::now() >= deadline) {
      return false;
    }
    std::this_thread::yield();
  }
}

void check_impl() const {
if (impl) {
return;
Expand Down Expand Up @@ -507,7 +539,7 @@ class BFCachingAllocator : public CacheAllocator {
c10::DataPtr allocate(size_t size) const override {
restore();
if (async_mem_pool()->size() > kMaxAsyncResourcePoolLength) {
empty_resource_pool();
try_empty_resource_pool();
}
size = getMemoryAlignmentStrategy()->roundBytes(size);
std::tuple<void*, int, size_t> block = impl->allocateRaw(size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,18 @@

#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/runtime/devproxy/deviceproxy.h"
#include "csrc_dipu/utils/env.hpp"

namespace dipu {

// Out-of-class definition of the static member DIPURawDeviceAllocator::mutex_.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
std::mutex DIPURawDeviceAllocator::mutex_;

// Pool-length limit used when DIPU_MAX_ASYNC_RESOURCE_POOL_LENGTH is not set.
constexpr size_t kDefaultMaxAsyncResourcePoolLength = 64;
// Effective pool-length limit, initialized once at namespace scope from the
// DIPU_MAX_ASYNC_RESOURCE_POOL_LENGTH environment variable.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const size_t kMaxAsyncResourcePoolLength = get_env_or_default(
"DIPU_MAX_ASYNC_RESOURCE_POOL_LENGTH", kDefaultMaxAsyncResourcePoolLength);

namespace {

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace dipu {

constexpr size_t kDefaultMermoryAlignment = 512;

extern const size_t kMaxAsyncResourcePoolLength;

class MemoryAlignmentStrategy {
size_t kBytesAlign = kDefaultMermoryAlignment;
size_t alpha = 1; // reserved
Expand Down
18 changes: 18 additions & 0 deletions dipu/torch_dipu/csrc_dipu/utils/env.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) 2024, DeepLink.
#pragma once
#include <cstdlib>
#include <sstream>

namespace dipu {

/// Reads an environment variable and parses it as a value of type T.
///
/// @param env_name       Name of the environment variable to look up.
/// @param default_value  Value returned when the variable is not set or its
///                       text cannot be parsed as a T.
/// @return The parsed value, or `default_value` as described above.
///
/// Parsing uses stream extraction, so leading whitespace is skipped and any
/// characters after a successfully parsed value are ignored.
template <typename T>
T get_env_or_default(const char* env_name, const T& default_value) {
  const char* env = std::getenv(env_name);
  if (env == nullptr) {
    return default_value;
  }
  T value = default_value;
  std::istringstream stream(env);
  // A failed extraction may overwrite `value` (arithmetic types are set to 0),
  // so fall back to the default explicitly instead of returning `value`.
  if (!(stream >> value)) {
    return default_value;
  }
  return value;
}

} // namespace dipu