From 9a5680888d4c25d22cfcbe3ce1edf6dccf5c2a0c Mon Sep 17 00:00:00 2001
From: Liu Liu
Date: Tue, 19 Nov 2024 14:25:36 -0500
Subject: [PATCH] Add API and logic to remap GPUs inside the framework.

---
 lib/nnc/ccv_nnc.h             | 17 ++++---
 lib/nnc/ccv_nnc_cmd.c         |  9 ++++
 lib/nnc/gpu/ccv_nnc_compat.cu | 92 ++++++++++++++++++++++++++++-------
 lib/nnc/gpu/ccv_nnc_compat.h  |  1 +
 test/int/nnc/cifar.tests.c    |  3 ++
 5 files changed, 96 insertions(+), 26 deletions(-)

diff --git a/lib/nnc/ccv_nnc.h b/lib/nnc/ccv_nnc.h
index a1a556473..df47989d2 100644
--- a/lib/nnc/ccv_nnc.h
+++ b/lib/nnc/ccv_nnc.h
@@ -894,6 +894,15 @@ void ccv_nnc_set_queue_watermark(int state);
  * @return How many in-flight GPU commands can have.
  */
 CCV_WARN_UNUSED(int) ccv_nnc_queue_watermark(void);
+/**
+ * Set the device mapping to use a custom order for devices rather than the driver-imposed order. This is
+ * helpful for managing code where you have no control over which GPU is used. The previous permutation is
+ * cleared on each call, and you can pass a 0-size device map to clear all custom mappings.
+ * @param type Currently, only CCV_NNC_STREAM_CONTEXT_GPU on NVIDIA systems is supported.
+ * @param device_map The array of device ids to map to, at most 64 devices.
+ * @param size The size of the array; only the first 64 entries will be used.
+ */
+void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size);
 /**
  * Quantize a given memory region of a given datatype / memory resides, into nbits palette.
  * @param input The input memory region, it can be CCV_64F, CCV_32F or CCV_16F.
@@ -1073,14 +1082,6 @@ void ccv_nnc_stream_signal_free(ccv_nnc_stream_signal_t* const signal);
  * @return The number of devices.
  */
 CCV_WARN_UNUSED(int) ccv_nnc_device_count(const int type);
-/**
- * Remap a source device as the destination device.
- * @param type The type of devices (CCV_NNC_STREAM_CONTEXT_GPU / CCV_NNC_STREAM_CONTEXT_CPU)
- * @param source The original device id.
- * @param destination The new device id.
- * @return 0 if the device remap is successful, -1 if it is not.
- */
-CCV_WARN_UNUSED(int) ccv_nnc_device_remap(const int type, const int source, const int destination);
 /**
  * The neighbor discovery function that will be called with the device id.
  */
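For concreteness, here is a minimal sketch of the intended call pattern for the API declared above. It is an illustration, not part of the patch: the 4-GPU reversed order is arbitrary, and the include path assumes the usual nnc header layout.

#include <nnc/ccv_nnc.h>

int main(void)
{
	ccv_nnc_init();
	// Present the GPUs in reverse of the driver-imposed order: logical
	// device 0 becomes physical device 3, and so on.
	const int device_map[4] = {3, 2, 1, 0};
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4);
	// ... allocate tensors, build graphs, run models as usual ...
	// A 0-size map clears the custom mapping again.
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
	return 0;
}

Since the permutation is consulted at every cudaSetDevice / cudaGetDevice boundary (see ccv_nnc_compat.cu below), it should be installed before any streams, tensors, or graphs touch the GPUs.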
diff --git a/lib/nnc/ccv_nnc_cmd.c b/lib/nnc/ccv_nnc_cmd.c
index d7708e448..3320255f9 100644
--- a/lib/nnc/ccv_nnc_cmd.c
+++ b/lib/nnc/ccv_nnc_cmd.c
@@ -730,3 +730,12 @@ void ccv_nnc_set_queue_watermark(int watermark)
 	ccv_nnc_mps_set_queue_watermark(watermark);
 #endif
 }
+
+void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size)
+{
+	if (type != CCV_STREAM_CONTEXT_GPU)
+		return;
+#ifdef HAVE_CUDA
+	cusetdevicemap(device_map, size);
+#endif
+}
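This entry point simply forwards to the CUDA-side cusetdevicemap. The mapping state amounts to two 64-entry tables, a forward map (logical to physical) and a reverse map (physical to logical), with -1 meaning identity. Below is a self-contained sketch of that table logic, under hypothetical names, mirroring what cusetdevicemap, cudevicemap, and cudevicereversemap do in the next file:

#include <stdio.h>

#define MAX_DEVICES 64

static int forward_map[MAX_DEVICES];
static int reverse_map[MAX_DEVICES];

static void set_map(const int* map, int size)
{
	int i;
	for (i = 0; i < MAX_DEVICES; i++)
		forward_map[i] = reverse_map[i] = -1; // -1 means identity
	for (i = 0; i < size && i < MAX_DEVICES; i++)
	{
		forward_map[i] = map[i];
		reverse_map[map[i]] = i;
	}
}

static int map_device(const int* map, int id)
{
	// Out-of-range or unmapped ids fall through unchanged.
	if (id < 0 || id >= MAX_DEVICES || map[id] < 0)
		return id;
	return map[id];
}

int main(void)
{
	const int perm[4] = {3, 2, 1, 0};
	set_map(perm, 4);
	printf("logical 0 -> physical %d\n", map_device(forward_map, 0)); // 3
	printf("physical 3 -> logical %d\n", map_device(reverse_map, 3)); // 0
	printf("unmapped 7 -> %d\n", map_device(forward_map, 7)); // 7
	return 0;
}

The fall-through behavior is what lets the permutation be partial: ids beyond the supplied map size keep the driver order.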
diff --git a/lib/nnc/gpu/ccv_nnc_compat.cu b/lib/nnc/gpu/ccv_nnc_compat.cu
index d864273fc..e6d8c7a29 100644
--- a/lib/nnc/gpu/ccv_nnc_compat.cu
+++ b/lib/nnc/gpu/ccv_nnc_compat.cu
@@ -6,6 +6,7 @@ extern "C" {
 }
 
 static void cutrigmp(void);
+static int cudevicemap(const int device_id);
 
 #ifdef HAVE_CUDNN
 struct cudnn_free_list_s {
@@ -16,8 +17,10 @@ KHASH_MAP_INIT_INT(cudnn_free, struct cudnn_free_list_s*);
 static pthread_mutex_t g_cudnn_mutex = PTHREAD_MUTEX_INITIALIZER;
 static khash_t(cudnn_free)* g_cudnn = 0;
 
-cudnnHandle_t cudnn_get(const int type)
+static cudnnHandle_t cudnn_get(int type)
 {
+	const int device_id = cudevicemap(CCV_STREAM_GET_DEVICE_ID(type));
+	CCV_STREAM_SET_DEVICE_ID(type, device_id);
 	pthread_mutex_lock(&g_cudnn_mutex);
 	if (!g_cudnn)
 		g_cudnn = kh_init(cudnn_free);
@@ -45,8 +48,10 @@ cudnnHandle_t cudnn_get(const int type)
 	return cudnn;
 }
 
-void cudnn_save(const int type, cudnnHandle_t cudnn)
+static void cudnn_save(int type, cudnnHandle_t cudnn)
 {
+	const int device_id = cudevicemap(CCV_STREAM_GET_DEVICE_ID(type));
+	CCV_STREAM_SET_DEVICE_ID(type, device_id);
 	pthread_mutex_lock(&g_cudnn_mutex);
 	int ret;
 	khiter_t i = kh_put(cudnn_free, g_cudnn, type, &ret);
@@ -57,7 +62,7 @@ void cudnn_save(const int type, cudnnHandle_t cudnn)
 	pthread_mutex_unlock(&g_cudnn_mutex);
 }
 
-void cudnn_pressure(const int device_id)
+static void cudnn_pressure(const int device_id)
 {
 	pthread_mutex_lock(&g_cudnn_mutex);
 	if (g_cudnn)
@@ -160,6 +165,54 @@ void cuunregmp(const int slot)
 	pthread_mutex_unlock(&g_mp_mutex);
 }
 
+static int cuda_device_map[64] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+static int cuda_device_reverse_map[64] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+void cusetdevicemap(const int* const device_map, const int size)
+{
+	int i;
+	for (i = 0; i < sizeof(cuda_device_reverse_map) / sizeof(cuda_device_reverse_map[0]); i++)
+		cuda_device_reverse_map[i] = -1;
+	for (i = size; i < sizeof(cuda_device_map) / sizeof(cuda_device_map[0]); i++)
+		cuda_device_map[i] = -1;
+	for (i = 0; i < ccv_min(sizeof(cuda_device_map) / sizeof(cuda_device_map[0]), size); i++)
+	{
+		cuda_device_map[i] = device_map[i];
+		cuda_device_reverse_map[device_map[i]] = i;
+	}
+}
+
+static int cudevicemap(const int device_id)
+{
+	if (device_id >= sizeof(cuda_device_map) / sizeof(cuda_device_map[0]))
+		return device_id;
+	const int new_device_id = cuda_device_map[device_id];
+	if (new_device_id < 0)
+		return device_id;
+	return new_device_id;
+}
+
+static int cudevicereversemap(const int device_id)
+{
+	if (device_id >= sizeof(cuda_device_reverse_map) / sizeof(cuda_device_reverse_map[0]))
+		return device_id;
+	const int new_device_id = cuda_device_reverse_map[device_id];
+	if (new_device_id < 0)
+		return device_id;
+	return new_device_id;
+}
+
 static void cutrigmp(void)
 {
 	int device_id;
@@ -170,7 +223,7 @@ static void cutrigmp(void)
 		{
 			cump_t* const mp = (cump_t*)ccv_array_get(g_mp_h, i);
 			if (mp->device_id == device_id && mp->func)
-				mp->func(device_id, mp->ctx);
+				mp->func(cudevicereversemap(device_id), mp->ctx);
 		}
 	pthread_mutex_unlock(&g_mp_mutex);
 	// Set back the device id.
@@ -183,7 +236,7 @@ static void cutrigmp(void)
 void* cumalloc(int device, size_t size)
 {
 	void* ptr = 0;
-	CUDA_ENFORCE(cudaSetDevice(device));
+	CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 	cudaMalloc(&ptr, size);
 	if (ptr == 0)
 	{
@@ -195,14 +248,14 @@ void* cumalloc(int device, size_t size)
 
 void cufree(int device, void* ptr)
 {
-	CUDA_ENFORCE(cudaSetDevice(device));
+	CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 	CUDA_ENFORCE(cudaFree(ptr));
 }
 
 void cudevice(int device)
 {
 	if (device >= 0)
-		CUDA_ENFORCE(cudaSetDevice(device));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 }
 
 void cumemcpy(void* dest, const int dest_type, const void* src, const int src_type, size_t n)
@@ -211,18 +264,18 @@ void cumemcpy(void* dest, const int dest_type, const void* src, const int src_ty
 		return;
 	if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_CPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_GPU_MEMORY) {
 		const int device_b = CCV_TENSOR_GET_DEVICE_ID(dest_type);
-		CUDA_ENFORCE(cudaSetDevice(device_b));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_b)));
 		CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice));
 	} else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_GPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_CPU_MEMORY) {
 		const int device_a = CCV_TENSOR_GET_DEVICE_ID(src_type);
-		CUDA_ENFORCE(cudaSetDevice(device_a));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_a)));
 		CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost));
 	} else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_CPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_CPU_MEMORY)
 		CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyHostToHost));
 	else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_GPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_GPU_MEMORY) {
 		const int device_a = CCV_TENSOR_GET_DEVICE_ID(src_type);
 		const int device_b = CCV_TENSOR_GET_DEVICE_ID(dest_type);
-		CUDA_ENFORCE(cudaSetDevice(device_b));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_b)));
 		if (device_a == device_b)
 			CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice));
 		else
@@ -334,7 +387,7 @@ static ccv_nnc_stream_context_device_local_t* _ccv_nnc_stream_compat_device_loca
 		}
 		return stream_compat->_heap_gpus + device_id;
 	} else {
-		CUDA_ENFORCE(cudaSetDevice(device_id));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_id)));
 		return &stream_compat->_inline_gpu;
 	}
 }
@@ -359,7 +412,7 @@ ccv_nnc_stream_signal_t* ccv_nnc_init_stream_signal(ccv_nnc_stream_signal_t* con
 	assert(CCV_STREAM_GET_CONTEXT(((int*)signal)[0]) == CCV_STREAM_CONTEXT_GPU);
 	ccv_nnc_stream_compat_signal_t* compat_signal = (ccv_nnc_stream_compat_signal_t*)ccrealloc(signal, sizeof(ccv_nnc_stream_compat_signal_t));
 	const int device = CCV_STREAM_GET_DEVICE_ID(compat_signal->super.type);
-	CUDA_ENFORCE(cudaSetDevice(device));
+	CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 	CUDA_ENFORCE(cudaEventCreateWithFlags(&compat_signal->event, cudaEventDisableTiming));
 	return (ccv_nnc_stream_signal_t*)compat_signal;
 }
@@ -388,7 +441,7 @@ void ccv_nnc_deinit_stream_signal(ccv_nnc_stream_signal_t* const signal)
 {
 	ccv_nnc_stream_compat_signal_t* compat_signal = (ccv_nnc_stream_compat_signal_t*)signal;
 	const int device = CCV_STREAM_GET_DEVICE_ID(compat_signal->super.type);
-	CUDA_ENFORCE(cudaSetDevice(device));
+	CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 	CUDA_ENFORCE(cudaEventDestroy(compat_signal->event));
 }
 
@@ -458,6 +511,7 @@ void* ccv_nnc_stream_compat_get_workspace(const ccv_nnc_stream_context_t* const
 		return device_local->workspace;
 	int device_id;
 	CUDA_ENFORCE(cudaGetDevice(&device_id));
+	device_id = cudevicereversemap(device_id);
 	device_local->workspace_size = workspace_size;
 	if (device_local->workspace)
 		CUDA_ENFORCE(cudaFree(device_local->workspace));
@@ -494,7 +548,7 @@ void ccv_nnc_stream_compat_drain(ccv_nnc_stream_context_t* const stream_context)
 			stream_compat->_heap_gpus[i].workspace_size = 0;
 		}
 	} else if (stream_compat->_inline_gpu.workspace) {
-		CUDA_ENFORCE(cudaSetDevice(device));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 		CUDA_ENFORCE(cudaFree(stream_compat->_inline_gpu.workspace));
 		stream_compat->_inline_gpu.workspace = 0;
 		stream_compat->_inline_gpu.workspace_size = 0;
@@ -613,7 +667,7 @@ void ccv_nnc_deinit_stream_context(ccv_nnc_stream_context_t* const stream_contex
 			cuunregmp(stream_compat->_heap_gpus[i].mp_hook - 1);
 		}
 	} else {
-		CUDA_ENFORCE(cudaSetDevice(device));
+		CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
 		if (stream_compat->_inline_gpu.workspace)
 		{
 			CUDA_ENFORCE(cudaFree(stream_compat->_inline_gpu.workspace));
@@ -657,6 +711,7 @@ int ccv_nnc_stream_context_get_device(const ccv_nnc_stream_context_t* const stre
 	{
 		int device = 0;
 		CUDA_ENFORCE(cudaGetDevice(&device));
+		device = cudevicereversemap(device);
 		return device;
 	}
 	const ccv_nnc_stream_context_compat_t* stream_compat = (const ccv_nnc_stream_context_compat_t*)stream_context;
@@ -838,6 +893,7 @@ cudnnHandle_t ccv_nnc_stream_context_get_cudnn(const ccv_nnc_stream_context_t* c
 		{
 			int device_id;
 			CUDA_ENFORCE(cudaGetDevice(&device_id));
+			device_id = cudevicereversemap(device_id);
 			device_local->mp_hook = curegmp(device_id, _ccv_nnc_device_local_drain, device_local) + 1;
 		}
 	}
@@ -1440,14 +1496,14 @@ ncclComm_t ccv_nnc_nccl_get_comm(ccv_nnc_stream_context_t* const stream, const i
 			stream_compat->super.resource_container[0] = (ccv_nnc_stream_resource_container_t*)cccalloc(1, sizeof(ccv_nnc_stream_resource_container_compat_t));
 		ccv_nnc_stream_resource_container_compat_t* const resource_container_compat = (ccv_nnc_stream_resource_container_compat_t*)stream_compat->super.resource_container[0];
 		if (resource_container_compat->comms && resource_container_compat->comm_count == device_count)
-			return resource_container_compat->comms[device_id];
+			return resource_container_compat->comms[cudevicemap(device_id)];
 		if (resource_container_compat->comms)
 			resource_container_compat->comms = (ncclComm_t*)ccrealloc(resource_container_compat->comms, sizeof(ncclComm_t) * device_count);
 		else
 			resource_container_compat->comms = (ncclComm_t*)ccmalloc(sizeof(ncclComm_t) * device_count);
 		_ccv_nnc_nccl_redo_comms(resource_container_compat->comms, resource_container_compat->comm_count, device_count);
 		resource_container_compat->comm_count = device_count;
-		return resource_container_compat->comms[device_id];
+		return resource_container_compat->comms[cudevicemap(device_id)];
 	} else {
 		static ncclComm_t comms[CCV_TENSOR_GET_DEVICE_ID(CCV_COMPUTE_DEVICE_ANY)];
 		static int comm_count = 0;
@@ -1456,7 +1512,7 @@ ncclComm_t ccv_nnc_nccl_get_comm(ccv_nnc_stream_context_t* const stream, const i
 		_ccv_nnc_nccl_redo_comms(comms, comm_count, device_count);
 		comm_count = device_count;
 	}
-	return comms[device_id];
+	return comms[cudevicemap(device_id)];
 }
 }
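The recurring pattern in the hunks above: a logical device id is pushed through cudevicemap immediately before every cudaSetDevice, and every id read back from cudaGetDevice is pulled through cudevicereversemap, so code outside this file (memory-pressure callbacks, workspace bookkeeping, NCCL comm lookup) only ever observes logical ids. A minimal round-trip sketch of that invariant follows; the helper names are stand-ins, since the real functions are static to this file:

#include <cuda_runtime.h>
#include <stdio.h>

static int forward_map[64], reverse_map[64];

static int to_physical(const int id) // stand-in for cudevicemap
{
	return (id >= 0 && id < 64 && forward_map[id] >= 0) ? forward_map[id] : id;
}

static int to_logical(const int id) // stand-in for cudevicereversemap
{
	return (id >= 0 && id < 64 && reverse_map[id] >= 0) ? reverse_map[id] : id;
}

int main(void)
{
	int i;
	for (i = 0; i < 64; i++)
		forward_map[i] = reverse_map[i] = -1;
	// Swap the first two devices: logical 0 <-> physical 1.
	forward_map[0] = 1, reverse_map[1] = 0;
	forward_map[1] = 0, reverse_map[0] = 1;
	if (cudaSetDevice(to_physical(0)) != cudaSuccess) // needs at least 2 GPUs
		return 1;
	int physical;
	cudaGetDevice(&physical);
	// The round trip restores the logical id the caller started with.
	printf("physical %d, logical %d\n", physical, to_logical(physical));
	return 0;
}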
diff --git a/lib/nnc/gpu/ccv_nnc_compat.h b/lib/nnc/gpu/ccv_nnc_compat.h
index 9639c5a7e..ecf6b94a3 100644
--- a/lib/nnc/gpu/ccv_nnc_compat.h
+++ b/lib/nnc/gpu/ccv_nnc_compat.h
@@ -33,6 +33,7 @@ typedef void(*cump_f)(int device_id, void* const context);
 int curegmp(int device_id, cump_f func, void* const context); // register memory pressure handler
 void cuunregmp(const int id); // un-register memory pressure handler.
 void cusetprofiler(int state);
+void cusetdevicemap(const int* const device_map, const int size); // set custom logical-to-physical device id permutation
 
 // Stream context
 CCV_WARN_UNUSED(ccv_nnc_stream_context_t*) ccv_nnc_init_stream_context(ccv_nnc_stream_context_t* const stream_context);
diff --git a/test/int/nnc/cifar.tests.c b/test/int/nnc/cifar.tests.c
index 39106c89d..b86215aeb 100644
--- a/test/int/nnc/cifar.tests.c
+++ b/test/int/nnc/cifar.tests.c
@@ -82,6 +82,8 @@ static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax)
 static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
 {
 	ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
+	int device_map[4] = {3, 2, 1, 0};
+	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4);
 	const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 	if (device_count < 1)
 		return -1;
@@ -226,6 +228,7 @@ static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set
 	ccv_nnc_stream_context_free(stream_contexts[1]);
 	for (i = 0; i < device_count; i++)
 		ccv_nnc_tensor_free(cpu_outputs[i]);
+	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
 	return correct;
 }
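Beyond the CIFAR test above, here is a hedged sketch of a more direct sanity check one could write against the new API; the 2-GPU assumption, the include path, and the specific stream flags are mine, not part of the patch:

#include <nnc/ccv_nnc.h>
#include <assert.h>

int main(void)
{
	ccv_nnc_init();
	const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
	if (device_count < 2)
		return 0; // nothing to permute
	int device_map[2] = {1, 0};
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 2);
	// A stream created for logical device 0 now issues work on physical
	// device 1, but still reports itself as logical device 0.
	ccv_nnc_stream_context_t* const stream = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_000);
	assert(ccv_nnc_stream_context_get_device(stream) == 0);
	ccv_nnc_stream_context_free(stream);
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
	return 0;
}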