From 41439f389a741ce176558bb61ca0b1db8e83fbff Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 12 Sep 2024 12:17:02 +0200 Subject: [PATCH] util/gpu_t.cuh: fix launch_coop(f, {0, 0},...). As it turns out cudaOccupancyMaxPotentialBlockSize doesn't actually cater to cudaLaunchCooperativeKernel, because the suggested grid size can be twice the number of SMs, which is unsuitable for cooperative launches. --- util/gpu_t.cuh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/util/gpu_t.cuh b/util/gpu_t.cuh index 1694055..62db9e3 100644 --- a/util/gpu_t.cuh +++ b/util/gpu_t.cuh @@ -118,11 +118,10 @@ public: if (gpu_props(gpu_id).sharedMemPerBlock < shared_sz) CUDA_OK(cudaFuncSetAttribute(f, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_sz)); if (gridDim.x == 0 || blockDim.x == 0) { - int blockSize, minGridSize; - - CUDA_OK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, f)); - if (blockDim.x == 0) blockDim.x = blockSize; - if (gridDim.x == 0) gridDim.x = minGridSize; + cudaFuncAttributes attr; + CUDA_OK(cudaFuncGetAttributes(&attr, f)); + if (blockDim.x == 0) blockDim.x = attr.maxThreadsPerBlock; + if (gridDim.x == 0) gridDim.x = sm_count(); } void* va_args[sizeof...(args)] = { &args... }; CUDA_OK(cudaLaunchCooperativeKernel((const void*)f, gridDim, blockDim,