From 64479c4a8b2e00dd1865766795528ea25b21880c Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Wed, 18 Sep 2024 12:42:14 +0200
Subject: [PATCH] fixup! Add polynomial/div_by_x_minus_z.cuh.

---
 polynomial/div_by_x_minus_z.cuh | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/polynomial/div_by_x_minus_z.cuh b/polynomial/div_by_x_minus_z.cuh
index b2a6b02..99e155d 100644
--- a/polynomial/div_by_x_minus_z.cuh
+++ b/polynomial/div_by_x_minus_z.cuh
@@ -9,7 +9,7 @@
 #include
 #include
 
-template __global__
+template __global__ __launch_bounds__(BSZ)
 void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
 {
     struct my {
@@ -335,11 +335,16 @@ template
 void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
                       const stream_t& s)
 {
-    cudaFuncAttributes attr;
-    CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z));
+    constexpr int BSZ = sizeof(fr_t) <= 16 ? 1024 : 0;
 
     int gridDim = s.sm_count();
-    int blockDim = attr.maxThreadsPerBlock;
+    int blockDim = BSZ;
+
+    if (BSZ == 0) {
+        cudaFuncAttributes attr;
+        CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z));
+        blockDim = attr.maxThreadsPerBlock;
+    }
 
     if (gridDim > blockDim) // there are no such large GPUs, not for now...
         gridDim = blockDim;
@@ -355,7 +360,7 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
     size_t sharedSz = sizeof(fr_t) * max(blockDim/WARP_SZ, gridDim);
     sharedSz += sizeof(fr_t) * WARP_SZ;
 
-    s.launch_coop(d_div_by_x_minus_z, {gridDim, blockDim, sharedSz},
+    s.launch_coop(d_div_by_x_minus_z, {gridDim, blockDim, sharedSz},
                   d_inout, len, z);
 }
 #endif