chore(gpu): refactor scalar rotate to track noise/degree

zama-ai · Feb 11, 2025 · 6544124 · 6544124
1 parent 1243aea
commit 6544124
Show file tree

Hide file tree

Showing 11 changed files with 113 additions and 102 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -291,8 +291,8 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
 
 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
+    CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks);
 
 void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
                                               uint32_t const *gpu_indexes,

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -94,44 +94,42 @@ __host__ void array_rotate_left(Torus *array_out, Torus *array_in,
 // calculation is not inplace, so `dst` and `src` must not be the same
 // one block is responsible to process single lwe ciphertext
 template <typename Torus>
-__host__ void host_radix_blocks_rotate_right(cudaStream_t const *streams,
-                                             uint32_t const *gpu_indexes,
-                                             uint32_t gpu_count,
-                                             CudaRadixCiphertextFFI *dst,
-                                             CudaRadixCiphertextFFI *src,
-                                             uint32_t rotations) {
+__host__ void host_radix_blocks_rotate_right(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
+    CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
   if (src == dst) {
     PANIC("Cuda error (blocks_rotate_right): the source and destination "
           "pointers should be different");
   }
   if (dst->lwe_dimension != src->lwe_dimension)
     PANIC("Cuda error: input and output should have the same "
           "lwe dimension")
+  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
+    PANIC("Cuda error: input and output should have more blocks than asked for "
+          "in the "
+          "function call")
 
   auto lwe_size = src->lwe_dimension + 1;
 
   cuda_set_device(gpu_indexes[0]);
-  radix_blocks_rotate_right<Torus>
-      <<<src->num_radix_blocks, 1024, 0, streams[0]>>>(
-          (Torus *)dst->ptr, (Torus *)src->ptr, rotations,
-          dst->num_radix_blocks, lwe_size);
+  radix_blocks_rotate_right<Torus><<<num_blocks, 1024, 0, streams[0]>>>(
+      (Torus *)dst->ptr, (Torus *)src->ptr, rotations, num_blocks, lwe_size);
   check_cuda_error(cudaGetLastError());
 
   // Rotate degrees and noise to follow blocks
-  array_rotate_right(dst->degrees, src->degrees, rotations,
-                     dst->num_radix_blocks);
+  array_rotate_right(dst->degrees, src->degrees, rotations, num_blocks);
   array_rotate_right(dst->noise_levels, src->noise_levels, rotations,
-                     dst->num_radix_blocks);
+                     num_blocks);
 }
 
 // rotate radix ciphertext left with specific value
 // calculation is not inplace, so `dst` and `src` must not be the same
 template <typename Torus>
-__host__ void
-host_radix_blocks_rotate_left(cudaStream_t const *streams,
-                              uint32_t const *gpu_indexes, uint32_t gpu_count,
-                              CudaRadixCiphertextFFI *dst,
-                              CudaRadixCiphertextFFI *src, uint32_t value) {
+__host__ void host_radix_blocks_rotate_left(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
+    CudaRadixCiphertextFFI *src, uint32_t value, uint32_t num_blocks) {
   if (src == dst) {
     PANIC("Cuda error (blocks_rotate_left): the source and destination "
           "pointers should be different");
@@ -140,20 +138,21 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams,
   if (dst->lwe_dimension != src->lwe_dimension)
     PANIC("Cuda error: input and output should have the same "
           "lwe dimension")
+  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
+    PANIC("Cuda error: input and output should have more blocks than asked for "
+          "in the "
+          "function call")
 
   auto lwe_size = src->lwe_dimension + 1;
 
   cuda_set_device(gpu_indexes[0]);
-  radix_blocks_rotate_left<Torus>
-      <<<src->num_radix_blocks, 1024, 0, streams[0]>>>(
-          (Torus *)dst->ptr, (Torus *)src->ptr, value, dst->num_radix_blocks,
-          lwe_size);
+  radix_blocks_rotate_left<Torus><<<num_blocks, 1024, 0, streams[0]>>>(
+      (Torus *)dst->ptr, (Torus *)src->ptr, value, num_blocks, lwe_size);
   check_cuda_error(cudaGetLastError());
 
   // Rotate degrees and noise to follow blocks
-  array_rotate_left(dst->degrees, src->degrees, value, dst->num_radix_blocks);
-  array_rotate_left(dst->noise_levels, src->noise_levels, value,
-                    dst->num_radix_blocks);
+  array_rotate_left(dst->degrees, src->degrees, value, num_blocks);
+  array_rotate_left(dst->noise_levels, src->noise_levels, value, num_blocks);
 }
 
 // rotate radix ciphertext right with specific value
@@ -1836,7 +1835,7 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,
 
   host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
                                         step_output, generates_or_propagates, 1,
-                                        num_blocks, big_lwe_size);
+                                        num_blocks);
   cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                     gpu_indexes[0]);
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -22,14 +22,13 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
 
 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks) {
+    CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks) {
 
   host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), n,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
       (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), num_blocks);
+      (uint64_t **)(ksks));
 }
 
 void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -26,18 +26,14 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
 template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array, uint32_t n,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
     int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks) {
+    Torus *const *ksks) {
 
+  auto num_blocks = lwe_array->num_radix_blocks;
   auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
   auto message_modulus = params.message_modulus;
 
-  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
   size_t num_bits_in_message = (size_t)log2_int(message_modulus);
   size_t total_num_bits = num_bits_in_message * num_blocks;
   n = n % total_num_bits;
@@ -48,7 +44,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
   size_t rotations = n / num_bits_in_message;
   size_t shift_within_block = n % num_bits_in_message;
 
-  Torus *rotated_buffer = (Torus *)mem->tmp_rotated->ptr;
+  auto rotated_buffer = mem->tmp_rotated;
 
   // rotate right all the blocks in radix ciphertext
   // copy result in new buffer
@@ -57,56 +53,56 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
   // one block is responsible to process single lwe ciphertext
   if (mem->shift_type == LEFT_SHIFT) {
     // rotate right as the blocks are from LSB to MSB
-    legacy_host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks);
 
-    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, streams[0],
-                                 gpu_indexes[0]);
+    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
+                                             lwe_array, 0, num_blocks,
+                                             rotated_buffer, 0, num_blocks);
 
     if (shift_within_block == 0) {
       return;
     }
 
     auto receiver_blocks = lwe_array;
     auto giver_blocks = rotated_buffer;
-    legacy_host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, giver_blocks, lwe_array, 1, num_blocks,
-        big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          giver_blocks, lwe_array, 1,
+                                          num_blocks);
 
     auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
 
-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
+        giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
         lut_bivariate->params.message_modulus);
 
   } else {
     // rotate left as the blocks are from LSB to MSB
-    legacy_host_radix_blocks_rotate_left<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks);
 
-    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, streams[0],
-                                 gpu_indexes[0]);
+    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
+                                             lwe_array, 0, num_blocks,
+                                             rotated_buffer, 0, num_blocks);
 
     if (shift_within_block == 0) {
       return;
     }
 
     auto receiver_blocks = lwe_array;
     auto giver_blocks = rotated_buffer;
-    legacy_host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                                giver_blocks, lwe_array, 1,
-                                                num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         giver_blocks, lwe_array, 1,
+                                         num_blocks);
 
     auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
 
-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
+        giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
         lut_bivariate->params.message_modulus);
   }
 }

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -142,8 +142,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
 
   if (mem->shift_type == LEFT_SHIFT) {
     // rotate right as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, &rotated_buffer, lwe_array, rotations);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          &rotated_buffer, lwe_array, rotations,
+                                          num_blocks);
 
     // create trivial assign for value = 0
     if (rotations > 0)
@@ -177,7 +178,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
   } else {
     // right shift
     host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                         &rotated_buffer, lwe_array, rotations);
+                                         &rotated_buffer, lwe_array, rotations,
+                                         num_blocks);
 
     // rotate left as the blocks are from LSB to MSB
     // create trivial assign for value = 0

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -108,7 +108,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
         PANIC("Cuda error: incorrect number of blocks")
       host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
                                             rotated_input, input_bits_b,
-                                            rotations);
+                                            rotations, total_nb_bits);
 
       if (rotations > 0)
         set_zero_radix_ciphertext_slice_async<Torus>(
@@ -120,7 +120,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
         PANIC("Cuda error: incorrect number of blocks")
       host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
                                            rotated_input, input_bits_b,
-                                           rotations);
+                                           rotations, total_nb_bits);
 
       if (mem->is_signed)
         for (int i = 0; i < rotations; i++) {
@@ -140,13 +140,13 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
       // rotate right as the blocks are from LSB to MSB
       host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
                                             rotated_input, input_bits_b,
-                                            rotations);
+                                            rotations, total_nb_bits);
       break;
     case RIGHT_ROTATE:
       // rotate left as the blocks are from LSB to MSB
       host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
                                            rotated_input, input_bits_b,
-                                           rotations);
+                                           rotations, total_nb_bits);
       break;
     default:
       PANIC("Unknown operation")

diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -726,12 +726,11 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
         n: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        num_blocks: u32,
     );
 }
 unsafe extern "C" {

diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -165,21 +165,6 @@ impl CudaRadixCiphertextInfo {
                 .collect(),
         }
     }
-    pub(crate) fn after_scalar_rotate(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|left| CudaBlockInfo {
-                    degree: Degree::new(left.message_modulus.0 - 1),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
     pub(crate) fn after_min_max(&self) -> Self {
         Self {
             blocks: self