chore(gpu): track noise/degree in scalar bitops
agnesLeroy committed Feb 6, 2025
1 parent 75b5b8d commit f0d9f7c
Showing 11 changed files with 140 additions and 149 deletions.
5 changes: 3 additions & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -250,9 +250,10 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
void *const *ksks);

void cleanup_cuda_integer_bitop(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
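Note on the signature change above: the C entry point now receives full CudaRadixCiphertextFFI descriptors rather than raw device pointers, so the block count, degrees and noise levels travel with the ciphertexts, and the bitop type is read from the scratch buffer; the explicit lwe_ciphertext_count and op arguments become redundant. A hedged sketch of the new call shape, where every handle (streams, keys, scratch buffer, ciphertext descriptors, device pointer to the clear blocks) is assumed to come from the usual setup code and is not part of this diff:

// Hypothetical caller: ct_out/ct_in are CudaRadixCiphertextFFI descriptors
// (device pointer, block count, degrees, noise levels); mem_ptr was filled by
// the scratch function and already records which scalar bitop to perform.
cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    streams, gpu_indexes, gpu_count, ct_out, ct_in, d_clear_blocks,
    num_clear_blocks, mem_ptr, bsks, ksks);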
20 changes: 11 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -20,15 +20,10 @@ __host__ void host_integer_radix_bitop_kb(
void *const *bsks, Torus *const *ksks) {

auto lut = mem_ptr->lut;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, lut, lut->params.message_modulus);

uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
lwe_array_out->degrees[i] =
std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
degrees[i] = std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
}
} else if (mem_ptr->op == BITOP_TYPE::BITOR) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -41,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
result = max | j;
}
}
lwe_array_out->degrees[i] = result;
degrees[i] = result;
}
} else if (mem_ptr->op == BITXOR) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -55,9 +50,16 @@ __host__ void host_integer_radix_bitop_kb(
result = max ^ j;
}
}
lwe_array_out->degrees[i] = result;
degrees[i] = result;
}
}

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, lut, lut->params.message_modulus);

memcpy(lwe_array_out->degrees, degrees,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));
}

template <typename Torus>
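For context, the loops above derive the output degree of an encrypted bitop from the operand degrees: AND takes the minimum, while OR and XOR keep the larger degree fixed and try every value up to the smaller one to find the worst case (for degrees 6 and 2, for instance, the OR bound is 6 | 1 = 7, not 6 | 2 = 6). A standalone sketch of that search, using hypothetical helper names that do not appear in the diff:

#include <algorithm>
#include <cstdint>

// Worst-case degree of (a | b) when a <= deg_a and b <= deg_b, mirroring the
// BITOR branch of host_integer_radix_bitop_kb: fix the larger degree and scan
// every value of the smaller operand.
uint64_t bitor_degree_bound(uint64_t deg_a, uint64_t deg_b) {
  const uint64_t max = std::max(deg_a, deg_b);
  const uint64_t min = std::min(deg_a, deg_b);
  uint64_t result = max;
  for (uint64_t j = 0; j <= min; j++)
    result = std::max(result, max | j);
  return result;
}

// Same exhaustive search for XOR, mirroring the BITXOR branch.
uint64_t bitxor_degree_bound(uint64_t deg_a, uint64_t deg_b) {
  const uint64_t max = std::max(deg_a, deg_b);
  const uint64_t min = std::min(deg_a, deg_b);
  uint64_t result = max;
  for (uint64_t j = 0; j <= min; j++)
    result = std::max(result, max ^ j);
  return result;
}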
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -122,7 +122,7 @@ __host__ void host_integer_radix_cmux_kb(

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
mem_ptr->message_extract_lut);
mem_ptr->message_extract_lut, num_radix_blocks);
delete mem_true;
delete mem_false;
}
15 changes: 11 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -353,12 +353,15 @@ __host__ void pack_bivariate_blocks_with_single_block(
check_cuda_error(cudaGetLastError());
}

/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks.
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut) {
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -378,11 +381,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
"lwe dimension")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal"
" to the number of input and output radix blocks")

// In the case of extracting a single LWE these parameters are dummy
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
uint32_t num_radix_blocks = lwe_array_in->num_radix_blocks;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -452,7 +459,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
for (uint i = 0; i < num_radix_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
@@ -1888,7 +1895,7 @@ void host_apply_univariate_lut_kb(cudaStream_t const *streams,

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
mem);
mem, radix_lwe_out->num_radix_blocks);
}

template <typename Torus>
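The num_radix_blocks parameter added above decouples how many blocks the LUT is applied to from the sizes of the input and output ciphertexts, which the scalar bitops exploit when the clear operand decomposes into fewer blocks than the radix ciphertext. A minimal hedged sketch of the call shape; the wrapper name is hypothetical and the streams, keys and LUT are assumed to come from the usual setup code:

// Apply `lut` to only the first `num_active_blocks` blocks of `in`/`out`;
// both ciphertexts may hold more blocks. The callee panics if
// num_active_blocks exceeds either block count and only updates the degrees
// and noise levels of the blocks it actually processed.
template <typename Torus>
void apply_lut_on_prefix(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count,
                         CudaRadixCiphertextFFI *out,
                         CudaRadixCiphertextFFI const *in, void *const *bsks,
                         Torus *const *ksks, int_radix_lut<Torus> *lut,
                         uint32_t num_active_blocks) {
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, out, in, bsks, ksks, lut,
      num_active_blocks);
}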
29 changes: 27 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -36,8 +36,8 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
if (input_radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
PANIC("Cuda error: input radix should have more blocks than the specified "
"range")
if (start_lwe_index >= end_lwe_index)
PANIC("Cuda error: slice range should be strictly positive")
if (start_lwe_index > end_lwe_index)
PANIC("Cuda error: slice range should be non negative")

auto lwe_size = input_radix->lwe_dimension + 1;
output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1;
@@ -80,4 +80,29 @@ void copy_radix_ciphertext_to_larger_output_slice_async(
}
}

// end_lwe_index is inclusive
template <typename Torus>
void set_zero_radix_ciphertext_async(cudaStream_t const stream,
uint32_t const gpu_index,
CudaRadixCiphertextFFI *radix,
const uint32_t start_lwe_index,
const uint32_t end_lwe_index) {
if (radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
PANIC("Cuda error: input radix should have more blocks than the specified "
"range")
if (start_lwe_index > end_lwe_index)
PANIC("Cuda error: slice range should be non negative")

auto lwe_size = radix->lwe_dimension + 1;
auto num_blocks_to_set = end_lwe_index - start_lwe_index + 1;
auto lwe_array_out_block = (Torus *)radix->ptr + start_lwe_index * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
num_blocks_to_set * lwe_size * sizeof(Torus), stream,
gpu_index);
memset(&radix->degrees[start_lwe_index], 0,
num_blocks_to_set * sizeof(uint64_t));
memset(&radix->noise_levels[start_lwe_index], 0,
num_blocks_to_set * sizeof(uint64_t));
}

#endif
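The set_zero_radix_ciphertext_async helper introduced above zeroes a contiguous, inclusive range of blocks and resets the tracked degrees and noise levels in the same call, so callers no longer memset the device buffer and the metadata separately. A hedged usage sketch with hypothetical variable names:

// Zero the two most significant blocks of a 4-block radix ciphertext `ct`
// (both block indices are inclusive). `stream` and `gpu_index` are assumed
// to exist in the surrounding code.
set_zero_radix_ciphertext_async<uint64_t>(stream, gpu_index, ct,
                                          /*start_lwe_index=*/2,
                                          /*end_lwe_index=*/3);
// Afterwards ct->degrees[2..3] and ct->noise_levels[2..3] are 0 and the
// corresponding LWE blocks hold trivial encryptions of zero.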
15 changes: 7 additions & 8 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -2,15 +2,14 @@

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
void *const *ksks) {

host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_input),
static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, op);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
num_clear_blocks, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
81 changes: 59 additions & 22 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -7,45 +7,82 @@
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
Torus const *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, BITOP_TYPE op) {
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {

if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
if (output->lwe_dimension != input->lwe_dimension)
PANIC("Cuda error: input and output num radix blocks must be equal")
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;

uint32_t lwe_size = big_lwe_dimension + 1;
auto op = mem_ptr->op;
auto num_radix_blocks = output->num_radix_blocks;

if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
0, num_radix_blocks - 1);
} else {
cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
num_radix_blocks * lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
if (input != output)
copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
streams[0], gpu_indexes[0], output, input, 0);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
// as an index to recover the right one
uint64_t degrees[num_clear_blocks];
uint64_t clear_degrees[num_clear_blocks];
cuda_memcpy_async_to_cpu(&clear_degrees, clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
if (mem_ptr->op == BITOP_TYPE::SCALAR_BITAND) {
for (uint i = 0; i < num_clear_blocks; i++) {
degrees[i] = std::min(clear_degrees[i], input->degrees[i]);
}
} else if (mem_ptr->op == BITOP_TYPE::SCALAR_BITOR) {
for (uint i = 0; i < num_clear_blocks; i++) {
auto max = std::max(clear_degrees[i], input->degrees[i]);
auto min = std::min(clear_degrees[i], input->degrees[i]);
auto result = max;

for (uint j = 0; j < min + 1; j++) {
if ((max | j) > result) {
result = max | j;
}
}
degrees[i] = result;
}
} else if (mem_ptr->op == SCALAR_BITXOR) {
for (uint i = 0; i < num_clear_blocks; i++) {
auto max = std::max(clear_degrees[i], input->degrees[i]);
auto min = std::min(clear_degrees[i], input->degrees[i]);
auto result = max;

// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
degrees[i] = result;
}
}
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes, 0);

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
ksks, num_clear_blocks, lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));

if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
num_clear_blocks,
num_radix_blocks - 1);
}
}
}
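In the scalar path above, the decomposed clear blocks serve two purposes: copied into lut->get_lut_indexes they select, per block, which pre-computed LUT the PBS evaluates, and copied back to the CPU they bound the output degrees just as in the encrypted bitop. A rough CPU-side model of the index-selection idea, with hypothetical names and no GPU transfers:

#include <cstdint>
#include <vector>

// Conceptual model only: one pre-computed table per possible clear block
// value, where lut_for[c][x] == x OP c for the chosen scalar bit-operation.
uint64_t apply_scalar_bitop_block(
    const std::vector<std::vector<uint64_t>> &lut_for,
    uint64_t encrypted_block_value, uint64_t clear_block) {
  // On the GPU the clear block is written into the LUT index buffer and plays
  // exactly this role: it picks which pre-computed LUT the PBS applies.
  return lut_for[clear_block][encrypted_block_value];
}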
6 changes: 2 additions & 4 deletions backends/tfhe-cuda-backend/src/bindings.rs
@@ -639,15 +639,13 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_input: *const ffi::c_void,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_input: *const CudaRadixCiphertextFFI,
clear_blocks: *const ffi::c_void,
num_clear_blocks: u32,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
lwe_ciphertext_count: u32,
op: BITOP_TYPE,
);
}
unsafe extern "C" {
83 changes: 0 additions & 83 deletions tfhe/src/integer/gpu/ciphertext/info.rs
@@ -311,89 +311,6 @@ impl CudaRadixCiphertextInfo {
.collect(),
}
}
pub(crate) fn after_scalar_bitand<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitand(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

pub(crate) fn after_scalar_bitor<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitor(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

pub(crate) fn after_scalar_bitxor<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitxor(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

// eq/ne, and comparisons return a ciphertext that encrypts a 0 or 1, so the first block
// (least significant) has a degree of 1, the other blocks should be trivial lwe encrypting 0,