From f0d9f7c5489b11cc39b26102852e78cf8ffd0ff6 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Tue, 4 Feb 2025 09:24:03 +0100
Subject: [PATCH] chore(gpu): track noise/degree in scalar bitops

---
 .../cuda/include/integer/integer.h            |  5 +-
 .../cuda/src/integer/bitwise_ops.cuh          | 20 +++--
 .../cuda/src/integer/cmux.cuh                 |  2 +-
 .../cuda/src/integer/integer.cuh              | 15 +++-
 .../cuda/src/integer/radix_ciphertext.cuh     | 29 ++++++-
 .../cuda/src/integer/scalar_bitops.cu         | 15 ++--
 .../cuda/src/integer/scalar_bitops.cuh        | 81 +++++++++++++-----
 backends/tfhe-cuda-backend/src/bindings.rs    |  6 +-
 tfhe/src/integer/gpu/ciphertext/info.rs       | 83 -------------------
 tfhe/src/integer/gpu/mod.rs                   | 23 +++--
 .../gpu/server_key/radix/scalar_bitwise_op.rs | 10 +--
 11 files changed, 140 insertions(+), 149 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index 018efe1e80..c349a426a2 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -250,9 +250,10 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
 
 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
+    CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
     uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
+    void *const *ksks);
 
 void cleanup_cuda_integer_bitop(void *const *streams,
                                 uint32_t const *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
index 1d7f85fa88..e39e33a8c8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -20,15 +20,10 @@ __host__ void host_integer_radix_bitop_kb(
     void *const *bsks, Torus *const *ksks) {
 
   auto lut = mem_ptr->lut;
-
-  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
-      bsks, ksks, lut, lut->params.message_modulus);
-
+  uint64_t degrees[lwe_array_1->num_radix_blocks];
   if (mem_ptr->op == BITOP_TYPE::BITAND) {
     for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
-      lwe_array_out->degrees[i] =
-          std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
+      degrees[i] = std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
     }
   } else if (mem_ptr->op == BITOP_TYPE::BITOR) {
     for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -41,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
           result = max | j;
         }
       }
-      lwe_array_out->degrees[i] = result;
+      degrees[i] = result;
     }
   } else if (mem_ptr->op == BITXOR) {
     for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -55,9 +50,16 @@ __host__ void host_integer_radix_bitop_kb(
           result = max ^ j;
         }
       }
-      lwe_array_out->degrees[i] = result;
+      degrees[i] = result;
     }
   }
+
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
+      bsks, ksks, lut, lut->params.message_modulus);
+
+  memcpy(lwe_array_out->degrees, degrees,
+         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
 }
 
 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
index 6c68e7dccf..2977fdca85 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -122,7 +122,7 @@ __host__ void host_integer_radix_cmux_kb(
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
-      mem_ptr->message_extract_lut);
+      mem_ptr->message_extract_lut, num_radix_blocks);
 
   delete mem_true;
   delete mem_false;
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 1be62a1171..ff01505874 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -353,12 +353,15 @@ __host__ void pack_bivariate_blocks_with_single_block(
   check_cuda_error(cudaGetLastError());
 }
 
+/// num_radix_blocks corresponds to the number of blocks on which to apply the
+/// LUT. In scalar bitops we use a number of blocks that may be lower than or
+/// equal to the input and output numbers of blocks.
 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
-    Torus *const *ksks, int_radix_lut<Torus> *lut) {
+    Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
   // apply_lookup_table
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
@@ -378,11 +381,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
     PANIC("Cuda error: input and output radix ciphertexts should have the same "
           "lwe dimension")
+  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
+      num_radix_blocks > lwe_array_in->num_radix_blocks)
+    PANIC("Cuda error: num radix blocks on which lut is applied should be "
+          "smaller than or equal"
+          " to the number of input and output radix blocks")
 
   // In the case of extracting a single LWE this parameters are dummy
   uint32_t num_many_lut = 1;
   uint32_t lut_stride = 0;
-  uint32_t num_radix_blocks = lwe_array_in->num_radix_blocks;
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -452,7 +459,7 @@
   cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
                            lut->num_blocks * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
-  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
+  for (uint i = 0; i < num_radix_blocks; i++) {
     lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
   }
 }
@@ -1888,7 +1895,7 @@ void host_apply_univariate_lut_kb(cudaStream_t const *streams,
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
-      mem);
+      mem, radix_lwe_out->num_radix_blocks);
 }
 
 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
index 8949c8968f..baa428afc2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -36,8 +36,8 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
   if (input_radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
     PANIC("Cuda error: input radix should have more blocks than the specified "
           "range")
-  if (start_lwe_index >= end_lwe_index)
-    PANIC("Cuda error: slice range should be strictly positive")
+  if (start_lwe_index > end_lwe_index)
+    PANIC("Cuda error: slice range should be non-negative")
 
   auto lwe_size = input_radix->lwe_dimension + 1;
   output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1;
@@ -80,4 +80,29 @@ void copy_radix_ciphertext_to_larger_output_slice_async(
   }
 }
 
+// end_lwe_index is inclusive
+template <typename Torus>
+void set_zero_radix_ciphertext_async(cudaStream_t const stream,
+                                     uint32_t const gpu_index,
+                                     CudaRadixCiphertextFFI *radix,
+                                     const uint32_t start_lwe_index,
+                                     const uint32_t end_lwe_index) {
+  if (radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
+    PANIC("Cuda error: input radix should have more blocks than the specified "
+          "range")
+  if (start_lwe_index > end_lwe_index)
+    PANIC("Cuda error: slice range should be non-negative")
+
+  auto lwe_size = radix->lwe_dimension + 1;
+  auto num_blocks_to_set = end_lwe_index - start_lwe_index + 1;
+  auto lwe_array_out_block = (Torus *)radix->ptr + start_lwe_index * lwe_size;
+  cuda_memset_async(lwe_array_out_block, 0,
+                    num_blocks_to_set * lwe_size * sizeof(Torus), stream,
+                    gpu_index);
+  memset(&radix->degrees[start_lwe_index], 0,
+         num_blocks_to_set * sizeof(uint64_t));
+  memset(&radix->noise_levels[start_lwe_index], 0,
+         num_blocks_to_set * sizeof(uint64_t));
+}
+
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
index 1e992a3f61..e64d98f8b9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -2,15 +2,14 @@
 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
+    CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
     uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
+    void *const *ksks) {
 
   host_integer_radix_scalar_bitop_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_input),
-      static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      lwe_ciphertext_count, op);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
+      lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
+      num_clear_blocks, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks));
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
index 24673f4bde..ec25dc24de 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -7,45 +7,82 @@
 template <typename Torus>
 __host__ void host_integer_radix_scalar_bitop_kb(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
-    Torus const *clear_blocks, uint32_t num_clear_blocks,
-    int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
-    uint32_t num_radix_blocks, BITOP_TYPE op) {
+    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
+    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks) {
+  if (output->num_radix_blocks != input->num_radix_blocks)
+    PANIC("Cuda error: input and output num radix blocks must be equal")
+  if (output->lwe_dimension != input->lwe_dimension)
+    PANIC("Cuda error: input and output lwe dimensions must be equal")
   auto lut = mem_ptr->lut;
-  auto params = lut->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-
-  uint32_t lwe_size = big_lwe_dimension + 1;
+  auto op = mem_ptr->op;
+  auto num_radix_blocks = output->num_radix_blocks;
 
   if (num_clear_blocks == 0) {
     if (op == SCALAR_BITAND) {
-      cuda_memset_async(lwe_array_out, 0,
-                        num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
-                        gpu_indexes[0]);
+      set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                             output, 0, num_radix_blocks - 1);
     } else {
-      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
-                                   num_radix_blocks * lwe_size * sizeof(Torus),
-                                   streams[0], gpu_indexes[0]);
+      if (input != output)
+        copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
+            streams[0], gpu_indexes[0], output, input, 0);
     }
   } else {
     // We have all possible LUTs pre-computed and we use the decomposed scalar
     // as index to recover the right one
+    uint64_t degrees[num_clear_blocks];
+    uint64_t clear_degrees[num_clear_blocks];
+    cuda_memcpy_async_to_cpu(&clear_degrees, clear_blocks,
+                             num_clear_blocks * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    if (mem_ptr->op == BITOP_TYPE::SCALAR_BITAND) {
+      for (uint i = 0; i < num_clear_blocks; i++) {
+        degrees[i] = std::min(clear_degrees[i], input->degrees[i]);
+      }
+    } else if (mem_ptr->op == BITOP_TYPE::SCALAR_BITOR) {
+      for (uint i = 0; i < num_clear_blocks; i++) {
+        auto max = std::max(clear_degrees[i], input->degrees[i]);
+        auto min = std::min(clear_degrees[i], input->degrees[i]);
+        auto result = max;
+
+        for (uint j = 0; j < min + 1; j++) {
+          if ((max | j) > result) {
+            result = max | j;
+          }
+        }
+        degrees[i] = result;
+      }
+    } else if (mem_ptr->op == SCALAR_BITXOR) {
+      for (uint i = 0; i < num_clear_blocks; i++) {
+        auto max = std::max(clear_degrees[i], input->degrees[i]);
+        auto min = std::min(clear_degrees[i], input->degrees[i]);
+        auto result = max;
+
+        // Try every possibility to find the worst case
+        for (uint j = 0; j < min + 1; j++) {
+          if ((max ^ j) > result) {
+            result = max ^ j;
+          }
+        }
+        degrees[i] = result;
+      }
+    }
     cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                  num_clear_blocks * sizeof(Torus), streams[0],
                                  gpu_indexes[0]);
 
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
-        ksks, num_clear_blocks, lut);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
+        num_clear_blocks);
+    memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
 
     if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
-      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
-      cuda_memset_async(lwe_array_out_block, 0,
-                        (num_radix_blocks - num_clear_blocks) * lwe_size *
-                            sizeof(Torus),
-                        streams[0], gpu_indexes[0]);
+      set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                             output, num_clear_blocks,
+                                             num_radix_blocks - 1);
     }
   }
 }
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 75a0b329af..56500dc023 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -639,15 +639,13 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_array_input: *const ffi::c_void,
+        lwe_array_out: *mut CudaRadixCiphertextFFI,
+        lwe_array_input: *const CudaRadixCiphertextFFI,
         clear_blocks: *const ffi::c_void,
         num_clear_blocks: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut ffi::c_void,
         ksks: *const *mut ffi::c_void,
-        lwe_ciphertext_count: u32,
-        op: BITOP_TYPE,
     );
 }
 unsafe extern "C" {
diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
index 7fccab4363..7be18d2a5c 100644
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -311,89 +311,6 @@ impl CudaRadixCiphertextInfo {
                 .collect(),
         }
     }
 
-    pub(crate) fn after_scalar_bitand<T>(&self, scalar: T) -> Self
-    where
-        T: DecomposableInto<u8>,
-    {
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
-            .iter_as::<u8>()
-            .chain(std::iter::repeat(0u8));
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(decomposer)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: left
-                        .degree
-                        .after_bitand(Degree::new(u64::from(scalar_block))),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_scalar_bitor<T>(&self, scalar: T) -> Self
-    where
-        T: DecomposableInto<u8>,
-    {
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
-            .iter_as::<u8>()
-            .chain(std::iter::repeat(0u8));
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(decomposer)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: left
-                        .degree
-                        .after_bitor(Degree::new(u64::from(scalar_block))),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
-    pub(crate) fn after_scalar_bitxor<T>(&self, scalar: T) -> Self
-    where
-        T: DecomposableInto<u8>,
-    {
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
-            .iter_as::<u8>()
-            .chain(std::iter::repeat(0u8));
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(decomposer)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: left
-                        .degree
-                        .after_bitxor(Degree::new(u64::from(scalar_block))),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
     // eq/ne, and comparisons returns a ciphertext that encrypts a 0 or 1, so the first block
     // (least significant) has a degree of 1, the other blocks should be trivial lwe encrypting 0,
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 188e41c736..0eea69e1a4 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -880,7 +880,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
     B: Numeric,
 >(
     streams: &CudaStreams,
-    radix_lwe: &mut CudaVec<T>,
+    radix_lwe: &mut CudaRadixCiphertext,
     clear_blocks: &CudaVec<T>,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
@@ -901,7 +901,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_lwe.gpu_index(0),
+        radix_lwe.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -920,6 +920,18 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut radix_lwe_degrees = radix_lwe.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut radix_lwe_noise_levels = radix_lwe
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_lwe = prepare_cuda_radix_ffi(
+        radix_lwe,
+        &mut radix_lwe_degrees,
+        &mut radix_lwe_noise_levels,
+    );
     scratch_cuda_integer_radix_bitop_kb_64(
         streams.ptr.as_ptr(),
         streams
@@ -955,15 +967,13 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        radix_lwe.as_mut_c_ptr(0),
-        radix_lwe.as_mut_c_ptr(0),
+        &mut cuda_ffi_radix_lwe,
+        &cuda_ffi_radix_lwe,
         clear_blocks.as_c_ptr(0),
         min(clear_blocks.len() as u32, num_blocks),
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
-        num_blocks,
-        op as u32,
     );
     cleanup_cuda_integer_bitop(
         streams.ptr.as_ptr(),
@@ -976,6 +986,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
         streams.len() as u32,
         std::ptr::addr_of_mut!(mem_ptr),
     );
+    update_noise_degree(radix_lwe, &cuda_ffi_radix_lwe);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs
index 337abdd928..4260eb9aa6 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs
@@ -37,7 +37,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_scalar_bitop_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     &clear_blocks,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -64,7 +64,7 @@ impl CudaServerKey {
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_scalar_bitop_integer_radix_kb_assign_async(
                     streams,
-                    &mut ct.as_mut().d_blocks.0.d_vec,
+                    ct.as_mut(),
                     &clear_blocks,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
@@ -117,7 +117,6 @@ impl CudaServerKey {
     {
         unsafe {
             self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
-            ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
         }
         streams.synchronize();
     }
@@ -143,7 +142,6 @@ impl CudaServerKey {
     {
         unsafe {
             self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
-            ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
         }
         streams.synchronize();
     }
@@ -174,7 +172,6 @@ impl CudaServerKey {
     {
         unsafe {
             self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
-            ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
         }
         streams.synchronize();
     }
@@ -196,7 +193,6 @@ impl CudaServerKey {
             self.full_propagate_assign_async(ct, streams);
         }
         self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
-        ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
     }
 
     pub fn scalar_bitand_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
     where
@@ -237,7 +233,6 @@ impl CudaServerKey {
             self.full_propagate_assign_async(ct, streams);
         }
         self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
-        ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
     }
 
     pub fn scalar_bitor_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
     where
@@ -278,7 +273,6 @@ impl CudaServerKey {
             self.full_propagate_assign_async(ct, streams);
         }
         self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
-        ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
     }
 
     pub fn scalar_bitxor_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
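
Reviewer note (not part of the patch): the per-block degree bookkeeping that this change moves into the CUDA host code can be exercised in isolation. Below is a minimal, standalone C++ sketch of the same worst-case search the new scalar-bitop code performs for each block; the helper names (bitand_degree, bitor_degree, bitxor_degree) are illustrative only and do not exist in the codebase.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Worst-case degree of a | b given only the per-operand upper bounds
// max_a and max_b: keep the larger bound and try OR-ing it with every
// value the smaller operand can reach, keeping the largest result.
uint64_t bitor_degree(uint64_t max_a, uint64_t max_b) {
  uint64_t hi = std::max(max_a, max_b);
  uint64_t lo = std::min(max_a, max_b);
  uint64_t result = hi;
  for (uint64_t j = 0; j <= lo; j++)
    result = std::max(result, hi | j);
  return result;
}

// Same exhaustive search for XOR. Note the parentheses: `^` and `|` bind
// less tightly than `>` in C++, so the comparison must be written
// `(hi ^ j) > result`, as in the corrected hunks above.
uint64_t bitxor_degree(uint64_t max_a, uint64_t max_b) {
  uint64_t hi = std::max(max_a, max_b);
  uint64_t lo = std::min(max_a, max_b);
  uint64_t result = hi;
  for (uint64_t j = 0; j <= lo; j++)
    result = std::max(result, hi ^ j);
  return result;
}

// AND can only clear bits, so the bound is the smaller of the two degrees.
uint64_t bitand_degree(uint64_t max_a, uint64_t max_b) {
  return std::min(max_a, max_b);
}

int main() {
  // Example with 2-bit message blocks: a ciphertext block of degree 2
  // combined with a clear block of value 3 can reach 3 for OR and XOR,
  // while AND stays bounded by 2.
  std::cout << bitor_degree(2, 3) << " " << bitxor_degree(2, 3) << " "
            << bitand_degree(2, 3) << "\n"; // prints "3 3 2"
}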