diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index c349a426a2..9efc096a40 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -148,9 +148,8 @@ void cuda_negate_integer_radix_ciphertext_64(
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
 
 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ff01505874..a37ce04d80 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1441,7 +1441,7 @@ void host_compute_propagation_simulators_and_group_carries(
 
   auto scalar_array_cum_sum = mem->scalar_array_cum_sum;
   auto big_lwe_dimension = big_lwe_size - 1;
-  host_integer_radix_scalar_addition_inplace(
+  legacy_host_integer_radix_scalar_addition_inplace(
       streams, gpu_indexes, gpu_count, propagation_cum_sums,
       scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks,
       message_modulus, carry_modulus);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
index cae83b55bd..72e3d2513b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -2,13 +2,11 @@
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
 
   host_integer_radix_scalar_addition_inplace(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array),
-      static_cast<const uint64_t *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+      static_cast<const uint64_t *>(scalar_input), num_scalars, message_modulus,
+      carry_modulus);
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
index 32b0443db4..a05dab4a3d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -24,7 +24,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }
 
 template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
+__host__ void legacy_host_integer_radix_scalar_addition_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
     uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -49,6 +49,42 @@ __host__ void host_integer_radix_scalar_addition_inplace(
                                       delta);
   check_cuda_error(cudaGetLastError());
 }
+template <typename Torus>
+__host__ void host_integer_radix_scalar_addition_inplace(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
+    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  if (lwe_array->num_radix_blocks < num_scalars)
+    PANIC("Cuda error: num scalars should be smaller or equal to input num "
+          "radix blocks")
+  cuda_set_device(gpu_indexes[0]);
+
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = num_scalars;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  // Value of the shift we multiply our messages by
+  // If message_modulus and carry_modulus are always powers of 2 we can simplify
+  // this
+  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
+
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
+                                      num_scalars, lwe_array->lwe_dimension,
+                                      delta);
+  check_cuda_error(cudaGetLastError());
+  Torus scalar_input_cpu[num_scalars];
+  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
+                           num_scalars * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  for (uint i = 0; i < num_scalars; i++) {
+    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
+  }
+}
 
 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 56500dc023..86ae238eed 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -396,10 +396,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
         scalar_input: *const ffi::c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
+        num_scalars: u32,
         message_modulus: u32,
         carry_modulus: u32,
     );
diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
index 7be18d2a5c..164ff021e1 100644
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -2,7 +2,6 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::server_key::TwosComplementNegation;
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::{CarryModulus, MessageModulus, PBSOrder};
-use itertools::Itertools;
 
 #[derive(Clone, Copy)]
 pub struct CudaBlockInfo {
@@ -228,33 +227,6 @@ impl CudaRadixCiphertextInfo {
         }
     }
 
-    pub(crate) fn after_scalar_add<T>(&self, scalar: T) -> Self
-    where
-        T: DecomposableInto<u8>,
-    {
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        let decomposer =
-            BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message).iter_as::<u8>();
-        let mut scalar_composed = decomposer.collect_vec();
-        scalar_composed.resize(self.blocks.len(), 0);
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(scalar_composed)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: Degree::new(left.degree.get() + u64::from(scalar_block)),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_scalar_mul(&self) -> Self {
         Self {
             blocks: self
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index f9353bf7ea..849bfe00f9 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -228,16 +228,15 @@ where
/// is required
pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     scalar_input: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
+    num_scalars: u32,
     message_modulus: u32,
     carry_modulus: u32,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        lwe_array.gpu_index(0),
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -245,17 +244,29 @@ pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
         scalar_input.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
     cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        lwe_array.as_mut_c_ptr(0),
+        &mut cuda_ffi_lwe_array,
         scalar_input.as_c_ptr(0),
-        lwe_dimension.0 as u32,
-        num_samples,
+        num_scalars,
         message_modulus,
         carry_modulus,
     );
+    update_noise_degree(lwe_array, &cuda_ffi_lwe_array);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
index 1a93403e7b..696cf10121 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
@@ -92,20 +92,16 @@ impl CudaServerKey {
             .collect::<Vec<_>>();
         d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams, 0);
 
-        let lwe_dimension = ct.as_ref().d_blocks.lwe_dimension();
         // If the scalar is decomposed using less than the number of blocks our ciphertext
         // has, we just don't touch ciphertext's last blocks
         scalar_addition_integer_radix_assign_async(
             streams,
-            &mut ct.as_mut().d_blocks.0.d_vec,
+            ct.as_mut(),
             &d_decomposed_scalar,
-            lwe_dimension,
             decomposed_scalar.len() as u32,
             self.message_modulus.0 as u32,
             self.carry_modulus.0 as u32,
         );
-
-        ct.as_mut().info = ct.as_ref().info.after_scalar_add(scalar);
     }
 }
 
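
Note on the degree bookkeeping moved by this patch: the per-block degree update that previously lived in after_scalar_add on the Rust side is now performed by host_integer_radix_scalar_addition_inplace, which copies the decomposed scalar blocks back to the CPU and adds each one to the matching block degree. The standalone C++ sketch below illustrates only that host-side arithmetic; the names RadixMetadata and update_degrees_after_scalar_add are hypothetical stand-ins and not part of the patch, and the kernel launch and device-to-host copy are omitted.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the degree metadata carried by
// CudaRadixCiphertextFFI; only the per-block degrees are modeled here.
struct RadixMetadata {
  std::vector<uint64_t> degrees; // one entry per radix block
};

// Mirrors the loop added at the end of
// host_integer_radix_scalar_addition_inplace: each of the first num_scalars
// block degrees grows by the scalar block that was added to that block.
void update_degrees_after_scalar_add(RadixMetadata &meta,
                                     const std::vector<uint64_t> &scalar_blocks) {
  for (size_t i = 0; i < scalar_blocks.size() && i < meta.degrees.size(); ++i)
    meta.degrees[i] += scalar_blocks[i];
}

int main() {
  RadixMetadata meta{{3, 1, 0, 0}};          // degrees of a 4-block ciphertext
  std::vector<uint64_t> scalar_blocks{2, 1}; // scalar decomposed over 2 blocks
  update_degrees_after_scalar_add(meta, scalar_blocks);
  for (uint64_t d : meta.degrees)
    std::printf("%llu ", (unsigned long long)d); // prints: 5 2 0 0
  return 0;
}

The bound check inside the loop stands in for the guard in the real function, which panics when num_scalars exceeds the number of radix blocks.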