Commit be719ee

chore(gpu): track noise/degree through scalar add
agnesLeroy committed Feb 6, 2025
1 parent 60c3721 commit be719ee
Showing 8 changed files with 66 additions and 55 deletions.
5 changes: 2 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -148,9 +148,8 @@ void cuda_negate_integer_radix_ciphertext_64(
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
 
 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
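
Note: the raw void *lwe_array pointer and the explicit lwe_dimension/lwe_ciphertext_count arguments are folded into a single CudaRadixCiphertextFFI descriptor that carries the block metadata itself. A rough sketch of that descriptor, limited to the fields this commit actually touches (ptr, lwe_dimension, num_radix_blocks, degrees, plus the noise levels handled on the Rust side); the real definition lives in the backend, and the field order and exact types here are assumptions:

    // Sketch only: fields inferred from their uses in this commit. The actual
    // definition lives in tfhe-cuda-backend; layout and types may differ.
    #[repr(C)]
    pub struct CudaRadixCiphertextFFI {
        pub ptr: *mut std::ffi::c_void, // device pointer to the LWE blocks
        pub degrees: *mut u64,          // per-block degrees, updated by the backend
        pub noise_levels: *mut u64,     // per-block noise levels
        pub num_radix_blocks: u32,      // replaces lwe_ciphertext_count
        pub lwe_dimension: u32,         // replaces the explicit lwe_dimension arg
    }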
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1441,7 +1441,7 @@ void host_compute_propagation_simulators_and_group_carries(
   auto scalar_array_cum_sum = mem->scalar_array_cum_sum;
   auto big_lwe_dimension = big_lwe_size - 1;
 
-  host_integer_radix_scalar_addition_inplace<Torus>(
+  legacy_host_integer_radix_scalar_addition_inplace<Torus>(
       streams, gpu_indexes, gpu_count, propagation_cum_sums,
       scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks,
       message_modulus, carry_modulus);
12 changes: 5 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -2,13 +2,11 @@
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
 
   host_integer_radix_scalar_addition_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array),
-      static_cast<const uint64_t *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+      static_cast<const uint64_t *>(scalar_input), num_scalars, message_modulus,
+      carry_modulus);
 }
38 changes: 37 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -24,7 +24,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }
 
 template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
+__host__ void legacy_host_integer_radix_scalar_addition_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
     uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -49,6 +49,42 @@ __host__ void host_integer_radix_scalar_addition_inplace(
       delta);
   check_cuda_error(cudaGetLastError());
 }
+template <typename Torus>
+__host__ void host_integer_radix_scalar_addition_inplace(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
+    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  if (lwe_array->num_radix_blocks < num_scalars)
+    PANIC("Cuda error: num scalars should be smaller or equal to input num "
+          "radix blocks")
+  cuda_set_device(gpu_indexes[0]);
+
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = num_scalars;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  // Value of the shift we multiply our messages by
+  // If message_modulus and carry_modulus are always powers of 2 we can simplify
+  // this
+  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
+
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
+                                      num_scalars, lwe_array->lwe_dimension,
+                                      delta);
+  check_cuda_error(cudaGetLastError());
+  Torus scalar_input_cpu[num_scalars];
+  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
+                           num_scalars * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  for (uint i = 0; i < num_scalars; i++) {
+    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
+  }
+}
 
 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
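
Note: the new overload keeps the kernel launch unchanged but copies the scalar blocks back to the host to bump each block's degree by the scalar value it received. For context, a block's degree is the largest cleartext it can currently hold, and the cleartext sits in the top bits of the torus, shifted by delta. A minimal stand-alone model of that bookkeeping (assuming power-of-two moduli, as the in-code comment suggests; this is not backend code):

    // Toy model of the degree tracking added above, not backend code.
    fn encode(msg: u64, message_modulus: u64, carry_modulus: u64) -> u64 {
        // Same delta as in the kernel: the cleartext lives in the top bits.
        let delta = (1u64 << 63) / (message_modulus * carry_modulus);
        msg.wrapping_mul(delta)
    }

    fn main() {
        let (msg_mod, carry_mod) = (4u64, 4u64);
        let mut body = encode(1, msg_mod, carry_mod); // block currently holds 1
        let mut degree = 1u64;                        // so its degree bound is 1
        let scalar_block = 2u64;
        // What the device kernel does to each block's body:
        body = body.wrapping_add(encode(scalar_block, msg_mod, carry_mod));
        // What the host now records, mirroring `degrees[i] += scalar_input_cpu[i]`:
        degree += scalar_block;
        assert_eq!(degree, 3);
        let _ = body;
    }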
5 changes: 2 additions & 3 deletions backends/tfhe-cuda-backend/src/bindings.rs
@@ -396,10 +396,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
         scalar_input: *const ffi::c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
+        num_scalars: u32,
         message_modulus: u32,
         carry_modulus: u32,
     );
28 changes: 0 additions & 28 deletions tfhe/src/integer/gpu/ciphertext/info.rs
@@ -2,7 +2,6 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::server_key::TwosComplementNegation;
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::{CarryModulus, MessageModulus, PBSOrder};
-use itertools::Itertools;
 
 #[derive(Clone, Copy)]
 pub struct CudaBlockInfo {
@@ -228,33 +227,6 @@ impl CudaRadixCiphertextInfo {
         }
     }
 
-    pub(crate) fn after_scalar_add<T>(&self, scalar: T) -> Self
-    where
-        T: DecomposableInto<u8>,
-    {
-        let message_modulus = self.blocks.first().unwrap().message_modulus;
-        let bits_in_message = message_modulus.0.ilog2();
-        let decomposer =
-            BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message).iter_as::<u8>();
-        let mut scalar_composed = decomposer.collect_vec();
-        scalar_composed.resize(self.blocks.len(), 0);
-
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .zip(scalar_composed)
-                .map(|(left, scalar_block)| CudaBlockInfo {
-                    degree: Degree::new(left.degree.get() + u64::from(scalar_block)),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: left.noise_level,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_scalar_mul(&self) -> Self {
         Self {
             blocks: self
25 changes: 18 additions & 7 deletions tfhe/src/integer/gpu/mod.rs
@@ -228,34 +228,45 @@ where
 /// is required
 pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     scalar_input: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
+    num_scalars: u32,
     message_modulus: u32,
     carry_modulus: u32,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        lwe_array.gpu_index(0),
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
         streams.gpu_indexes[0],
         scalar_input.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
    );
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
     cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        lwe_array.as_mut_c_ptr(0),
+        &mut cuda_ffi_lwe_array,
         scalar_input.as_c_ptr(0),
-        lwe_dimension.0 as u32,
-        num_samples,
+        num_scalars,
         message_modulus,
         carry_modulus,
     );
+    update_noise_degree(lwe_array, &cuda_ffi_lwe_array);
 }
 
 #[allow(clippy::too_many_arguments)]
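
Note: the wrapper now flattens the per-block degrees and noise levels into plain vectors, hands them to the backend through the FFI descriptor (prepare_cuda_radix_ffi), and copies the backend's updates back afterwards (update_noise_degree). A simplified sketch of that round trip, with the actual FFI call elided and BlockInfo standing in for the real per-block info type:

    // Simplified model of the metadata round trip, not the tfhe-rs code.
    struct BlockInfo {
        degree: u64,
        noise_level: u64,
    }

    fn scalar_add_with_tracking(blocks: &mut [BlockInfo]) {
        // Flatten, as prepare_cuda_radix_ffi does before the FFI call.
        let mut degrees: Vec<u64> = blocks.iter().map(|b| b.degree).collect();
        let noise_levels: Vec<u64> = blocks.iter().map(|b| b.noise_level).collect();

        // ... FFI call goes here: the backend mutates these arrays in place
        // through the raw pointers in the FFI struct ...
        for d in degrees.iter_mut() {
            *d += 1; // placeholder for the backend's update
        }

        // Write back, as update_noise_degree does after the call.
        for (b, (d, n)) in blocks.iter_mut().zip(degrees.iter().zip(&noise_levels)) {
            b.degree = *d;
            b.noise_level = *n;
        }
    }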
6 changes: 1 addition & 5 deletions tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
@@ -92,20 +92,16 @@ impl CudaServerKey {
             .collect::<Vec<_>>();
         d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams, 0);
 
-        let lwe_dimension = ct.as_ref().d_blocks.lwe_dimension();
         // If the scalar is decomposed using less than the number of blocks our ciphertext
         // has, we just don't touch ciphertext's last blocks
         scalar_addition_integer_radix_assign_async(
             streams,
-            &mut ct.as_mut().d_blocks.0.d_vec,
+            ct.as_mut(),
             &d_decomposed_scalar,
-            lwe_dimension,
             decomposed_scalar.len() as u32,
             self.message_modulus.0 as u32,
             self.carry_modulus.0 as u32,
         );
-
-        ct.as_mut().info = ct.as_ref().info.after_scalar_add(scalar);
     }
 }
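
Note: the comment in this hunk refers to the base-message_modulus decomposition of the scalar. It stops early once the remaining scalar is zero, so trailing ciphertext blocks are left untouched, which is exactly why the zero-padding degree update in the deleted after_scalar_add is no longer needed: the backend now only bumps the degrees of the blocks that actually receive a scalar block. A stand-alone model of that decomposition (the real code uses BlockDecomposer::with_early_stop_at_zero; message_modulus is assumed to be a power of two):

    // Stand-alone model of the scalar block decomposition, not the tfhe-rs code.
    fn decompose(mut scalar: u64, message_modulus: u64, num_blocks: usize) -> Vec<u64> {
        let mut blocks = Vec::new();
        while scalar != 0 && blocks.len() < num_blocks {
            blocks.push(scalar % message_modulus); // low block first
            scalar /= message_modulus;
        }
        blocks // shorter than num_blocks => trailing ciphertext blocks untouched
    }

    fn main() {
        // 11 = 2*4 + 3, so base-4 blocks are [3, 2]; a 4-block ciphertext keeps
        // its last two blocks as-is.
        assert_eq!(decompose(11, 4, 4), vec![3, 2]);
    }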
