chore(gpu): track noise/degree in scalar bitops
agnesLeroy committed Feb 6, 2025
1 parent 75b5b8d commit f0d9f7c
Showing 11 changed files with 140 additions and 149 deletions.
5 changes: 3 additions & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -250,9 +250,10 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
void *const *ksks);

void cleanup_cuda_integer_bitop(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
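Note on the signature change above: the C entry point now receives full CudaRadixCiphertextFFI descriptors rather than raw device pointers, so the block count, degrees and noise levels travel with the ciphertexts, and the bitop type is read from the scratch buffer; the explicit lwe_ciphertext_count and op arguments become redundant. A hedged sketch of the new call shape, where every handle (streams, keys, scratch buffer, ciphertext descriptors, device pointer to the clear blocks) is assumed to come from the usual setup code and is not part of this diff:

// Hypothetical caller: ct_out/ct_in are CudaRadixCiphertextFFI descriptors
// (device pointer, block count, degrees, noise levels); mem_ptr was filled by
// the scratch function and already records which scalar bitop to perform.
cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    streams, gpu_indexes, gpu_count, ct_out, ct_in, d_clear_blocks,
    num_clear_blocks, mem_ptr, bsks, ksks);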
20 changes: 11 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -20,15 +20,10 @@ __host__ void host_integer_radix_bitop_kb(
void *const *bsks, Torus *const *ksks) {

auto lut = mem_ptr->lut;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, lut, lut->params.message_modulus);

uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
lwe_array_out->degrees[i] =
std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
degrees[i] = std::min(lwe_array_1->degrees[i], lwe_array_2->degrees[i]);
}
} else if (mem_ptr->op == BITOP_TYPE::BITOR) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -41,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
result = max | j;
}
}
lwe_array_out->degrees[i] = result;
degrees[i] = result;
}
} else if (mem_ptr->op == BITXOR) {
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -55,9 +50,16 @@ __host__ void host_integer_radix_bitop_kb(
result = max ^ j;
}
}
lwe_array_out->degrees[i] = result;
degrees[i] = result;
}
}

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, lut, lut->params.message_modulus);

memcpy(lwe_array_out->degrees, degrees,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));
}

template <typename Torus>
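For context, the loops above derive the output degree of an encrypted bitop from the operand degrees: AND takes the minimum, while OR and XOR keep the larger degree fixed and try every value up to the smaller one to find the worst case (for degrees 6 and 2, for instance, the OR bound is 6 | 1 = 7, not 6 | 2 = 6). A standalone sketch of that search, using hypothetical helper names that do not appear in the diff:

#include <algorithm>
#include <cstdint>

// Worst-case degree of (a | b) when a <= deg_a and b <= deg_b, mirroring the
// BITOR branch of host_integer_radix_bitop_kb: fix the larger degree and scan
// every value of the smaller operand.
uint64_t bitor_degree_bound(uint64_t deg_a, uint64_t deg_b) {
  const uint64_t max = std::max(deg_a, deg_b);
  const uint64_t min = std::min(deg_a, deg_b);
  uint64_t result = max;
  for (uint64_t j = 0; j <= min; j++)
    result = std::max(result, max | j);
  return result;
}

// Same exhaustive search for XOR, mirroring the BITXOR branch.
uint64_t bitxor_degree_bound(uint64_t deg_a, uint64_t deg_b) {
  const uint64_t max = std::max(deg_a, deg_b);
  const uint64_t min = std::min(deg_a, deg_b);
  uint64_t result = max;
  for (uint64_t j = 0; j <= min; j++)
    result = std::max(result, max ^ j);
  return result;
}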
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -122,7 +122,7 @@ __host__ void host_integer_radix_cmux_kb(

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
mem_ptr->message_extract_lut);
mem_ptr->message_extract_lut, num_radix_blocks);
delete mem_true;
delete mem_false;
}
15 changes: 11 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -353,12 +353,15 @@ __host__ void pack_bivariate_blocks_with_single_block(
check_cuda_error(cudaGetLastError());
}

/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks.
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut) {
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -378,11 +381,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
"lwe dimension")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal"
" to the number of input and output radix blocks")

// In the case of extracting a single LWE these parameters are dummy
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
uint32_t num_radix_blocks = lwe_array_in->num_radix_blocks;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -452,7 +459,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
lut->num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
for (uint i = 0; i < num_radix_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
@@ -1888,7 +1895,7 @@ void host_apply_univariate_lut_kb(cudaStream_t const *streams,

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
mem);
mem, radix_lwe_out->num_radix_blocks);
}

template <typename Torus>
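The num_radix_blocks parameter added above decouples how many blocks the LUT is applied to from the sizes of the input and output ciphertexts, which the scalar bitops exploit when the clear operand decomposes into fewer blocks than the radix ciphertext. A minimal hedged sketch of the call shape; the wrapper name is hypothetical and the streams, keys and LUT are assumed to come from the usual setup code:

// Apply `lut` to only the first `num_active_blocks` blocks of `in`/`out`;
// both ciphertexts may hold more blocks. The callee panics if
// num_active_blocks exceeds either block count and only updates the degrees
// and noise levels of the blocks it actually processed.
template <typename Torus>
void apply_lut_on_prefix(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count,
                         CudaRadixCiphertextFFI *out,
                         CudaRadixCiphertextFFI const *in, void *const *bsks,
                         Torus *const *ksks, int_radix_lut<Torus> *lut,
                         uint32_t num_active_blocks) {
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, out, in, bsks, ksks, lut,
      num_active_blocks);
}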
29 changes: 27 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -36,8 +36,8 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
if (input_radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
PANIC("Cuda error: input radix should have more blocks than the specified "
"range")
if (start_lwe_index >= end_lwe_index)
PANIC("Cuda error: slice range should be strictly positive")
if (start_lwe_index > end_lwe_index)
PANIC("Cuda error: slice range should be non negative")

auto lwe_size = input_radix->lwe_dimension + 1;
output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1;
@@ -80,4 +80,29 @@ void copy_radix_ciphertext_to_larger_output_slice_async(
}
}

// end_lwe_index is inclusive
template <typename Torus>
void set_zero_radix_ciphertext_async(cudaStream_t const stream,
uint32_t const gpu_index,
CudaRadixCiphertextFFI *radix,
const uint32_t start_lwe_index,
const uint32_t end_lwe_index) {
if (radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1)
PANIC("Cuda error: input radix should have more blocks than the specified "
"range")
if (start_lwe_index > end_lwe_index)
PANIC("Cuda error: slice range should be non negative")

auto lwe_size = radix->lwe_dimension + 1;
auto num_blocks_to_set = end_lwe_index - start_lwe_index + 1;
auto lwe_array_out_block = (Torus *)radix->ptr + start_lwe_index * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
num_blocks_to_set * lwe_size * sizeof(Torus), stream,
gpu_index);
memset(&radix->degrees[start_lwe_index], 0,
num_blocks_to_set * sizeof(uint64_t));
memset(&radix->noise_levels[start_lwe_index], 0,
num_blocks_to_set * sizeof(uint64_t));
}

#endif
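The set_zero_radix_ciphertext_async helper introduced above zeroes a contiguous, inclusive range of blocks and resets the tracked degrees and noise levels in the same call, so callers no longer memset the device buffer and the metadata separately. A hedged usage sketch with hypothetical variable names:

// Zero the two most significant blocks of a 4-block radix ciphertext `ct`
// (both block indices are inclusive). `stream` and `gpu_index` are assumed
// to exist in the surrounding code.
set_zero_radix_ciphertext_async<uint64_t>(stream, gpu_index, ct,
                                          /*start_lwe_index=*/2,
                                          /*end_lwe_index=*/3);
// Afterwards ct->degrees[2..3] and ct->noise_levels[2..3] are 0 and the
// corresponding LWE blocks hold trivial encryptions of zero.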
15 changes: 7 additions & 8 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -2,15 +2,14 @@

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
void *const *ksks) {

host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_input),
static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count, op);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
num_clear_blocks, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
81 changes: 59 additions & 22 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -7,45 +7,82 @@
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
Torus const *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks, BITOP_TYPE op) {
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {

if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
if (output->lwe_dimension != input->lwe_dimension)
PANIC("Cuda error: input and output num radix blocks must be equal")
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;

uint32_t lwe_size = big_lwe_dimension + 1;
auto op = mem_ptr->op;
auto num_radix_blocks = output->num_radix_blocks;

if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
0, num_radix_blocks - 1);
} else {
cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
num_radix_blocks * lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
if (input != output)
copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
streams[0], gpu_indexes[0], output, input, 0);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
// as an index to recover the right one
uint64_t degrees[num_clear_blocks];
uint64_t clear_degrees[num_clear_blocks];
cuda_memcpy_async_to_cpu(&clear_degrees, clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
if (mem_ptr->op == BITOP_TYPE::SCALAR_BITAND) {
for (uint i = 0; i < num_clear_blocks; i++) {
degrees[i] = std::min(clear_degrees[i], input->degrees[i]);
}
} else if (mem_ptr->op == BITOP_TYPE::SCALAR_BITOR) {
for (uint i = 0; i < num_clear_blocks; i++) {
auto max = std::max(clear_degrees[i], input->degrees[i]);
auto min = std::min(clear_degrees[i], input->degrees[i]);
auto result = max;

for (uint j = 0; j < min + 1; j++) {
if ((max | j) > result) {
result = max | j;
}
}
degrees[i] = result;
}
} else if (mem_ptr->op == SCALAR_BITXOR) {
for (uint i = 0; i < num_clear_blocks; i++) {
auto max = std::max(clear_degrees[i], input->degrees[i]);
auto min = std::min(clear_degrees[i], input->degrees[i]);
auto result = max;

// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
degrees[i] = result;
}
}
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes, 0);

legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
ksks, num_clear_blocks, lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));

if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
set_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
num_clear_blocks,
num_radix_blocks - 1);
}
}
}
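In the scalar path above, the decomposed clear blocks serve two purposes: copied into lut->get_lut_indexes they select, per block, which pre-computed LUT the PBS evaluates, and copied back to the CPU they bound the output degrees just as in the encrypted bitop. A rough CPU-side model of the index-selection idea, with hypothetical names and no GPU transfers:

#include <cstdint>
#include <vector>

// Conceptual model only: one pre-computed table per possible clear block
// value, where lut_for[c][x] == x OP c for the chosen scalar bit-operation.
uint64_t apply_scalar_bitop_block(
    const std::vector<std::vector<uint64_t>> &lut_for,
    uint64_t encrypted_block_value, uint64_t clear_block) {
  // On the GPU the clear block is written into the LUT index buffer and plays
  // exactly this role: it picks which pre-computed LUT the PBS applies.
  return lut_for[clear_block][encrypted_block_value];
}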
6 changes: 2 additions & 4 deletions backends/tfhe-cuda-backend/src/bindings.rs
@@ -639,15 +639,13 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_input: *const ffi::c_void,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_input: *const CudaRadixCiphertextFFI,
clear_blocks: *const ffi::c_void,
num_clear_blocks: u32,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
lwe_ciphertext_count: u32,
op: BITOP_TYPE,
);
}
unsafe extern "C" {
83 changes: 0 additions & 83 deletions tfhe/src/integer/gpu/ciphertext/info.rs
@@ -311,89 +311,6 @@ impl CudaRadixCiphertextInfo {
.collect(),
}
}
pub(crate) fn after_scalar_bitand<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitand(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

pub(crate) fn after_scalar_bitor<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitor(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

pub(crate) fn after_scalar_bitxor<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,
{
let message_modulus = self.blocks.first().unwrap().message_modulus;
let bits_in_message = message_modulus.0.ilog2();
let decomposer = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message)
.iter_as::<u8>()
.chain(std::iter::repeat(0u8));

Self {
blocks: self
.blocks
.iter()
.zip(decomposer)
.map(|(left, scalar_block)| CudaBlockInfo {
degree: left
.degree
.after_bitxor(Degree::new(u64::from(scalar_block))),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

// eq/ne, and comparisons return a ciphertext that encrypts a 0 or 1, so the first block
// (least significant) has a degree of 1, the other blocks should be trivial lwe encrypting 0,