Skip to content

Commit

Permalink
chore(gpu): refactor scalar rotate to track noise/degree
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Feb 11, 2025
1 parent 1243aea commit 6544124
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 102 deletions.
4 changes: 2 additions & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down
51 changes: 25 additions & 26 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,44 +94,42 @@ __host__ void array_rotate_left(Torus *array_out, Torus *array_in,
// calculation is not inplace, so `dst` and `src` must not be the same
// one block is responsible to process single lwe ciphertext
template <typename Torus>
__host__ void host_radix_blocks_rotate_right(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src,
uint32_t rotations) {
__host__ void host_radix_blocks_rotate_right(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_right): the source and destination "
"pointers should be different");
}
if (dst->lwe_dimension != src->lwe_dimension)
PANIC("Cuda error: input and output should have the same "
"lwe dimension")
if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
PANIC("Cuda error: input and output should have more blocks than asked for "
"in the "
"function call")

auto lwe_size = src->lwe_dimension + 1;

cuda_set_device(gpu_indexes[0]);
radix_blocks_rotate_right<Torus>
<<<src->num_radix_blocks, 1024, 0, streams[0]>>>(
(Torus *)dst->ptr, (Torus *)src->ptr, rotations,
dst->num_radix_blocks, lwe_size);
radix_blocks_rotate_right<Torus><<<num_blocks, 1024, 0, streams[0]>>>(
(Torus *)dst->ptr, (Torus *)src->ptr, rotations, num_blocks, lwe_size);
check_cuda_error(cudaGetLastError());

// Rotate degrees and noise to follow blocks
array_rotate_right(dst->degrees, src->degrees, rotations,
dst->num_radix_blocks);
array_rotate_right(dst->degrees, src->degrees, rotations, num_blocks);
array_rotate_right(dst->noise_levels, src->noise_levels, rotations,
dst->num_radix_blocks);
num_blocks);
}

// rotate radix ciphertext left with specific value
// calculation is not inplace, so `dst` and `src` must not be the same
template <typename Torus>
__host__ void
host_radix_blocks_rotate_left(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src, uint32_t value) {
__host__ void host_radix_blocks_rotate_left(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src, uint32_t value, uint32_t num_blocks) {
if (src == dst) {
PANIC("Cuda error (blocks_rotate_left): the source and destination "
"pointers should be different");
Expand All @@ -140,20 +138,21 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams,
if (dst->lwe_dimension != src->lwe_dimension)
PANIC("Cuda error: input and output should have the same "
"lwe dimension")
if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
PANIC("Cuda error: input and output should have more blocks than asked for "
"in the "
"function call")

auto lwe_size = src->lwe_dimension + 1;

cuda_set_device(gpu_indexes[0]);
radix_blocks_rotate_left<Torus>
<<<src->num_radix_blocks, 1024, 0, streams[0]>>>(
(Torus *)dst->ptr, (Torus *)src->ptr, value, dst->num_radix_blocks,
lwe_size);
radix_blocks_rotate_left<Torus><<<num_blocks, 1024, 0, streams[0]>>>(
(Torus *)dst->ptr, (Torus *)src->ptr, value, num_blocks, lwe_size);
check_cuda_error(cudaGetLastError());

// Rotate degrees and noise to follow blocks
array_rotate_left(dst->degrees, src->degrees, value, dst->num_radix_blocks);
array_rotate_left(dst->noise_levels, src->noise_levels, value,
dst->num_radix_blocks);
array_rotate_left(dst->degrees, src->degrees, value, num_blocks);
array_rotate_left(dst->noise_levels, src->noise_levels, value, num_blocks);
}

// rotate radix ciphertext right with specific value
Expand Down Expand Up @@ -1836,7 +1835,7 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,

host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
step_output, generates_or_propagates, 1,
num_blocks, big_lwe_size);
num_blocks);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);

Expand Down
9 changes: 4 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,13 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {

host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), n,
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
(uint64_t **)(ksks));
}

void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
Expand Down
56 changes: 26 additions & 30 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,14 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array, uint32_t n,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
Torus *const *ksks) {

auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;

size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

size_t num_bits_in_message = (size_t)log2_int(message_modulus);
size_t total_num_bits = num_bits_in_message * num_blocks;
n = n % total_num_bits;
Expand All @@ -48,7 +44,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
size_t rotations = n / num_bits_in_message;
size_t shift_within_block = n % num_bits_in_message;

Torus *rotated_buffer = (Torus *)mem->tmp_rotated->ptr;
auto rotated_buffer = mem->tmp_rotated;

// rotate right all the blocks in radix ciphertext
// copy result in new buffer
Expand All @@ -57,56 +53,56 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
legacy_host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);

if (shift_within_block == 0) {
return;
}

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
legacy_host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);

} else {
// rotate left as the blocks are from LSB to MSB
legacy_host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
lwe_array, 0, num_blocks,
rotated_buffer, 0, num_blocks);

if (shift_within_block == 0) {
return;
}

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
legacy_host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
}
}
Expand Down
8 changes: 5 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, &rotated_buffer, lwe_array, rotations);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations,
num_blocks);

// create trivial assign for value = 0
if (rotations > 0)
Expand Down Expand Up @@ -177,7 +178,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
} else {
// right shift
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
&rotated_buffer, lwe_array, rotations);
&rotated_buffer, lwe_array, rotations,
num_blocks);

// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations);
rotations, total_nb_bits);

if (rotations > 0)
set_zero_radix_ciphertext_slice_async<Torus>(
Expand All @@ -120,7 +120,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
PANIC("Cuda error: incorrect number of blocks")
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations);
rotations, total_nb_bits);

if (mem->is_signed)
for (int i = 0; i < rotations; i++) {
Expand All @@ -140,13 +140,13 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations);
rotations, total_nb_bits);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b,
rotations);
rotations, total_nb_bits);
break;
default:
PANIC("Unknown operation")
Expand Down
3 changes: 1 addition & 2 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -726,12 +726,11 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array: *mut ffi::c_void,
lwe_array: *mut CudaRadixCiphertextFFI,
n: u32,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks: u32,
);
}
unsafe extern "C" {
Expand Down
15 changes: 0 additions & 15 deletions tfhe/src/integer/gpu/ciphertext/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,21 +165,6 @@ impl CudaRadixCiphertextInfo {
.collect(),
}
}
pub(crate) fn after_scalar_rotate(&self) -> Self {
Self {
blocks: self
.blocks
.iter()
.map(|left| CudaBlockInfo {
degree: Degree::new(left.message_modulus.0 - 1),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: NoiseLevel::NOMINAL,
})
.collect(),
}
}
pub(crate) fn after_min_max(&self) -> Self {
Self {
blocks: self
Expand Down
Loading

0 comments on commit 6544124

Please sign in to comment.