Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(gpu): refactor negate to keep track of degree and noise changes #2022

Merged
merged 2 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ void cleanup_cuda_integer_mult(void *const *streams,

void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus);

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
Expand Down
12 changes: 6 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}

host_integer_radix_negation(
legacy_host_integer_radix_negation(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
Expand All @@ -679,11 +679,11 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
bsks, ksks, num_blocks, requested_flag, uses_carry);

host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
gpu_count, int_mem_ptr->negated_remainder,
remainder, radix_params.big_lwe_dimension,
num_blocks, radix_params.message_modulus,
radix_params.carry_modulus);
legacy_host_integer_radix_negation(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, remainder,
radix_params.big_lwe_dimension, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
pdroalves marked this conversation as resolved.
Show resolved Hide resolved

host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
Expand Down
12 changes: 5 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@

void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus) {

host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
host_integer_radix_negation<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, lwe_array_out, lwe_array_in,
message_modulus, carry_modulus);
}
59 changes: 58 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,64 @@ device_integer_radix_negation(Torus *output, Torus const *input,
}

template <typename Torus>
__host__ void host_integer_radix_negation(
__host__ void
host_integer_radix_negation(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
uint64_t message_modulus, uint64_t carry_modulus) {
cuda_set_device(gpu_indexes[0]);

if (lwe_array_out->num_radix_blocks != lwe_array_in->num_radix_blocks)
PANIC("Cuda error: lwe_array_in and lwe_array_out num radix blocks must be "
"the same")

if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
"the same")

auto num_radix_blocks = lwe_array_out->num_radix_blocks;
auto lwe_dimension = lwe_array_out->lwe_dimension;
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_negation<Torus><<<grid, thds, 0, streams[0]>>>(
static_cast<Torus *>(lwe_array_out->ptr),
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
message_modulus, delta);
check_cuda_error(cudaGetLastError());

uint8_t zb = 0;
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
auto input_degree = lwe_array_in->degrees[i];

if (zb != 0) {
input_degree += static_cast<uint64_t>(zb);
}
Torus z =
std::max(static_cast<Torus>(1),
static_cast<Torus>(ceil(input_degree / message_modulus))) *
message_modulus;

lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
zb = z / message_modulus;
}
}
template <typename Torus>
__host__ void legacy_host_integer_radix_negation(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus const *input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
Expand Down
6 changes: 2 additions & 4 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,10 +385,8 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_dimension: u32,
lwe_ciphertext_count: u32,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
message_modulus: u32,
carry_modulus: u32,
);
Expand Down
34 changes: 0 additions & 34 deletions tfhe/src/core_crypto/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,40 +620,6 @@ pub unsafe fn negate_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
);
}

#[allow(clippy::too_many_arguments)]
/// Assign negation of a vector of LWE ciphertexts representing an integer
///
/// # Safety
///
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
pub unsafe fn negate_integer_radix_async<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_dimension: LweDimension,
num_samples: u32,
message_modulus: u32,
carry_modulus: u32,
) {
cuda_negate_integer_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams
.gpu_indexes
.iter()
.map(|i| i.0)
.collect::<Vec<u32>>()
.as_ptr(),
streams.len() as u32,
lwe_array_out.as_mut_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_dimension.0 as u32,
num_samples,
message_modulus,
carry_modulus,
);
}

/// Multiplication of a vector of LWEs with a vector of cleartexts (assigned)
///
/// # Safety
Expand Down
24 changes: 0 additions & 24 deletions tfhe/src/integer/gpu/ciphertext/info.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::server_key::radix::neg::NegatedDegreeIter;
use crate::integer::server_key::TwosComplementNegation;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
use crate::shortint::{CarryModulus, MessageModulus, PBSOrder};
Expand Down Expand Up @@ -72,29 +71,6 @@ impl CudaRadixCiphertextInfo {
Some(decomposer)
}

pub(crate) fn after_neg(&self) -> Self {
let new_degrees_iter = NegatedDegreeIter::new(
self.blocks
.iter()
.map(|block| (block.degree, block.message_modulus)),
);

Self {
blocks: self
.blocks
.iter()
.zip(new_degrees_iter)
.map(|(left, d)| CudaBlockInfo {
degree: d,
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: left.noise_level,
})
.collect(),
}
}

pub(crate) fn after_mul(&self) -> Self {
Self {
blocks: self
Expand Down
76 changes: 76 additions & 0 deletions tfhe/src/integer/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3638,3 +3638,79 @@ pub unsafe fn unchecked_are_all_comparisons_block_true_integer_radix_kb_async<
std::ptr::addr_of_mut!(mem_ptr),
);
}

#[allow(clippy::too_many_arguments)]
/// Assign negation of a vector of LWE ciphertexts representing an integer
///
/// # Safety
///
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
pub unsafe fn unchecked_negate_integer_radix_async(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
radix_lwe_in: &CudaRadixCiphertext,
message_modulus: u32,
carry_modulus: u32,
) {
let mut radix_lwe_out_degrees = radix_lwe_out
.info
.blocks
.iter()
.map(|b| b.degree.0)
.collect();
let mut radix_lwe_out_noise_levels = radix_lwe_out
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_radix_lwe_out = prepare_cuda_radix_ffi(
radix_lwe_out,
&mut radix_lwe_out_degrees,
&mut radix_lwe_out_noise_levels,
);
let mut radix_lwe_in_degrees = radix_lwe_in
.info
.blocks
.iter()
.map(|b| b.degree.0)
.collect();
let mut radix_lwe_in_noise_levels = radix_lwe_in
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let cuda_ffi_radix_lwe_in = prepare_cuda_radix_ffi(
radix_lwe_in,
&mut radix_lwe_in_degrees,
&mut radix_lwe_in_noise_levels,
);

cuda_negate_integer_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams
.gpu_indexes
.iter()
.map(|i| i.0)
.collect::<Vec<u32>>()
.as_ptr(),
streams.len() as u32,
&mut cuda_ffi_radix_lwe_out,
&cuda_ffi_radix_lwe_in,
message_modulus,
carry_modulus,
);

radix_lwe_out
.info
.blocks
.iter_mut()
.zip(radix_lwe_out_degrees.iter())
.zip(radix_lwe_out_noise_levels.iter())
.for_each(|((block, degree), noise)| {
block.degree = Degree(*degree);
block.noise_level = NoiseLevel(*noise);
});
}
14 changes: 5 additions & 9 deletions tfhe/src/integer/gpu/server_key/radix/neg.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams};
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::unchecked_negate_integer_radix_async;
use crate::integer::server_key::radix_parallel::OutputFlag;

impl CudaServerKey {
Expand Down Expand Up @@ -63,22 +64,17 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> T {
let mut ciphertext_out = ctxt.duplicate_async(streams);
let lwe_dimension = ctxt.as_ref().d_blocks.lwe_dimension();
let lwe_ciphertext_count = ctxt.as_ref().d_blocks.lwe_ciphertext_count();

let info = ctxt.as_ref().info.blocks.first().unwrap();

negate_integer_radix_async(
unchecked_negate_integer_radix_async(
streams,
&mut ciphertext_out.as_mut().d_blocks.0.d_vec,
&ctxt.as_ref().d_blocks.0.d_vec,
lwe_dimension,
lwe_ciphertext_count.0 as u32,
ciphertext_out.as_mut(),
ctxt.as_ref(),
info.message_modulus.0 as u32,
info.carry_modulus.0 as u32,
);

ciphertext_out.as_mut().info = ctxt.as_ref().info.after_neg();
ciphertext_out
}

Expand Down
8 changes: 4 additions & 4 deletions tfhe/src/integer/server_key/radix/neg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ use crate::integer::ciphertext::IntegerRadixCiphertext;
use crate::integer::server_key::CheckError;
use crate::integer::ServerKey;
use crate::shortint::ciphertext::{Degree, MaxDegree};
#[cfg(any(test, feature = "gpu"))]
#[cfg(test)]
use crate::shortint::MessageModulus;

/// Iterator that returns the new degree of blocks
/// after negation was done.
///
/// It takes as input an iterator that returns the degree of the blocks
/// before negation as well as their message modulus.
#[cfg(any(test, feature = "gpu"))]
#[cfg(test)]
pub(crate) struct NegatedDegreeIter<I> {
iter: I,
z_b: u64,
}

#[cfg(any(test, feature = "gpu"))]
#[cfg(test)]
impl<I> NegatedDegreeIter<I>
where
I: Iterator<Item = (Degree, MessageModulus)>,
Expand All @@ -26,7 +26,7 @@ where
}
}

#[cfg(any(test, feature = "gpu"))]
#[cfg(test)]
impl<I> Iterator for NegatedDegreeIter<I>
where
I: Iterator<Item = (Degree, MessageModulus)>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ pub(crate) const NB_CTXT: usize = 2;
pub(crate) trait FunctionExecutor<TestInput, TestOutput> {
/// Setups the executor
///
/// Implementors are expected to be fully functional after this
/// Implementers are expected to be fully functional after this
/// function has been called.
fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>);

/// Executes the function
///
/// The function receives some inputs and return some output.
/// Implementors may have to do more than just calling the function
/// Implementers may have to do more than just calling the function
/// that is being tested (for example input/output may need to be converted)
///
/// Look at the test case function to know what are the expected inputs and outputs.
Expand Down
Loading