use crate::backends::cuda::engines::CudaError;
use crate::backends::cuda::private::pointers::StreamPointer;
use crate::backends::cuda::private::vec::CudaVec;
use crate::commons::numeric::{Numeric, UnsignedInteger};
use crate::prelude::{
DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweCiphertextIndex, LweDimension,
PolynomialSize, SharedMemoryAmount,
};
use concrete_cuda::cuda_bind::*;
use std::ffi::c_void;
use std::marker::PhantomData;
/// Newtype wrapping the index that designates one GPU.
///
/// The wrapped value is forwarded (cast to `u32`) to the CUDA bindings as the device index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GpuIndex(pub usize);
/// Newtype wrapping a number of samples.
///
/// The wrapped value is forwarded (cast to `u32`) to the batched CUDA kernels as their sample
/// count.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NumberOfSamples(pub usize);
/// Newtype wrapping a number of GPUs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NumberOfGpus(pub usize);
/// A CUDA stream handle together with the index of the GPU it was created on.
///
/// The handle is destroyed with `cuda_destroy_stream` when the value is dropped.
///
/// NOTE(review): this type derives `Clone`, and `StreamPointer` is copied verbatim by the
/// derived implementation, so a clone shares the same raw handle as its source and each value
/// will call `cuda_destroy_stream` on it when dropped. Confirm that clones are never created,
/// or that destroying the same handle more than once is harmless.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CudaStream {
    /// Index of the GPU the stream belongs to.
    gpu_index: GpuIndex,
    /// Raw stream handle returned by `cuda_create_stream`.
    stream: StreamPointer,
}
impl CudaStream {
pub(crate) fn new(gpu_index: GpuIndex) -> Result<Self, CudaError> {
if gpu_index.0 >= unsafe { cuda_get_number_of_gpus() } as usize {
Err(CudaError::InvalidDeviceIndex(gpu_index))
} else {
let stream = StreamPointer(unsafe { cuda_create_stream(gpu_index.0 as u32) });
Ok(CudaStream { gpu_index, stream })
}
}
pub(crate) fn gpu_index(&self) -> GpuIndex {
self.gpu_index
}
pub(crate) fn stream_handle(&self) -> StreamPointer {
self.stream
}
pub(crate) fn check_device_memory(&self, size: u64) -> Result<(), CudaError> {
let valid = unsafe { cuda_check_valid_malloc(size, self.gpu_index().0 as u32) };
match valid {
0 => Ok(()),
-1 => Err(CudaError::NotEnoughDeviceMemory(self.gpu_index())),
-2 => Err(CudaError::InvalidDeviceIndex(self.gpu_index())),
_ => Err(CudaError::UnspecifiedDeviceError(self.gpu_index())),
}
}
pub(crate) fn malloc<T>(&self, elements: u32) -> CudaVec<T>
where
T: Numeric,
{
let size = elements as u64 * std::mem::size_of::<T>() as u64;
let ptr = unsafe { cuda_malloc(size, self.gpu_index().0 as u32) };
CudaVec {
ptr,
idx: self.gpu_index.0 as u32,
len: elements as usize,
_phantom: PhantomData::default(),
}
}
pub(crate) unsafe fn copy_to_gpu_async<T>(&self, dest: &mut CudaVec<T>, src: &[T])
where
T: Numeric,
{
let size = (src.len() * std::mem::size_of::<T>()) as u64;
cuda_memcpy_async_to_gpu(
dest.as_mut_c_ptr(),
src.as_ptr() as *const c_void,
size,
self.stream_handle().0,
self.gpu_index().0 as u32,
);
}
pub(crate) unsafe fn copy_to_gpu<T>(&self, dest: &mut CudaVec<T>, src: &[T])
where
T: Numeric,
{
self.copy_to_gpu_async(dest, src);
self.synchronize_device();
}
pub(crate) unsafe fn copy_to_cpu_async<T>(&self, dest: &mut [T], src: &CudaVec<T>)
where
T: Numeric,
{
let size = (dest.len() * std::mem::size_of::<T>()) as u64;
cuda_memcpy_async_to_cpu(
dest.as_mut_ptr() as *mut c_void,
src.as_c_ptr(),
size,
self.stream_handle().0,
self.gpu_index().0 as u32,
);
}
pub(crate) unsafe fn copy_to_cpu<T>(&self, dest: &mut [T], src: &CudaVec<T>)
where
T: Numeric,
{
self.copy_to_cpu_async(dest, src);
self.synchronize_device();
}
#[allow(dead_code)]
pub(crate) fn synchronize_device(&self) {
unsafe { cuda_synchronize_device(self.gpu_index().0 as u32) };
}
pub(crate) fn get_max_shared_memory(&self) -> Result<i32, CudaError> {
let max_shared_memory = unsafe { cuda_get_max_shared_memory(self.gpu_index().0 as u32) };
match max_shared_memory {
0 => Err(CudaError::SharedMemoryNotFound(self.gpu_index())),
-2 => Err(CudaError::InvalidDeviceIndex(self.gpu_index())),
_ => Ok(max_shared_memory),
}
}
#[allow(dead_code)]
pub fn initialize_twiddles(&self, polynomial_size: PolynomialSize) {
unsafe { cuda_initialize_twiddles(polynomial_size.0 as u32, self.gpu_index.0 as u32) };
}
#[allow(dead_code)]
pub unsafe fn convert_lwe_bootstrap_key<T: UnsignedInteger>(
&self,
dest: &mut CudaVec<f64>,
src: &[T],
input_lwe_dim: LweDimension,
glwe_dim: GlweDimension,
l_gadget: DecompositionLevelCount,
polynomial_size: PolynomialSize,
) {
if T::BITS == 32 {
cuda_convert_lwe_bootstrap_key_32(
dest.as_mut_c_ptr(),
src.as_ptr() as *mut c_void,
self.stream.0,
self.gpu_index.0 as u32,
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
)
} else if T::BITS == 64 {
cuda_convert_lwe_bootstrap_key_64(
dest.as_mut_c_ptr(),
src.as_ptr() as *mut c_void,
self.stream.0,
self.gpu_index.0 as u32,
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
)
}
}
#[allow(dead_code, clippy::too_many_arguments)]
pub unsafe fn discard_bootstrap_amortized_lwe_ciphertext_vector<T: UnsignedInteger>(
&self,
lwe_array_out: &mut CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector_indexes: &CudaVec<u32>,
lwe_array_in: &CudaVec<T>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
base_log: DecompositionBaseLog,
level: DecompositionLevelCount,
num_samples: NumberOfSamples,
lwe_idx: LweCiphertextIndex,
max_shared_memory: SharedMemoryAmount,
) {
if T::BITS == 32 {
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
test_vector.as_c_ptr(),
test_vector_indexes.as_c_ptr(),
lwe_array_in.as_c_ptr(),
bootstrapping_key.as_c_ptr(),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples.0 as u32,
num_samples.0 as u32,
lwe_idx.0 as u32,
max_shared_memory.0 as u32,
)
} else if T::BITS == 64 {
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
test_vector.as_c_ptr(),
test_vector_indexes.as_c_ptr(),
lwe_array_in.as_c_ptr(),
bootstrapping_key.as_c_ptr(),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples.0 as u32,
num_samples.0 as u32,
lwe_idx.0 as u32,
max_shared_memory.0 as u32,
)
}
}
#[allow(dead_code, clippy::too_many_arguments)]
pub unsafe fn discard_bootstrap_low_latency_lwe_ciphertext_vector<T: UnsignedInteger>(
&self,
lwe_array_out: &mut CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector_indexes: &CudaVec<u32>,
lwe_array_in: &CudaVec<T>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
base_log: DecompositionBaseLog,
level: DecompositionLevelCount,
num_samples: NumberOfSamples,
lwe_idx: LweCiphertextIndex,
max_shared_memory: SharedMemoryAmount,
) {
if T::BITS == 32 {
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
test_vector.as_c_ptr(),
test_vector_indexes.as_c_ptr(),
lwe_array_in.as_c_ptr(),
bootstrapping_key.as_c_ptr(),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples.0 as u32,
num_samples.0 as u32,
lwe_idx.0 as u32,
max_shared_memory.0 as u32,
)
} else if T::BITS == 64 {
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
test_vector.as_c_ptr(),
test_vector_indexes.as_c_ptr(),
lwe_array_in.as_c_ptr(),
bootstrapping_key.as_c_ptr(),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples.0 as u32,
num_samples.0 as u32,
lwe_idx.0 as u32,
max_shared_memory.0 as u32,
)
}
}
#[allow(dead_code, clippy::too_many_arguments)]
pub unsafe fn discard_keyswitch_lwe_ciphertext_vector<T: UnsignedInteger>(
&self,
lwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
input_lwe_dimension: LweDimension,
output_lwe_dimension: LweDimension,
keyswitch_key: &CudaVec<T>,
base_log: DecompositionBaseLog,
l_gadget: DecompositionLevelCount,
num_samples: NumberOfSamples,
) {
if T::BITS == 32 {
cuda_keyswitch_lwe_ciphertext_vector_32(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
lwe_array_in.as_c_ptr(),
keyswitch_key.as_c_ptr(),
input_lwe_dimension.0 as u32,
output_lwe_dimension.0 as u32,
base_log.0 as u32,
l_gadget.0 as u32,
num_samples.0 as u32,
)
} else if T::BITS == 64 {
cuda_keyswitch_lwe_ciphertext_vector_64(
self.stream.0,
lwe_array_out.as_mut_c_ptr(),
lwe_array_in.as_c_ptr(),
keyswitch_key.as_c_ptr(),
input_lwe_dimension.0 as u32,
output_lwe_dimension.0 as u32,
base_log.0 as u32,
l_gadget.0 as u32,
num_samples.0 as u32,
)
}
}
}
impl Drop for CudaStream {
    /// Destroys the wrapped stream through `cuda_destroy_stream`, passing the handle and the
    /// index of the GPU it belongs to.
    fn drop(&mut self) {
        let handle = self.stream_handle();
        let device = self.gpu_index().0 as u32;
        unsafe { cuda_destroy_stream(handle.0, device) };
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: prints the GPU count and the maximum shared memory reported for GPU 0.
    #[test]
    fn print_gpu_info() {
        println!("Number of GPUs: {}", unsafe { cuda_get_number_of_gpus() });
        let gpu_index = GpuIndex(0);
        let stream = CudaStream::new(gpu_index).unwrap();
        println!(
            "Max shared memory: {}",
            stream.get_max_shared_memory().unwrap()
        )
    }

    /// Round-trips a small vector through GPU 0 and checks it comes back unchanged.
    #[test]
    fn allocate_and_copy() {
        let vec = vec![1_u64, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let gpu_index = GpuIndex(0);
        let stream = CudaStream::new(gpu_index).unwrap();
        // Check the size `malloc` will actually request (element count times element size),
        // not the bare element count.
        let size_in_bytes = (vec.len() * std::mem::size_of::<u64>()) as u64;
        stream.check_device_memory(size_in_bytes).unwrap();
        let mut d_vec: CudaVec<u64> = stream.malloc::<u64>(vec.len() as u32);
        unsafe {
            stream.copy_to_gpu(&mut d_vec, &vec);
        }
        let mut empty = vec![0_u64; vec.len()];
        unsafe {
            stream.copy_to_cpu(&mut empty, &d_vec);
        }
        assert_eq!(vec, empty);
    }
}