pub mod api;
pub type CUdeviceptr = core::ffi::c_ulonglong;
mod cuda_device;
mod kernel_cache;
mod kernel_launch;
mod ops;
use std::{marker::PhantomData, ptr::null_mut};
pub use cuda_device::*;
pub use kernel_cache::*;
pub use kernel_launch::*;
use crate::{flag::AllocFlag, Buffer, CDatatype, CommonPtrs, PtrType, ShallowCopy};
use self::api::cufree;
pub type CUBuffer<'a, T> = Buffer<'a, T, CUDA>;
pub fn chosen_cu_idx() -> usize {
std::env::var("CUSTOS_CU_DEVICE_IDX")
.unwrap_or_else(|_| "0".into())
.parse()
.expect(
"Environment variable 'CUSTOS_CU_DEVICE_IDX' contains an invalid CUDA device index!",
)
}
#[derive(Debug, PartialEq, Eq)]
pub struct CUDAPtr<T> {
pub ptr: u64,
pub len: usize,
pub flag: AllocFlag,
p: PhantomData<T>,
}
impl<T> Default for CUDAPtr<T> {
#[inline]
fn default() -> Self {
Self {
ptr: 0,
len: 0,
flag: AllocFlag::default(),
p: PhantomData,
}
}
}
impl<T> Drop for CUDAPtr<T> {
fn drop(&mut self) {
if !matches!(self.flag, AllocFlag::None | AllocFlag::BorrowedCache) {
return;
}
if self.ptr == 0 {
return;
}
unsafe {
cufree(self.ptr).unwrap();
}
}
}
impl<T> ShallowCopy for CUDAPtr<T> {
#[inline]
unsafe fn shallow(&self) -> Self {
CUDAPtr {
ptr: self.ptr,
len: self.len,
flag: AllocFlag::Wrapper,
p: PhantomData,
}
}
}
impl<T> PtrType for CUDAPtr<T> {
#[inline]
fn size(&self) -> usize {
self.len
}
#[inline]
fn flag(&self) -> AllocFlag {
self.flag
}
}
impl<T> CommonPtrs<T> for CUDAPtr<T> {
#[inline]
fn ptrs(&self) -> (*const T, *mut std::ffi::c_void, u64) {
(null_mut(), null_mut(), self.ptr)
}
#[inline]
fn ptrs_mut(&mut self) -> (*mut T, *mut std::ffi::c_void, u64) {
(null_mut(), null_mut(), self.ptr)
}
}
pub fn cu_clear<T: CDatatype>(device: &CUDA, buf: &mut Buffer<T, CUDA>) -> crate::Result<()> {
let src = format!(
r#"extern "C" __global__ void clear({datatype}* self, int numElements)
{{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < numElements) {{
self[idx] = 0;
}}
}}
"#,
datatype = T::as_c_type_str()
);
launch_kernel1d(buf.len(), device, &src, "clear", &[buf, &buf.len()])?;
Ok(())
}
#[cfg(test)]
mod tests {
use core::ffi::c_void;
use crate::{
cuda::{api::culaunch_kernel, fn_cache},
Buffer, Read, CUDA,
};
#[test]
fn test_cached_kernel_launch() -> crate::Result<()> {
let device = CUDA::new(0)?;
let a = Buffer::from((&device, [1, 2, 3, 4, 5]));
let b = Buffer::from((&device, [4, 1, 7, 6, 9]));
let c = Buffer::<i32, _>::new(&device, a.len());
let src = r#"
extern "C" __global__ void add(int *a, int *b, int *c, int numElements)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < numElements) {
c[idx] = a[idx] + b[idx];
}
}"#;
for _ in 0..1000 {
fn_cache(&device, src, "add")?;
assert_eq!(device.kernel_cache.borrow().kernels.len(), 1);
}
let function = fn_cache(&device, src, "add")?;
culaunch_kernel(
&function,
[a.len() as u32, 1, 1],
[1, 1, 1],
&mut device.stream(),
&mut [
&a.ptrs().2 as *const u64 as *mut c_void,
&b.ptrs().2 as *const u64 as *mut c_void,
&mut c.ptrs().2 as *mut u64 as *mut c_void,
&a.len() as *const usize as *mut c_void,
],
)?;
assert_eq!(&vec![5, 3, 10, 10, 14], &device.read(&c));
Ok(())
}
}