custos 0.7.0

A minimal OpenCL, WGPU, CUDA and host CPU array manipulation engine.
Documentation
//! CUDA API
//!
//! Declares the submodules of the CUDA API layer and re-exports the public
//! items of the private `cuda` and `ffi` modules at this level.

// Only compiled for test builds. NOTE(review): presumably occupancy-related
// tests, judging by the module name — confirm against the module itself.
#[cfg(test)]
mod test_occupancy;

// Private modules. The public items of `cuda` and `ffi` are re-exported
// below; `error` is not re-exported here.
mod cuda;
mod error;
mod ffi;

// Public submodules, reachable under their own names.
pub mod cublas;
pub mod nvrtc;

// Flatten the public items of `cuda` and `ffi` into this module's namespace.
pub use cuda::*;
pub use ffi::*;

#[cfg(test)]
mod tests {
    use crate::cuda::api::CUdeviceptr;

    /// Checks that a small `f32` allocation succeeds on CUDA device 0.
    ///
    /// Initializes the driver, creates a context on device 0 and allocates
    /// ten `f32` elements. Every step panics on failure, which fails the test.
    #[test]
    fn test_cuda_alloc() {
        use crate::cuda::api::{create_context, cuInit, cumalloc, device};

        // Check the init status instead of discarding it, so a failed driver
        // initialization is reported here rather than by a later call.
        unsafe { cuInit(0).to_result().unwrap() };
        let device = device(0).unwrap();
        // Bound to a name so the context stays alive until the end of the test.
        let _ctx = create_context(&device).unwrap();

        let _x = cumalloc::<f32>(10).unwrap();
    }

    /// Same allocation check as `test_cuda_alloc`, but propagating errors with
    /// `?` and additionally printing the number of devices.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by driver initialization, the device
    /// count query, device lookup, context creation or allocation.
    #[test]
    fn test_cuda_alloc2() -> crate::Result<()> {
        use crate::cuda::api::{create_context, cuInit, cumalloc, device, device_count};

        // Propagate a failed driver initialization instead of ignoring it.
        unsafe { cuInit(0).to_result()? };
        println!("count: {}", device_count()?);

        let device = device(0)?;
        // Bound to a name so the context stays alive until the end of the test.
        let _ctx = create_context(&device)?;

        let _x = cumalloc::<f32>(10)?;

        Ok(())
    }

    /// Round-trips ten `f32` values through device memory.
    ///
    /// Allocates room for 100 `f32` elements, writes ten values of `4.0`
    /// through the returned pointer, reads ten values back and checks that
    /// they match what was written.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by any of the CUDA calls.
    #[test]
    fn test_cuda_write() -> crate::Result<()> {
        use crate::cuda::api::{create_context, cuInit, cu_read, cu_write, cumalloc, device};

        // Propagate a failed driver initialization instead of ignoring it.
        unsafe { cuInit(0).to_result()? };

        let device = device(0)?;
        // Bound to a name so the context stays alive until the end of the test.
        let _ctx = create_context(&device)?;

        let x = cumalloc::<f32>(100)?;

        let write = [4f32; 10];
        cu_write(x, &write)?;

        let mut read = vec![0f32; write.len()];
        cu_read(&mut read, x)?;

        // Compare against the data that was actually written.
        assert_eq!(write.as_slice(), read.as_slice());

        Ok(())
    }

    /// Multiplies two small matrices with `cublasSgemm_v2` and checks the product.
    ///
    /// `a` holds `0, 1, .., m*k - 1` and is read as a row-major `m x k` matrix;
    /// `b` holds `k*n - 1, .., 0` and is read as a row-major `k x n` matrix.
    /// cuBLAS works on column-major data, so the operands are passed in swapped
    /// order (`b` first, then `a`) with the dimensions `n, m, k`; the buffer `c`
    /// then contains the row-major `m x n` product `a * b`.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by the CUDA driver calls. cuBLAS
    /// failures are reported through assertions.
    #[test]
    fn test_cublas() -> crate::Result<()> {
        use crate::cuda::api::{
            create_context, cuInit, cu_read, cu_write,
            cublas::{cublasContext, cublasCreate_v2, cublasOperation_t, cublasSgemm_v2},
            cumalloc, device,
        };
        use std::ptr::null_mut;

        let m = 3;
        let k = 2;
        let n = 3;

        // Propagate a failed driver initialization instead of ignoring it.
        unsafe { cuInit(0).to_result()? };

        let device = device(0)?;
        // Bound to a name so the context stays alive until the end of the test.
        let _ctx = create_context(&device)?;

        let a = cumalloc::<f32>(m * k)?;

        let write = (0..m * k).map(|x| x as f32).collect::<Vec<f32>>();
        cu_write(a, &write)?;

        let b = cumalloc::<f32>(k * n)?;

        let write = (0..k * n).rev().map(|x| x as f32).collect::<Vec<f32>>();
        cu_write(b, &write)?;

        let c = cumalloc::<f32>(m * n)?;

        // SAFETY: `a`, `b` and `c` were allocated above with `m * k`, `k * n`
        // and `m * n` `f32` elements, matching the dimensions and leading
        // dimensions passed to the GEMM call; `handle` is only used after
        // `cublasCreate_v2` has been asserted to succeed.
        // NOTE(review): the cuBLAS handle is never destroyed; no destroy
        // function is imported in this test.
        unsafe {
            let mut handle: *mut cublasContext = null_mut();
            let res = cublasCreate_v2(&mut handle);
            // A failed handle creation must fail the test, not just print.
            assert_eq!(res as u32, 0, "cublasCreate_v2 failed");

            let res = cublasSgemm_v2(
                handle,
                cublasOperation_t::CUBLAS_OP_N,
                cublasOperation_t::CUBLAS_OP_N,
                n as i32,
                m as i32,
                k as i32,
                &1f32 as *const f32,
                b as *const u64 as *const f32,
                n as i32,
                a as *const u64 as *const f32,
                k as i32,
                &0f32 as *const f32,
                c as *mut u64 as *mut f32,
                n as i32,
            );
            assert_eq!(res as u32, 0, "cublasSgemm_v2 failed");
        }

        let mut read = vec![0f32; n * m];
        cu_read(&mut read, c)?;
        println!("read: {read:?}");

        // [[0, 1], [2, 3], [4, 5]] * [[5, 4, 3], [2, 1, 0]], row-major.
        // All values are small integers, so the comparison is exact in f32.
        assert_eq!(read, [2., 1., 0., 16., 11., 6., 30., 21., 12.]);

        Ok(())
    }

    /// Number of `f32` elements copied to the device and back in `test_ffi_cuda`.
    const N: usize = 100;

    /// Exercises the raw driver FFI: init, device lookup, context creation,
    /// allocation and a host -> device -> host copy of `N` `f32` values.
    ///
    /// Every driver status is converted with `to_result()` and unwrapped, so
    /// any failing call fails the test. The data read back must equal the
    /// data that was uploaded.
    #[test]
    fn test_ffi_cuda() {
        use std::{ffi::c_void, mem::size_of};

        use crate::cuda::api::{
            cuCtxCreate_v2, cuDeviceGet, cuInit, cuMemAlloc_v2, cuMemcpyDtoH_v2, cuMemcpyHtoD_v2,
            CUctx_st,
        };

        // Host-side setup needs no `unsafe`.
        let a: Vec<f32> = (0..N).map(|x| x as f32).collect();
        let mut out = [0f32; N];
        let bytes = N * size_of::<f32>();

        // SAFETY: all out-pointers refer to live locals, and both copies
        // transfer exactly `bytes` bytes between host buffers of `N` `f32`
        // elements and a device allocation of the same size.
        unsafe {
            let mut device = 0;
            let mut context: *mut CUctx_st = std::ptr::null_mut();
            let mut a_d: CUdeviceptr = 0;

            cuInit(0).to_result().unwrap();
            cuDeviceGet(&mut device, 0).to_result().unwrap();
            cuCtxCreate_v2(&mut context, 0, device).to_result().unwrap();

            // The allocation status was previously ignored; a failed
            // allocation would otherwise only surface in the copy below.
            cuMemAlloc_v2(&mut a_d, bytes).to_result().unwrap();

            cuMemcpyHtoD_v2(a_d, a.as_ptr() as *const c_void, bytes)
                .to_result()
                .unwrap();
            cuMemcpyDtoH_v2(out.as_mut_ptr() as *mut c_void, a_d, bytes)
                .to_result()
                .unwrap();
        };

        println!("out: {out:?}");
        // The round trip must return exactly what was uploaded.
        assert_eq!(a.as_slice(), out.as_slice());
    }

    /// Constructs the `CUDA` device with index 0 and creates a `Buffer<f32, _>`
    /// of length 10 on it.
    ///
    /// # Errors
    ///
    /// Returns the error from `CUDA::new` if the device cannot be created.
    #[test]
    fn test_cuda_device() -> crate::Result<()> {
        use crate::{cuda::CUDA, Buffer};

        let cuda = CUDA::new(0)?;
        // Only construction is exercised; the buffer is never read.
        let _buf = Buffer::<f32, _>::new(&cuda, 10);

        Ok(())
    }
}