use baracuda_driver::{Context, Device, DeviceBuffer, Module, Stream};
/// Text of `kernels/vector_add.ptx`, embedded at compile time; `include_str!`
/// resolves the path relative to this source file. It is the module source
/// that `vector_add_kernel_launch` passes to `Module::load_ptx`.
const VECTOR_ADD_PTX: &str = include_str!("kernels/vector_add.ptx");
/// Builds the fixture shared by the GPU tests.
///
/// Initialises the driver, fetches device 0, creates a context for that
/// device and a stream for that context, in that order.
///
/// # Errors
///
/// Returns the first error produced by any of those four steps.
fn setup() -> baracuda_driver::Result<(Device, Context, Stream)> {
    baracuda_driver::init()?;

    let gpu = Device::get(0)?;
    let context = Context::new(&gpu)?;
    let queue = Stream::new(&context)?;

    Ok((gpu, context, queue))
}
/// Smoke-tests device discovery through the driver wrapper.
///
/// Asserts that initialisation succeeds, that at least one device is
/// reported, and that device 0 has a non-empty name. The remaining queries
/// (compute capability, multiprocessor count, warp size, total memory) are
/// only required not to fail; their values are printed to stderr.
#[test]
#[ignore = "requires an NVIDIA GPU; run with `cargo test -- --ignored`"]
fn device_enumeration() {
    // Divisor applied to the total-memory figure before it is printed with a
    // "GiB" label.
    const GIB: f64 = 1024.0 * 1024.0 * 1024.0;

    baracuda_driver::init().expect("cuInit should succeed on a GPU host");

    let available = Device::count().expect("cuDeviceGetCount");
    assert!(available >= 1, "no CUDA devices detected");

    let gpu = Device::get(0).expect("cuDeviceGet(0)");

    let name = gpu.name().expect("cuDeviceGetName");
    assert!(!name.is_empty());
    eprintln!("device 0: {name}");

    let (major, minor) = gpu.compute_capability().unwrap();
    eprintln!("compute capability: {major}.{minor}");

    let sm_count = gpu.multiprocessor_count().unwrap();
    eprintln!("multiprocessors: {}", sm_count);

    let warp = gpu.warp_size().unwrap();
    eprintln!("warp size: {}", warp);

    let total = gpu.total_memory().unwrap();
    eprintln!("total memory: {:.2} GiB", total as f64 / GIB);
}
/// Uploads 4096 `f32` values into a device buffer, copies them back to the
/// host, and asserts the round-tripped data equals the original.
#[test]
#[ignore = "requires an NVIDIA GPU; run with `cargo test -- --ignored`"]
fn memcpy_roundtrip() {
    let (_device, ctx, stream) = setup().expect("setup");

    // Source pattern: 0.0, 0.25, 0.5, ...
    let source = (0..4096)
        .map(|k| k as f32 * 0.25)
        .collect::<Vec<f32>>();

    let on_gpu = DeviceBuffer::from_slice(&ctx, &source).expect("alloc + H2D");

    let mut readback = vec![0.0f32; source.len()];
    on_gpu.copy_to_host(&mut readback).expect("D2H");
    stream.synchronize().expect("stream sync");

    assert_eq!(source, readback);
}
/// Loads the embedded PTX module, launches its `vector_add` function over
/// 2^20 elements, and checks the device result against a sum computed on the
/// host.
///
/// Inputs are `a[i] = i` and `b[i] = n - i`, so every expected element equals
/// `n`.
#[test]
#[ignore = "requires an NVIDIA GPU; run with `cargo test -- --ignored`"]
fn vector_add_kernel_launch() {
    let (_device, ctx, stream) = setup().expect("setup");

    let n: u32 = 1 << 20;
    let lhs: Vec<f32> = (0..n).map(|idx| idx as f32).collect();
    let rhs: Vec<f32> = (0..n).map(|idx| (n - idx) as f32).collect();
    let expected: Vec<f32> = lhs
        .iter()
        .zip(rhs.iter())
        .map(|(p, q)| p + q)
        .collect();

    // Two uploaded inputs plus an output buffer of the same length.
    let d_lhs = DeviceBuffer::from_slice(&ctx, &lhs).unwrap();
    let d_rhs = DeviceBuffer::from_slice(&ctx, &rhs).unwrap();
    let d_out: DeviceBuffer<f32> = DeviceBuffer::new(&ctx, n as usize).unwrap();

    let module = Module::load_ptx(&ctx, VECTOR_ADD_PTX).expect("cuModuleLoadData");
    let kernel = module
        .get_function("vector_add")
        .expect("cuModuleGetFunction");

    // One-dimensional launch shape: enough 256-wide blocks to cover `n`.
    let threads_per_block: u32 = 256;
    let block_count: u32 = n.div_ceil(threads_per_block);

    // The raw handles are bound to locals so that `.arg` can borrow them.
    let lhs_raw = d_lhs.as_raw();
    let rhs_raw = d_rhs.as_raw();
    let out_raw = d_out.as_raw();

    // SAFETY: the three raw handles come from buffers of `n` `f32`s that are
    // still alive when the stream is synchronised below. The argument order
    // and types are assumed to match the kernel's parameter list
    // (a, b, c, n) -- TODO confirm against kernels/vector_add.ptx.
    unsafe {
        kernel
            .launch()
            .grid((block_count, 1, 1))
            .block((threads_per_block, 1, 1))
            .stream(&stream)
            .arg(&lhs_raw)
            .arg(&rhs_raw)
            .arg(&out_raw)
            .arg(&n)
            .launch()
            .expect("cuLaunchKernel");
    }
    stream.synchronize().expect("stream sync");

    let mut actual = vec![0.0f32; n as usize];
    d_out.copy_to_host(&mut actual).expect("D2H");
    assert_eq!(expected, actual);
}
/// Exercises `zero` and `zero_async` on an already-populated buffer.
///
/// Steps:
/// 1. Upload 1024 non-zero values, call `zero`, read back, expect all `0.0`.
/// 2. Refill the same buffer, call `zero_async` on the stream, synchronise,
///    read back, expect all `0.0`.
/// 3. Call both zeroing methods on a zero-length buffer and require that
///    neither returns an error.
#[test]
#[ignore = "requires an NVIDIA GPU; run with `cargo test -- --ignored`"]
fn zero_reuses_existing_buffer() {
    let all_zero = |values: &[f32]| values.iter().all(|&v| v == 0.0);

    let (_device, ctx, stream) = setup().expect("setup");

    // 1.0, 2.0, ... so no element starts out as zero.
    let seed: Vec<f32> = (0..1024).map(|k| (k as f32) + 1.0).collect();
    let buf = DeviceBuffer::from_slice(&ctx, &seed).expect("alloc + H2D");

    // Synchronous path. The readback vector is pre-filled with NaN so any
    // element the copy leaves untouched fails the check.
    buf.zero().expect("zero");
    let mut after_sync = vec![f32::NAN; seed.len()];
    buf.copy_to_host(&mut after_sync).expect("D2H");
    assert!(all_zero(&after_sync), "sync zero left non-zero bytes");

    // Stream path, on the same allocation after refilling it.
    buf.copy_from_host(&seed).expect("H2D refill");
    buf.zero_async(&stream).expect("zero_async");
    stream.synchronize().expect("sync");
    let mut after_async = vec![f32::NAN; seed.len()];
    buf.copy_to_host(&mut after_async).expect("D2H");
    assert!(all_zero(&after_async), "async zero left non-zero bytes");

    // Zero-length buffer: both calls must still return Ok.
    let empty: DeviceBuffer<f32> = DeviceBuffer::new(&ctx, 0).expect("empty");
    empty.zero().expect("zero on empty");
    empty.zero_async(&stream).expect("zero_async on empty");
}