use baracuda::runtime::{Device, DeviceBuffer, Library, Stream};
/// PTX text embedded into the binary at compile time via `include_str!`,
/// taken from the runtime crate's test kernels (`vector_add.ptx`).
/// `main` loads it with `Library::load_ptx` and looks up the `vector_add` kernel.
const PTX: &str = include_str!("../../baracuda-runtime/tests/kernels/vector_add.ptx");
/// Runs the `vector_add` kernel on device 0 and verifies the result on the host.
///
/// Steps:
/// 1. Print the CUDA runtime/driver versions and basic properties of device 0.
/// 2. Upload two `n`-element `f32` inputs and allocate an `n`-element output.
/// 3. Load the embedded PTX, launch `vector_add` on a dedicated stream, and
///    synchronize the stream.
/// 4. Copy the output back and compare it element-wise with `a[i] + b[i]`.
///
/// # Errors
///
/// Propagates any error returned by the `baracuda::runtime` calls.
///
/// # Exit status
///
/// Exits the process with status 1 if any element differs from the expected
/// sum by more than the tolerance, or if the difference is NaN.
fn main() -> baracuda::runtime::Result<()> {
    /// Maximum absolute difference accepted between host and device results.
    const TOLERANCE: f32 = 1e-3;

    let runtime = baracuda::runtime::runtime_version()?;
    let driver = baracuda::runtime::driver_version()?;
    println!("CUDA runtime: {runtime} (driver: {driver})");

    let device = Device::from_ordinal(0);
    device.set_current()?;
    let (major, minor) = device.compute_capability()?;
    println!(
        "device 0: cc {major}.{minor}, {} SMs",
        device.multiprocessor_count()?,
    );

    // Host inputs are chosen so that a[i] + b[i] == n for every index.
    let n: u32 = 1 << 16;
    let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let b: Vec<f32> = (0..n).map(|i| (n - i) as f32).collect();

    let d_a = DeviceBuffer::from_slice(&a)?;
    let d_b = DeviceBuffer::from_slice(&b)?;
    let d_c: DeviceBuffer<f32> = DeviceBuffer::new(n as usize)?;

    let lib = Library::load_ptx(PTX)?;
    let kernel = lib.get_kernel("vector_add")?;
    let stream = Stream::new()?;

    // One thread per element; round the grid up so all `n` elements are covered.
    let block: u32 = 256;
    let grid: u32 = n.div_ceil(block);

    let a_ptr = d_a.as_device_ptr();
    let b_ptr = d_b.as_device_ptr();
    let c_ptr = d_c.as_device_ptr();

    // SAFETY: all three buffers hold exactly `n` f32 elements and stay alive
    // until after `stream.synchronize()` below. This assumes the `vector_add`
    // kernel takes `(a, b, c, n)` in this order and touches at most `n`
    // elements of each buffer — TODO confirm against the PTX signature.
    unsafe {
        kernel
            .launch()
            .grid((grid, 1, 1))
            .block((block, 1, 1))
            .stream(&stream)
            .arg(&a_ptr)
            .arg(&b_ptr)
            .arg(&c_ptr)
            .arg(&n)
            .launch()?;
    }
    // Wait for the work queued on the stream before reading the output back.
    stream.synchronize()?;

    let mut got = vec![0.0f32; n as usize];
    d_c.copy_to_host(&mut got)?;

    let mismatches: usize = a
        .iter()
        .zip(&b)
        .zip(&got)
        .filter(|((x, y), z)| {
            let diff = (**x + **y - **z).abs();
            // A NaN difference compares false against any threshold, so it
            // must be checked explicitly or NaN outputs would pass as matches.
            diff.is_nan() || diff > TOLERANCE
        })
        .count();

    if mismatches == 0 {
        println!("runtime vector_add: {n} elements match");
    } else {
        eprintln!("runtime vector_add: {mismatches} mismatches out of {n}");
        // Note: `process::exit` terminates without running destructors, so the
        // locals above are not dropped on this path.
        std::process::exit(1);
    }
    Ok(())
}