use cudarc::{
driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
nvrtc::Ptx,
};
/// Launches `sin_kernel` from `./examples/sin.ptx` on a forked stream and prints
/// the device result next to the sine computed on the host for the same inputs.
///
/// # Errors
///
/// Propagates the [`DriverError`] returned by any failing device call (device
/// creation, PTX loading, host-to-device copy, stream fork, kernel launch,
/// stream wait, or reclaiming the buffers).
///
/// # Panics
///
/// Panics if `sin_kernel` cannot be looked up in module `sin`, or if the input
/// buffer read back from the device differs from the original host array.
fn main() -> Result<(), DriverError> {
    let dev = CudaDevice::new(0)?;

    // Register the pre-compiled PTX under the module name "sin" so the kernel
    // can be looked up by name further down.
    dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;

    let a_host = [1.0_f32, 2.0, 3.0];

    // Derive the element count from the data itself so the launch configuration
    // and the bound handed to the kernel can never drift from the buffer size.
    // The cast cannot truncate: the array length is a tiny compile-time constant.
    let n = a_host.len() as u32;
    let cfg = LaunchConfig::for_num_elems(n);

    // `a_host` is `Copy`, so it remains usable for the comparisons below.
    let a_dev = dev.htod_copy(a_host.into())?;
    // The output buffer starts as a clone of the input, giving it the same length.
    let mut b_dev = a_dev.clone();

    let stream = dev.fork_default_stream()?;
    let f = dev
        .get_func("sin", "sin_kernel")
        .expect("`sin_kernel` should be available in module `sin` after `load_ptx`");

    // SAFETY: `a_dev` was created from `a_host` and `b_dev` is its clone, so both
    // device buffers hold exactly `n` elements, and `n` is the bound passed to the
    // kernel. NOTE(review): this assumes `sin_kernel` has the signature
    // `(out, inp, numel)` and only touches indices below `numel` — confirm
    // against the PTX source.
    unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;

    // Wait on the forked stream before reading the buffers back.
    // NOTE(review): relies on `wait_for` ordering the reclaims below after the
    // kernel launched on `stream` — confirm against the cudarc documentation.
    dev.wait_for(&stream)?;

    let a_host_2 = dev.sync_reclaim(a_dev)?;
    let b_host = dev.sync_reclaim(b_dev)?;

    println!("Found {:?}", b_host);
    println!("Expected {:?}", a_host.map(f32::sin));

    // The input buffer was passed as the read-only argument and must come back unchanged.
    assert_eq!(&a_host, a_host_2.as_slice());
    Ok(())
}