use cudarc::{
    driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
    nvrtc::Ptx,
};

fn main() -> Result<(), DriverError> {
    let dev = CudaDevice::new(0)?;
    dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;

    let n = 3;
    let cfg = LaunchConfig::for_num_elems(n);

    let a_host = [1.0, 2.0, 3.0];
    let a_dev = dev.htod_copy(a_host.into())?;
    let mut b_dev = a_dev.clone();

    // create a stream with `fork_default_stream()`
    // This synchronizes with the default stream, so since
    // we put this call **after** the `htod_copy` & `clone` above,
    // cuda will complete those orders **before** work on this stream
    // can start.
    let stream = dev.fork_default_stream()?;

    let f = dev.get_func("sin", "sin_kernel").unwrap();

    // we launch it differently too
    unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;

    // and we must join with the default work stream in order for copies
    // to work corrently.
    // NOTE: this is actually async with respect to the host!
    dev.wait_for(&stream)?;

    let a_host_2 = dev.sync_reclaim(a_dev)?;
    let b_host = dev.sync_reclaim(b_dev)?;

    println!("Found {:?}", b_host);
    println!("Expected {:?}", a_host.map(f32::sin));
    assert_eq!(&a_host, a_host_2.as_slice());

    Ok(())
}