04_streams/
04-streams.rs

1use cudarc::{
2    driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg},
3    nvrtc::Ptx,
4};
5
6fn main() -> Result<(), DriverError> {
7    let ctx = CudaContext::new(0)?;
8    let stream = ctx.default_stream();
9
10    let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11    let f = module.load_function("sin_kernel")?;
12
13    let n = 3i32;
14    let a_host = [1.0, 2.0, 3.0];
15    let a_dev = stream.clone_htod(&a_host)?;
16    let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18    // we can safely create a second stream using [CudaStream::fork()].
19    // This synchronizes with the source stream, so
20    // the `memcpy_vtod` & `alloc_zeros` above will complete **before**
21    // work on this stream can start.
22    let stream2 = stream.fork()?;
23
24    // now we launch this work on the other stream
25    let mut builder = stream2.launch_builder(&f);
26    builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27    builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28    builder.arg(&n);
29    unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31    // cudarc automatically manages multi stream synchronization,
32    // so even though we launched the above on a separate stream,
33    // doing this device to host transfer will still properly synchronize.
34    // a_dev doesn't need to synchronize at all since we specified it is just
35    // being read from.
36    // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37    let a_host_2 = stream.clone_dtoh(&a_dev)?;
38    let b_host = stream.clone_dtoh(&b_dev)?;
39
40    println!("Found {b_host:?}");
41    println!("Expected {:?}", a_host.map(f32::sin));
42    assert_eq!(&a_host, a_host_2.as_slice());
43
44    Ok(())
45}
04_streams/04-streams.rs

04_streams/
04-streams.rs