pub struct LaunchArgs<'a> { /* private fields */ }Expand description
The kernel launch builder. Instantiate with CudaStream::launch_builder(), and then launch the kernel with LaunchArgs::launch()
Anything added as a kernel argument with LaunchArgs::arg() must either:
- Implement DeviceRepr
- Add a custom implementation of
impl<'a> PushKernelArg<T> for LaunchArgs<'a>, where T is your type.
Implementations§
Source§impl LaunchArgs<'_>
impl LaunchArgs<'_>
Sourcepub fn record_kernel_launch(&mut self, flags: CUevent_flags) -> &mut Self
pub fn record_kernel_launch(&mut self, flags: CUevent_flags) -> &mut Self
Calling this will make LaunchArgs::launch() and LaunchArgs::launch_cooperative() return 2 CudaEvents that are recorded before and after the kernel is submitted.
Sourcepub unsafe fn launch(
&mut self,
cfg: LaunchConfig,
) -> Result<Option<(CudaEvent, CudaEvent)>, DriverError>
pub unsafe fn launch( &mut self, cfg: LaunchConfig, ) -> Result<Option<(CudaEvent, CudaEvent)>, DriverError>
Submits the configured CudaFunction to execute asynchronously on the configured device stream.
§Safety
This is generally unsafe for two main reasons:
- We can’t guarantee that the arguments are valid for the configured CudaFunction. We don’t know if the types are correct, if the arguments are in the correct order, if the types are representable in CUDA, etc.
- We can’t guarantee that the cuda kernel follows the mutability of the arguments configured with LaunchArgs::arg(). For instance, you can pass a reference to a CudaSlice, which on rust side can’t be mutated, but on cuda side the kernel can mutate it.
- CudaFunction can access memory outside of limits.
§Handling asynchronous mutation
All CudaSlice/CudaView/CudaViewMut contain 2 events that record when the data associated with them are read from/written to.
The PushKernelArg implementation of these adds these events to LaunchArgs, so when LaunchArgs::launch() is called, we properly do multi stream synchronization.
So in practice it is not possible to have multiple kernels concurrently modify device data while using the safe api.
§Handling use after free
Since LaunchArgs::launch() properly records reads/writes for CudaSlice/CudaView/CudaViewMut, and the drop implementation of CudaSlice waits on those events to finish, we will never encounter a use after free situation.
Examples found in repository?
32fn main() -> Result<(), DriverError> {
33 let ctx = CudaContext::new(0)?;
34 let stream = ctx.default_stream();
35
36 let ptx = compile_ptx(PTX_SRC).unwrap();
37 let module = ctx.load_module(ptx)?;
38 let f = module.load_function("my_custom_kernel")?;
39
40 // try changing some of these values to see a device assert
41 let thing = MyCoolRustStruct {
42 a: 1.0,
43 b: 2.34,
44 c: 57,
45 d: 420,
46 };
47
48 let mut builder = stream.launch_builder(&f);
49 // since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
50 builder.arg(&thing);
51 unsafe { builder.launch(LaunchConfig::for_num_elems(1)) }?;
52
53 Ok(())
54}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.memcpy_stod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21    // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.memcpy_dtov(&a_dev)?;
31 let b_host = stream.memcpy_dtov(&b_dev)?;
32
33 println!("Found {:?}", b_host);
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.memcpy_stod(&a_host)?;
41 let b_dev = stream.memcpy_stod(&b_host)?;
42 let mut c_dev = stream.memcpy_stod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.memcpy_stod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `memcpy_vtod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.memcpy_dtov(&a_dev)?;
38 let b_host = stream.memcpy_dtov(&b_dev)?;
39
40 println!("Found {:?}", b_host);
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}12fn main() -> Result<(), DriverError> {
13 {
14 // Option 1: sharing ctx & module between threads
15 thread::scope(|s| {
16 let ptx = compile_ptx(KERNEL_SRC).unwrap();
17 let ctx = CudaContext::new(0)?;
18 let module = ctx.load_module(ptx)?;
19 for i in 0..10i32 {
20 let thread_ctx = ctx.clone();
21 let thread_module = module.clone();
22 s.spawn(move || {
23 let stream = thread_ctx.default_stream();
24 let f = thread_module.load_function("hello_world")?;
25 unsafe {
26 stream
27 .launch_builder(&f)
28 .arg(&i)
29 .launch(LaunchConfig::for_num_elems(1))
30 }
31 });
32 }
33 Ok(())
34 })?;
35 }
36
37 {
38 // Option 2: initializing different context in each
39 // Note that this will still schedule to the same stream since we are using the
40 // default stream here on the same device.
41 thread::scope(move |s| {
42 for i in 0..10i32 {
43 s.spawn(move || {
44 let ptx = compile_ptx(KERNEL_SRC).unwrap();
45 let ctx = CudaContext::new(0)?;
46 let module = ctx.load_module(ptx)?;
47 let stream = ctx.default_stream();
48 let f = module.load_function("hello_world")?;
49 unsafe {
50 stream
51 .launch_builder(&f)
52 .arg(&i)
53 .launch(LaunchConfig::for_num_elems(1))
54 }
55 });
56 }
57 Ok(())
58 })?;
59 }
60
61 Ok(())
62}