pub struct LaunchArgs<'a> { /* private fields */ }Expand description
The kernel launch builder. Instantiate with CudaStream::launch_builder(), and then launch the kernel with LaunchArgs::launch()
Anything added as a kernel argument with LaunchArgs::arg() must either:
- Implement DeviceRepr
- Add a custom implementation of
impl<'a> PushKernelArg<T> for LaunchArgs<'a>, where T is your type.
Implementations§
Source§impl LaunchArgs<'_>
impl LaunchArgs<'_>
Sourcepub fn record_kernel_launch(&mut self, flags: CUevent_flags) -> &mut Self
pub fn record_kernel_launch(&mut self, flags: CUevent_flags) -> &mut Self
Calling this will make LaunchArgs::launch() and LaunchArgs::launch_cooperative() return 2 CudaEvents that are recorded before and after the kernel is submitted.
Sourcepub unsafe fn launch(
&mut self,
cfg: LaunchConfig,
) -> Result<Option<(CudaEvent, CudaEvent)>, DriverError>
pub unsafe fn launch( &mut self, cfg: LaunchConfig, ) -> Result<Option<(CudaEvent, CudaEvent)>, DriverError>
Submits the configured CudaFunction to execute asynchronously on the configured device stream.
§Safety
This is generally unsafe for two main reasons:
- We can’t guarantee that the arguments are valid for the configured CudaFunction. We don’t know if the types are correct, if the arguments are in the correct order, if the types are representable in CUDA, etc.
- We can’t guarantee that the cuda kernel follows the mutability of the arguments configured with LaunchArgs::arg(). For instance, you can pass a reference to a CudaSlice, which on rust side can’t be mutated, but on cuda side the kernel can mutate it.
- CudaFunction can access memory outside of limits.
§Handling asynchronous mutation
All CudaSlice/CudaView/CudaViewMut contain 2 events that record when the data associated with them are read from/written to.
The PushKernelArg implementation of these adds these events to LaunchArgs, so when LaunchArgs::launch() is called, we properly do multi stream synchronization.
So in practice it is not possible to have multiple kernels concurrently modify device data while using the safe api.
§Handling use after free
Since LaunchArgs::launch() properly records reads/writes for CudaSlice/CudaView/CudaViewMut, and the drop implementation of CudaSlice waits on those events to finish, we will never encounter a use after free situation.
Examples found in repository?
32fn main() -> Result<(), DriverError> {
33 let ctx = CudaContext::new(0)?;
34 let stream = ctx.default_stream();
35
36 let ptx = compile_ptx(PTX_SRC).unwrap();
37 let module = ctx.load_module(ptx)?;
38 let f = module.load_function("my_custom_kernel")?;
39
40 // try changing some of these values to see a device assert
41 let thing = MyCoolRustStruct {
42 a: 1.0,
43 b: 2.34,
44 c: 57,
45 d: 420,
46 };
47
48 let mut builder = stream.launch_builder(&f);
49 // since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
50 builder.arg(&thing);
51 unsafe { builder.launch(LaunchConfig::for_num_elems(1)) }?;
52
53 Ok(())
54}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.memcpy_stod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21    // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.memcpy_dtov(&a_dev)?;
31 let b_host = stream.memcpy_dtov(&b_dev)?;
32
33 println!("Found {:?}", b_host);
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.memcpy_stod(&a_host)?;
41 let b_dev = stream.memcpy_stod(&b_host)?;
42 let mut c_dev = stream.memcpy_stod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.memcpy_stod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `memcpy_vtod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.memcpy_dtov(&a_dev)?;
38 let b_host = stream.memcpy_dtov(&b_dev)?;
39
40 println!("Found {:?}", b_host);
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}12fn main() -> Result<(), DriverError> {
13 {
14 // Option 1: sharing ctx & module between threads
15 thread::scope(|s| {
16 let ptx = compile_ptx(KERNEL_SRC).unwrap();
17 let ctx = CudaContext::new(0)?;
18 let module = ctx.load_module(ptx)?;
19 for i in 0..10i32 {
20 let thread_ctx = ctx.clone();
21 let thread_module = module.clone();
22 s.spawn(move || {
23 let stream = thread_ctx.default_stream();
24 let f = thread_module.load_function("hello_world")?;
25 unsafe {
26 stream
27 .launch_builder(&f)
28 .arg(&i)
29 .launch(LaunchConfig::for_num_elems(1))
30 }
31 });
32 }
33 Ok(())
34 })?;
35 }
36
37 {
38 // Option 2: initializing different context in each
39 // Note that this will still schedule to the same stream since we are using the
40 // default stream here on the same device.
41 thread::scope(move |s| {
42 for i in 0..10i32 {
43 s.spawn(move || {
44 let ptx = compile_ptx(KERNEL_SRC).unwrap();
45 let ctx = CudaContext::new(0)?;
46 let module = ctx.load_module(ptx)?;
47 let stream = ctx.default_stream();
48 let f = module.load_function("hello_world")?;
49 unsafe {
50 stream
51 .launch_builder(&f)
52 .arg(&i)
53 .launch(LaunchConfig::for_num_elems(1))
54 }
55 });
56 }
57 Ok(())
58 })?;
59 }
60
61 Ok(())
62}