1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
//! Kernel launch builder for the Runtime API.
use core::ffi::c_void;
use baracuda_cuda_sys::runtime::{cudaStream_t, runtime, types::dim3};
use baracuda_types::KernelArg;
use crate::error::{check, Result};
use crate::module::Kernel;
use crate::stream::Stream;
/// Grid / block size triple, matching [`baracuda_driver::Dim3`].
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Dim3 {
/// Extent in the X dimension. Must be `>= 1`.
pub x: u32,
/// Extent in the Y dimension. Use `1` for 1-D launches.
pub y: u32,
/// Extent in the Z dimension. Use `1` for 1-D and 2-D launches.
pub z: u32,
}
impl Dim3 {
#[inline]
fn to_sys(self) -> dim3 {
dim3::new(self.x, self.y, self.z)
}
}
impl From<u32> for Dim3 {
fn from(x: u32) -> Self {
Self { x, y: 1, z: 1 }
}
}
impl From<(u32, u32)> for Dim3 {
fn from((x, y): (u32, u32)) -> Self {
Self { x, y, z: 1 }
}
}
impl From<(u32, u32, u32)> for Dim3 {
fn from((x, y, z): (u32, u32, u32)) -> Self {
Self { x, y, z }
}
}
impl Kernel {
/// Start a kernel-launch builder for this kernel.
#[inline]
pub fn launch(&self) -> LaunchBuilder<'_> {
LaunchBuilder {
kernel: self,
grid: Dim3 { x: 1, y: 1, z: 1 },
block: Dim3 { x: 1, y: 1, z: 1 },
shared_mem_bytes: 0,
stream: None,
args: Vec::new(),
}
}
}
/// Builder produced by [`Kernel::launch`].
#[must_use = "the launch builder does nothing until `.launch()` is called"]
pub struct LaunchBuilder<'k> {
kernel: &'k Kernel,
grid: Dim3,
block: Dim3,
shared_mem_bytes: usize,
stream: Option<&'k Stream>,
args: Vec<*mut c_void>,
}
impl core::fmt::Debug for LaunchBuilder<'_> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_struct("LaunchBuilder")
.field("grid", &self.grid)
.field("block", &self.block)
.field("shared_mem_bytes", &self.shared_mem_bytes)
.field("arg_count", &self.args.len())
.finish_non_exhaustive()
}
}
impl<'k> LaunchBuilder<'k> {
/// Set the grid dimensions (number of thread blocks).
#[inline]
pub fn grid(mut self, grid: impl Into<Dim3>) -> Self {
self.grid = grid.into();
self
}
/// Set the block dimensions (threads per block).
#[inline]
pub fn block(mut self, block: impl Into<Dim3>) -> Self {
self.block = block.into();
self
}
/// Reserve `bytes` of dynamic shared memory per block.
#[inline]
pub fn shared_mem_bytes(mut self, bytes: usize) -> Self {
self.shared_mem_bytes = bytes;
self
}
/// Enqueue on `stream` instead of the default stream.
#[inline]
pub fn stream(mut self, stream: &'k Stream) -> Self {
self.stream = Some(stream);
self
}
/// Append `arg` to the kernel argument list. Arguments are passed
/// positionally in the order they are added.
#[inline]
pub fn arg<K: KernelArg>(mut self, arg: K) -> Self {
self.args.push(arg.as_kernel_arg_ptr());
self
}
/// Enqueue the kernel.
///
/// # Safety
///
/// Same rules as [`baracuda_driver::LaunchBuilder::launch`]: argument
/// types and order must match the kernel's C signature, referenced
/// device memory must stay valid for the duration of device execution,
/// and grid/block dims must be within device limits.
pub unsafe fn launch(mut self) -> Result<()> { unsafe {
let r = runtime()?;
let cu = r.cuda_launch_kernel()?;
let stream_handle: cudaStream_t = self.stream.map_or(core::ptr::null_mut(), |s| s.as_raw());
let args_ptr = if self.args.is_empty() {
core::ptr::null_mut()
} else {
self.args.as_mut_ptr()
};
check(cu(
self.kernel.as_launch_ptr(),
self.grid.to_sys(),
self.block.to_sys(),
args_ptr,
self.shared_mem_bytes,
stream_handle,
))
}}
/// Launch as a cooperative kernel — grid-wide sync via
/// `cooperative_groups::this_grid()`. All blocks must fit resident
/// on the device simultaneously; use
/// [`crate::Kernel::max_active_blocks_per_multiprocessor`] to size
/// the grid.
///
/// # Safety
///
/// Same as [`launch`](Self::launch) plus the kernel must be
/// compiled with cooperative-groups support.
pub unsafe fn launch_cooperative(mut self) -> Result<()> { unsafe {
let r = runtime()?;
let cu = r.cuda_launch_cooperative_kernel()?;
let stream_handle: cudaStream_t = self.stream.map_or(core::ptr::null_mut(), |s| s.as_raw());
let args_ptr = if self.args.is_empty() {
core::ptr::null_mut()
} else {
self.args.as_mut_ptr()
};
check(cu(
self.kernel.as_launch_ptr(),
self.grid.to_sys(),
self.block.to_sys(),
args_ptr,
self.shared_mem_bytes,
stream_handle,
))
}}
}