1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
//! CUDA streams — ordered queues of work on a device.
use std::sync::Arc;
use baracuda_cuda_sys::types::CUstream_flags;
use baracuda_cuda_sys::{driver, CUstream};
use crate::context::Context;
use crate::error::{check, Result};
/// Handle to a CUDA stream: an asynchronous queue of work on a device.
///
/// Operations enqueued on one stream execute in submission order, while
/// operations on distinct streams may overlap, subject to device scheduling.
/// The handle is reference-counted, so cloning it is cheap and every clone
/// refers to the same underlying stream. Streams are `Send + Sync` — CUDA
/// explicitly permits concurrent submission from multiple host threads.
#[derive(Clone)]
pub struct Stream {
    /// Shared state; the driver handle is destroyed when the last clone drops.
    inner: Arc<StreamInner>,
}
/// State shared by every clone of a [`Stream`].
struct StreamInner {
    /// Raw driver handle for the stream.
    handle: CUstream,
    /// The owning context, held here so that it outlives the stream.
    context: Context,
}
// SAFETY: NVIDIA documents that a CUstream may be used from any thread, so
// the handle may be moved to another thread.
// NOTE(review): `StreamInner` also stores a `Context`; this impl vouches for
// that field too — confirm `Context` is itself safe to send across threads.
unsafe impl Send for StreamInner {}
// SAFETY: same guarantee as for `Send` above — the driver permits concurrent
// use of one stream handle from several threads.
// NOTE(review): likewise assumes shared `&Context` access is thread-safe.
unsafe impl Sync for StreamInner {}
impl core::fmt::Debug for StreamInner {
    /// Renders as a struct named `Stream` showing only the raw handle; the
    /// remaining fields are elided via the non-exhaustive marker.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = f.debug_struct("Stream");
        builder.field("handle", &self.handle);
        builder.finish_non_exhaustive()
    }
}
impl core::fmt::Debug for Stream {
    /// Delegates to the `Debug` output of the shared inner state.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Fully-qualified call: method-call syntax (`self.inner.fmt(f)`) only
        // resolves when the `Debug` trait is imported, and becomes ambiguous
        // if another `fmt` trait (e.g. `Display`) is in scope as well.
        core::fmt::Debug::fmt(&*self.inner, f)
    }
}
impl Stream {
/// Create a stream on `context` using the default creation flags, i.e. a
/// stream that is blocking with respect to the legacy default stream.
pub fn new(context: &Context) -> Result<Self> {
    let flags = CUstream_flags::DEFAULT;
    Self::with_flags(context, flags)
}
/// Create a non-blocking stream on `context`: work submitted to it does
/// not synchronize with the legacy null stream.
pub fn non_blocking(context: &Context) -> Result<Self> {
    let flags = CUstream_flags::NON_BLOCKING;
    Self::with_flags(context, flags)
}
/// Create a stream on `context` from a raw flag bitmask (see
/// [`CUstream_flags`]).
///
/// # Errors
///
/// Fails if `context` cannot be made current, if the driver entry point
/// cannot be resolved, or if the driver rejects the creation call.
pub fn with_flags(context: &Context, flags: u32) -> Result<Self> {
    // Make `context` current on this thread before calling into the driver.
    context.set_current()?;
    let api = driver()?;
    let create = api.cu_stream_create()?;
    let mut handle: CUstream = core::ptr::null_mut();
    // SAFETY: `handle` is a live, writable local for the driver to fill in;
    // `flags` is forwarded unchanged.
    let status = unsafe { create(&mut handle, flags) };
    check(status)?;
    let inner = StreamInner {
        handle,
        context: context.clone(),
    };
    Ok(Self {
        inner: Arc::new(inner),
    })
}
/// Create a stream on `context` with an explicit scheduling priority.
///
/// Use [`Context::stream_priority_range`] to discover the legal range on
/// this device (lower = higher priority). `flags` is the same raw bitmask
/// accepted by [`Self::with_flags`].
pub fn with_priority(context: &Context, flags: u32, priority: i32) -> Result<Self> {
    // Make `context` current on this thread before calling into the driver.
    context.set_current()?;
    let api = driver()?;
    let create = api.cu_stream_create_with_priority()?;
    let mut handle: CUstream = core::ptr::null_mut();
    // SAFETY: `handle` is a live, writable local for the driver to fill in;
    // `flags` and `priority` are plain values forwarded unchanged.
    let status = unsafe { create(&mut handle, flags, priority) };
    check(status)?;
    let inner = StreamInner {
        handle,
        context: context.clone(),
    };
    Ok(Self {
        inner: Arc::new(inner),
    })
}
/// Query the scheduling priority of this stream from the driver.
pub fn priority(&self) -> Result<i32> {
    let api = driver()?;
    let query = api.cu_stream_get_priority()?;
    let mut value: core::ffi::c_int = 0;
    // SAFETY: the handle is owned by `self.inner`; `value` is a writable local.
    let status = unsafe { query(self.inner.handle, &mut value) };
    check(status)?;
    Ok(value)
}
/// Query the flags bitmask of this stream from the driver.
pub fn flags(&self) -> Result<u32> {
    let api = driver()?;
    let query = api.cu_stream_get_flags()?;
    let mut bits: core::ffi::c_uint = 0;
    // SAFETY: the handle is owned by `self.inner`; `bits` is a writable local.
    let status = unsafe { query(self.inner.handle, &mut bits) };
    check(status)?;
    Ok(bits)
}
/// Enqueue a host-side callback on this stream. The callback runs on
/// a driver-owned thread after all prior stream work completes.
///
/// The closure is boxed and freed after it runs; a panic inside will
/// abort the process (there's no way to propagate it through the C
/// callback). Keep the closure simple.
///
/// # Errors
///
/// Returns an error if the driver entry point cannot be resolved or if the
/// driver rejects the submission. In both cases the closure is dropped
/// normally and never invoked.
pub fn launch_host_func<F>(&self, f: F) -> Result<()>
where
    F: FnOnce() + Send + 'static,
{
    use core::ffi::c_void;

    // Double box: the outer `Box` gives a thin pointer that fits in the
    // `void*` user-data slot; the inner one is the fat trait-object pointer.
    type Callback = Box<dyn FnOnce() + Send>;

    unsafe extern "C" fn trampoline(user_data: *mut c_void) {
        // SAFETY: `user_data` is the pointer produced by `Box::into_raw`
        // below, and this function is its sole consumer on the success path.
        let callback: Box<Callback> = unsafe { Box::from_raw(user_data.cast::<Callback>()) };
        (*callback)();
    }

    // Resolve the entry point BEFORE turning the closure into a raw pointer:
    // an early `?` return after `Box::into_raw` would leak the closure.
    let d = driver()?;
    let cu = d.cu_launch_host_func()?;

    let boxed: Box<Callback> = Box::new(Box::new(f));
    let raw = Box::into_raw(boxed).cast::<c_void>();

    // SAFETY: trampoline owns and frees the boxed closure; stream handle is live.
    let rc = unsafe { cu(self.inner.handle, Some(trampoline), raw) };
    if rc != baracuda_cuda_sys::CUresult::SUCCESS {
        // Reclaim the box so we don't leak on submission failure.
        // SAFETY: cuLaunchHostFunc didn't take ownership on error.
        drop(unsafe { Box::from_raw(raw.cast::<Callback>()) });
        return Err(crate::error::Error::Status { status: rc });
    }
    Ok(())
}
/// Wait on the calling thread until every operation previously enqueued on
/// this stream has finished.
pub fn synchronize(&self) -> Result<()> {
    let api = driver()?;
    let sync = api.cu_stream_synchronize()?;
    // SAFETY: the handle is owned by `self.inner` and is still live.
    let status = unsafe { sync(self.inner.handle) };
    check(status)
}
/// `Ok(true)` if the stream has completed all queued work, `Ok(false)`
/// if work is still outstanding.
pub fn is_complete(&self) -> Result<bool> {
use baracuda_cuda_sys::CUresult;
let d = driver()?;
let cu = d.cu_stream_query()?;
let res = unsafe { cu(self.inner.handle) };
match res {
CUresult::SUCCESS => Ok(true),
CUresult::ERROR_NOT_READY => Ok(false),
other => Err(crate::error::Error::Status { status: other }),
}
}
/// The [`Context`] this stream lives in.
#[inline]
pub fn context(&self) -> &Context {
&self.inner.context
}
/// Raw `CUstream` handle. Use with care.
#[inline]
pub fn as_raw(&self) -> CUstream {
self.inner.handle
}
/// Schedule an asynchronous device-to-device copy from `src` into `dst` on
/// this stream.
///
/// This is sugar over [`DeviceBuffer::copy_to_device_async`] with the
/// arguments arranged the way call-sites usually want them. Because the
/// destination is taken by `&mut`, the borrow checker rejects aliasing
/// between source and destination at compile time. `src.len()` must equal
/// `dst.len()`.
///
/// ```no_run
/// # use baracuda_driver::{Context, Device, DeviceBuffer, Stream};
/// # use baracuda_types::DeviceRepr;
/// # fn demo() -> baracuda_driver::Result<()> {
/// let ctx = Context::new(&Device::get(0)?)?;
/// let stream = Stream::new(&ctx)?;
/// let src: DeviceBuffer<f32> = DeviceBuffer::zeros(&ctx, 1024)?;
/// let mut dst: DeviceBuffer<f32> = DeviceBuffer::zeros(&ctx, 1024)?;
/// stream.memcpy_dtod(&src, &mut dst)?;
/// # Ok(()) }
/// ```
pub fn memcpy_dtod<T>(
    &self,
    src: &crate::memory::DeviceBuffer<T>,
    dst: &mut crate::memory::DeviceBuffer<T>,
) -> Result<()>
where
    T: baracuda_types::DeviceRepr,
{
    let stream = self;
    src.copy_to_device_async(dst, stream)
}
/// Fetch the 64-bit ID the driver assigned to this stream. Handy for
/// correlating CUPTI traces against baracuda streams.
pub fn id(&self) -> Result<u64> {
    let api = driver()?;
    let query = api.cu_stream_get_id()?;
    let mut stream_id: u64 = 0;
    // SAFETY: the handle is owned by `self.inner`; `stream_id` is a writable local.
    let status = unsafe { query(self.inner.handle, &mut stream_id) };
    check(status)?;
    Ok(stream_id)
}
/// Copy every CUDA-managed attribute (access policy window, sync policy)
/// from `src` onto `self`. Priority and flags are not copied — those are
/// fixed at stream creation time.
pub fn copy_attributes_from(&self, src: &Stream) -> Result<()> {
    let api = driver()?;
    let copy = api.cu_stream_copy_attributes()?;
    let destination = self.inner.handle;
    let source = src.inner.handle;
    // SAFETY: both handles are kept alive by the borrowed `Stream` values.
    let status = unsafe { copy(destination, source) };
    check(status)
}
/// Make all work subsequently submitted to this stream wait until `event`
/// has completed.
///
/// `flags` is typically `0` (`CU_EVENT_WAIT_DEFAULT`). This is the building
/// block for cross-stream dependencies: record an event on stream A, then
/// have stream B wait on it.
pub fn wait_event(&self, event: &crate::Event, flags: u32) -> Result<()> {
    let api = driver()?;
    let wait = api.cu_stream_wait_event()?;
    let raw_event = event.as_raw();
    // SAFETY: the stream handle is owned by `self.inner`, and `event` is
    // borrowed for the duration of the call.
    let status = unsafe { wait(self.inner.handle, raw_event, flags) };
    check(status)
}
/// Read the `CUstreamAttrValue` for `attr` from this stream into
/// `value_out`.
///
/// The caller supplies a writable buffer big enough for the largest
/// attribute value (`CUstreamAttrValue` is up to 48 bytes). Use the
/// `CU_STREAM_ATTRIBUTE_*` constants for `attr`.
///
/// # Safety
///
/// `value_out` must be a writable region matching the layout of the
/// `CUstreamAttrValue` variant for `attr`.
pub unsafe fn get_attribute(
    &self,
    attr: i32,
    value_out: *mut core::ffi::c_void,
) -> Result<()> {
    let api = driver()?;
    let getter = api.cu_stream_get_attribute()?;
    // SAFETY: the stream handle is owned by `self.inner`; the validity of
    // `value_out` is the caller's obligation per this function's contract.
    let status = unsafe { getter(self.inner.handle, attr, value_out) };
    check(status)
}
/// Apply the `CUstreamAttrValue` behind `value` to this stream for `attr`.
/// See [`Self::get_attribute`] for the value layout.
///
/// # Safety
///
/// `value` must point at a properly-initialized `CUstreamAttrValue`
/// variant for `attr`.
pub unsafe fn set_attribute(
    &self,
    attr: i32,
    value: *const core::ffi::c_void,
) -> Result<()> {
    let api = driver()?;
    let setter = api.cu_stream_set_attribute()?;
    // SAFETY: the stream handle is owned by `self.inner`; the validity of
    // `value` is the caller's obligation per this function's contract.
    let status = unsafe { setter(self.inner.handle, attr, value) };
    check(status)
}
/// Associate the managed-memory region starting at `dptr` and spanning
/// `length` bytes with this stream. Pass `flags = 0` for the default
/// ("one thread").
pub fn attach_mem_async(
    &self,
    dptr: baracuda_cuda_sys::CUdeviceptr,
    length: usize,
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let attach = api.cu_stream_attach_mem_async()?;
    // SAFETY: the stream handle is owned by `self.inner`; the remaining
    // arguments are plain values forwarded to the driver unchanged.
    let status = unsafe { attach(self.inner.handle, dptr, length, flags) };
    check(status)
}
/// Enqueue a 32-bit store of `value` to device memory at `addr` on this
/// stream, ordered like any other stream op.
///
/// `flags` is a bitmask of
/// [`baracuda_cuda_sys::types::CUstreamWriteValue_flags`].
pub fn write_value_32(
    &self,
    addr: baracuda_cuda_sys::CUdeviceptr,
    value: u32,
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let write = api.cu_stream_write_value_32()?;
    // SAFETY: the stream handle is owned by `self.inner`; the remaining
    // arguments are plain values forwarded to the driver unchanged.
    let status = unsafe { write(self.inner.handle, addr, value, flags) };
    check(status)
}
/// Enqueue a 64-bit store of `value` to device memory at `addr` on this
/// stream — the `u64` counterpart of [`Self::write_value_32`], taking the
/// same `flags` bitmask.
pub fn write_value_64(
    &self,
    addr: baracuda_cuda_sys::CUdeviceptr,
    value: u64,
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let write = api.cu_stream_write_value_64()?;
    // SAFETY: the stream handle is owned by `self.inner`; the remaining
    // arguments are plain values forwarded to the driver unchanged.
    let status = unsafe { write(self.inner.handle, addr, value, flags) };
    check(status)
}
/// Stall this stream until the 32-bit word in device memory at `addr`
/// satisfies the condition selected by `flags` against `value` (see
/// [`baracuda_cuda_sys::types::CUstreamWaitValue_flags`] —
/// GEQ / EQ / AND / NOR, optionally OR'd with FLUSH).
pub fn wait_value_32(
    &self,
    addr: baracuda_cuda_sys::CUdeviceptr,
    value: u32,
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let wait = api.cu_stream_wait_value_32()?;
    // SAFETY: the stream handle is owned by `self.inner`; the remaining
    // arguments are plain values forwarded to the driver unchanged.
    let status = unsafe { wait(self.inner.handle, addr, value, flags) };
    check(status)
}
/// Stall this stream until the 64-bit word in device memory at `addr`
/// satisfies the condition selected by `flags` against `value` — the `u64`
/// counterpart of [`Self::wait_value_32`], taking the same `flags` bitmask.
pub fn wait_value_64(
    &self,
    addr: baracuda_cuda_sys::CUdeviceptr,
    value: u64,
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let wait = api.cu_stream_wait_value_64()?;
    // SAFETY: the stream handle is owned by `self.inner`; the remaining
    // arguments are plain values forwarded to the driver unchanged.
    let status = unsafe { wait(self.inner.handle, addr, value, flags) };
    check(status)
}
/// Submit a batch of wait/write value operations atomically on this stream.
///
/// `ops` is typically a small array built via
/// [`baracuda_cuda_sys::types::CUstreamBatchMemOpParams::wait_value_32`]
/// etc.
pub fn batch_mem_op(
    &self,
    ops: &mut [baracuda_cuda_sys::types::CUstreamBatchMemOpParams],
    flags: u32,
) -> Result<()> {
    let api = driver()?;
    let submit = api.cu_stream_batch_mem_op()?;
    // NOTE(review): this cast truncates if `ops.len()` exceeds `c_uint::MAX`;
    // batches are expected to be small, so that is presumed unreachable.
    let count = ops.len() as core::ffi::c_uint;
    let params = ops.as_mut_ptr();
    // SAFETY: `params` points at `ops`, which is exclusively borrowed for
    // the duration of the call; the stream handle is owned by `self.inner`.
    let status = unsafe { submit(self.inner.handle, count, params, flags) };
    check(status)
}
/// Query the stream-capture state of this stream.
///
/// Returns `(active, capture_id, graph_handle)`, where `active` is `true`
/// only while the stream is currently capturing. The graph handle is only
/// meaningful while capturing.
pub fn capture_info(&self) -> Result<(bool, u64, baracuda_cuda_sys::CUgraph)> {
    // CUstreamCaptureStatus: NONE=0, ACTIVE=1, INVALIDATED=2.
    const CAPTURE_STATUS_ACTIVE: core::ffi::c_int = 1;

    let api = driver()?;
    let query = api.cu_stream_get_capture_info()?;

    let mut capture_status: core::ffi::c_int = 0;
    let mut capture_id: u64 = 0;
    let mut graph_handle: baracuda_cuda_sys::CUgraph = core::ptr::null_mut();
    // The dependency outputs are required by the call but not reported back.
    let mut dependencies: *const baracuda_cuda_sys::CUgraphNode = core::ptr::null();
    let mut dependency_count: usize = 0;

    // SAFETY: every out-pointer refers to a live, writable local; the stream
    // handle is owned by `self.inner`.
    let status = unsafe {
        query(
            self.inner.handle,
            &mut capture_status,
            &mut capture_id,
            &mut graph_handle,
            &mut dependencies,
            &mut dependency_count,
        )
    };
    check(status)?;

    let active = capture_status == CAPTURE_STATUS_ACTIVE;
    Ok((active, capture_id, graph_handle))
}
}
impl Drop for StreamInner {
    /// Best-effort destruction of the driver stream. Failures to resolve the
    /// driver or to destroy the stream are deliberately ignored, since `drop`
    /// has no way to report them.
    fn drop(&mut self) {
        let Ok(api) = driver() else {
            return;
        };
        let Ok(destroy) = api.cu_stream_destroy() else {
            return;
        };
        // SAFETY: last Arc drop; handle is unique.
        let _ = unsafe { destroy(self.handle) };
    }
}