1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#[allow(unused_imports)]
use crate::error::Status;
use std::{ptr, sync::Arc};
use singe_cuda_sys::{driver, runtime};
use crate::{
context::Context,
error::{Error, Result},
stream::{BorrowedStream, Stream, StreamBinding},
try_ffi,
};
bitflags::bitflags! {
    /// Creation-time options for a CUDA event, accepted by
    /// [`Context::create_event_with_flags`].
    ///
    /// Each constant carries the bit value of the like-named
    /// `driver::CUevent_flags` entry.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct EventFlags: u32 {
        /// Default event creation flag.
        const DEFAULT = driver::CUevent_flags::CU_EVENT_DEFAULT as u32;
        /// The event uses blocking synchronization: a host thread waiting in
        /// [`Event::synchronize`] blocks until the event actually completes.
        const BLOCKING_SYNC = driver::CUevent_flags::CU_EVENT_BLOCKING_SYNC as u32;
        /// The event does not record timing data.
        const DISABLE_TIMING = driver::CUevent_flags::CU_EVENT_DISABLE_TIMING as u32;
        /// The event may be used as an interprocess event. Must be combined
        /// with [`EventFlags::DISABLE_TIMING`].
        const INTERPROCESS = driver::CUevent_flags::CU_EVENT_INTERPROCESS as u32;
    }
}
bitflags::bitflags! {
/// Flags accepted by [`Event::record`], [`Event::record_borrowed`], and
/// [`Event::record_on`].
///
/// Those methods forward the flags to the private `Event::record_raw`, which
/// passes `flags.bits()` to `cudaEventRecordWithFlags`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct EventRecordFlags: u32 {
/// Carries the value of `runtime::cudaEventRecordDefault`.
const DEFAULT = runtime::cudaEventRecordDefault;
/// Carries the value of `runtime::cudaEventRecordExternal`.
///
/// NOTE(review): the semantics of this flag are defined by the CUDA runtime,
/// not by this crate — consult the `cudaEventRecordWithFlags` documentation.
const EXTERNAL = runtime::cudaEventRecordExternal;
}
}
/// A CUDA event handle paired with the [`Context`] it was created in.
///
/// Values are produced by [`Context::create_event`] and
/// [`Context::create_event_with_flags`]. Dropping an `Event` binds its context
/// and calls `cudaEventDestroy` on the handle.
#[derive(Debug)]
pub struct Event {
/// Raw CUDA runtime event handle, as returned by `cudaEventCreateWithFlags`.
handle: runtime::cudaEvent_t,
/// Context the event was created in. It is bound before recording,
/// synchronizing, timing, and destroying the event, and it is the context
/// that streams and other events are compared against.
ctx: Arc<Context>,
}
impl Event {
    /// Records this event on `stream`, forwarding `flags` to
    /// `cudaEventRecordWithFlags`.
    ///
    /// # Errors
    ///
    /// Returns the invalid-context error, without calling into CUDA, when
    /// `stream` reports a different context than this event. Otherwise returns
    /// any error from binding the event's context or from the record call.
    pub fn record(&self, stream: &Stream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into())
        } else {
            self.record_raw(stream.as_raw(), flags)
        }
    }

    /// Records this event on a [`BorrowedStream`], forwarding `flags` to
    /// `cudaEventRecordWithFlags`.
    ///
    /// # Errors
    ///
    /// Returns the invalid-context error, without calling into CUDA, when
    /// `stream` reports a different context than this event. Otherwise returns
    /// any error from binding the event's context or from the record call.
    pub fn record_borrowed(&self, stream: &BorrowedStream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into())
        } else {
            self.record_raw(stream.as_raw(), flags)
        }
    }

    /// Records this event on a [`StreamBinding`], forwarding `flags` to
    /// `cudaEventRecordWithFlags`.
    ///
    /// # Errors
    ///
    /// Returns the invalid-context error, without calling into CUDA, when
    /// `stream` reports a different context than this event. Otherwise returns
    /// any error from binding the event's context or from the record call.
    pub fn record_on(&self, stream: &StreamBinding, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into())
        } else {
            self.record_raw(stream.as_raw(), flags)
        }
    }

    /// Shared tail of the public `record*` methods: binds the event's context
    /// and issues `cudaEventRecordWithFlags` on the raw stream handle.
    ///
    /// Callers are expected to have already checked that the stream belongs
    /// to this event's context; no such check happens here.
    fn record_raw(&self, stream: runtime::cudaStream_t, flags: EventRecordFlags) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: the event handle is the one stored at construction and is
        // destroyed only by `Drop`. `stream` is whatever the caller obtained
        // from `as_raw()` on a stream reference and is presumed valid.
        let recorded = unsafe {
            try_ffi!(runtime::cudaEventRecordWithFlags(
                self.as_raw(),
                stream,
                flags.bits(),
            ))
        };
        recorded?;
        Ok(())
    }

    /// Queries the status of all work currently captured by this event.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what an event captures.
    ///
    /// Returns `true` when CUDA reports success (all captured work has
    /// completed) and `false` when CUDA reports the not-ready status (some
    /// captured work is still incomplete).
    ///
    /// For the purposes of Unified Memory, a return value of `true` is equivalent to having called [`Event::synchronize`].
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA cannot query the event, i.e. for any status
    /// other than success or not-ready. CUDA may also report errors from
    /// previous asynchronous launches, internal runtime initialization errors
    /// such as [`Status::NotInitialized`],
    /// [`Status::CallRequiresNewerDriver`], or [`Status::NoDevice`], and
    /// callback diagnostics such as [`Status::NotPermitted`].
    pub fn query(&self) -> Result<bool> {
        // NOTE(review): unlike `synchronize` and `elapsed_time_since`, this
        // does not bind the event's context before calling into CUDA —
        // confirm that is intentional.
        //
        // SAFETY: the event handle is the one stored at construction and is
        // destroyed only by `Drop`.
        match unsafe { runtime::cudaEventQuery(self.as_raw()) } {
            runtime::cudaError_t::CUDA_SUCCESS => Ok(true),
            runtime::cudaError_t::CUDA_ERROR_NOT_READY => Ok(false),
            failure => Err(failure.into()),
        }
    }

    /// Waits until all work currently captured by this event has completed.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what an event captures.
    ///
    /// If the event was created with [`EventFlags::BLOCKING_SYNC`], the calling CPU thread blocks until the device has completed the event.
    /// Without [`EventFlags::BLOCKING_SYNC`], the CPU thread busy-waits until the device has completed the event.
    ///
    /// # Errors
    ///
    /// Returns an error if the event's context cannot be bound or CUDA cannot
    /// wait for the event. CUDA may also report errors from previous
    /// asynchronous launches, internal runtime initialization errors such as
    /// [`Status::NotInitialized`], [`Status::CallRequiresNewerDriver`], or
    /// [`Status::NoDevice`], and callback diagnostics such as
    /// [`Status::NotPermitted`].
    pub fn synchronize(&self) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: the event handle is the one stored at construction and is
        // destroyed only by `Drop`.
        let waited = unsafe { try_ffi!(runtime::cudaEventSynchronize(self.as_raw())) };
        waited?;
        Ok(())
    }

    /// Computes the time elapsed from `start` to this event, in milliseconds (with a resolution of around 0.5 microseconds).
    ///
    /// This call is not guaranteed to return the latest errors for pending work.
    /// Use it only for elapsed-time calculation; to poll for completion of the events being compared, use [`Event::query`] instead.
    ///
    /// If either event was last recorded in a non-default stream, the resulting time may be greater than expected, even if both used the same stream handle.
    /// This happens because the [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) operation takes place asynchronously and there is no guarantee that the measured latency is actually just between the two events.
    /// Any number of other stream operations could execute in between the two measured events, altering the timing significantly.
    ///
    /// If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has not been called on either event, [`Status::InvalidHandle`] is returned.
    /// If it has been called on both events but at least one of them has not yet completed (that is, [`Event::query`] would return `Ok(false)` for it), [`Status::NotReady`] is returned.
    /// If either event was created with [`EventFlags::DISABLE_TIMING`], [`Status::InvalidHandle`] is returned.
    ///
    /// # Errors
    ///
    /// Returns an error if the events belong to different contexts (checked
    /// before calling into CUDA), the context cannot be bound, either event
    /// has not been recorded, either event is incomplete, timing was disabled
    /// on either event, or CUDA rejects the elapsed-time query. CUDA may also
    /// report errors from previous asynchronous launches, internal runtime
    /// initialization errors such as [`Status::NotInitialized`],
    /// [`Status::CallRequiresNewerDriver`], or [`Status::NoDevice`], and
    /// callback diagnostics such as [`Status::NotPermitted`].
    pub fn elapsed_time_since(&self, start: &Event) -> Result<f32> {
        if self.context() != start.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.ctx.bind()?;

        // Out-parameter for the FFI call; only returned when the call succeeds.
        let mut milliseconds: f32 = 0.0;
        // SAFETY: both event handles are the ones stored at construction and
        // are destroyed only by `Drop`; the out-pointer refers to a live local.
        let measured = unsafe {
            try_ffi!(runtime::cudaEventElapsedTime(
                &raw mut milliseconds,
                start.as_raw(),
                self.as_raw(),
            ))
        };
        measured?;
        Ok(milliseconds)
    }

    /// Returns the context this event was created in.
    pub fn context(&self) -> &Context {
        &self.ctx
    }

    /// Returns the raw CUDA runtime event handle.
    ///
    /// The handle stays owned by this `Event`, which destroys it on drop.
    pub const fn as_raw(&self) -> runtime::cudaEvent_t {
        self.handle
    }
}
// SAFETY: CUDA events are synchronization handles. Recording/waiting uses CUDA's event
// semantics and does not mutate Rust-owned state through shared references.
// The impls are written by hand because `Event` stores a raw handle
// (`runtime::cudaEvent_t`), which is treated as a raw pointer elsewhere in this file.
// NOTE(review): soundness relies on the CUDA runtime allowing calls on one event
// handle from several host threads — confirm against the CUDA documentation.
unsafe impl Send for Event {}
unsafe impl Sync for Event {}
impl Drop for Event {
    /// Destroys the underlying CUDA event on a best-effort basis.
    ///
    /// `drop` cannot return an error, so a failure to bind the context or to
    /// destroy the event is printed to stderr in builds with debug assertions
    /// enabled and silently ignored otherwise.
    fn drop(&mut self) {
        // `cfg!` (rather than `#[cfg]` on the statement) keeps `err` used in
        // every build profile, so release builds do not emit
        // `unused_variables` warnings; the branch is a compile-time constant.
        if let Err(err) = self.ctx.bind() {
            if cfg!(debug_assertions) {
                eprintln!("failed to bind context before destroying event: {err}");
            }
        }

        // Destruction is still attempted after a failed bind so the handle is
        // not leaked without at least trying to release it.
        //
        // SAFETY: `self.handle` is the handle stored at construction, and
        // `drop` runs at most once, so it has not been destroyed before.
        let destroyed = unsafe { try_ffi!(runtime::cudaEventDestroy(self.handle)) };
        if let Err(err) = destroyed {
            if cfg!(debug_assertions) {
                eprintln!("failed to destroy CUDA event: {err}");
            }
        }
    }
}
impl Context {
pub fn create_event(self: &Arc<Self>) -> Result<Event> {
self.create_event_with_flags(EventFlags::DEFAULT)
}
/// Creates an event object for the current device with the specified flags.
/// Valid flags include:
///
/// * [`EventFlags::DEFAULT`]: default event creation flag.
/// * [`EventFlags::BLOCKING_SYNC`]: the event uses blocking synchronization.
/// A host thread that uses [`Event::synchronize`] to wait on an event created with this flag will block until the event actually completes.
/// * [`EventFlags::DISABLE_TIMING`]: the created event does not record timing data.
/// Events created with this flag specified and [`EventFlags::BLOCKING_SYNC`] not specified will provide the best performance when used with [`Stream::wait_event`] and [`Event::query`].
/// * [`EventFlags::INTERPROCESS`]: the created event may be used as an interprocess event by [`sys::cudaIpcGetEventHandle`](singe_cuda_sys::runtime::cudaIpcGetEventHandle).
/// [`EventFlags::INTERPROCESS`] must be specified along with [`EventFlags::DISABLE_TIMING`].
///
/// # Errors
///
/// Returns an error if the context cannot be bound, the flag combination is
/// invalid, CUDA cannot create the event, or CUDA returns a null event
/// handle. CUDA may also report errors from previous asynchronous launches,
/// internal runtime initialization errors such as
/// [`Status::NotInitialized`], [`Status::CallRequiresNewerDriver`],
/// or [`Status::NoDevice`], and callback diagnostics such as
/// [`Status::NotPermitted`].
pub fn create_event_with_flags(self: &Arc<Self>, flags: EventFlags) -> Result<Event> {
self.bind()?;
let mut handle = ptr::null_mut();
unsafe {
try_ffi!(runtime::cudaEventCreateWithFlags(
&raw mut handle,
flags.bits()
))?;
}
if handle.is_null() {
return Err(Error::NullHandle);
}
Ok(Event {
handle,
ctx: Arc::clone(self),
})
}
}