1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#[allow(unused_imports)]
use crate::error::ErrorCode;
use std::{ptr, sync::Arc};
use singe_cuda_sys::{driver, runtime};
use crate::{
context::Context,
error::{Error, Result},
stream::{BorrowedStream, Stream, StreamBinding},
try_cuda,
};
bitflags::bitflags! {
    /// Creation-time options for a CUDA event, passed to [`Context::create_event_with_flags`].
    ///
    /// Every constant carries the numeric value of the driver `CUevent_flags` variant of the same
    /// name, so the bits can be handed to CUDA unchanged.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct EventFlags: u32 {
        /// Default event creation flag.
        const DEFAULT = driver::CUevent_flags::CU_EVENT_DEFAULT as u32;
        /// The event should use blocking synchronization: a host thread waiting in
        /// [`Event::synchronize`] blocks until the event actually completes.
        const BLOCKING_SYNC = driver::CUevent_flags::CU_EVENT_BLOCKING_SYNC as u32;
        /// The event does not need to record timing data.
        const DISABLE_TIMING = driver::CUevent_flags::CU_EVENT_DISABLE_TIMING as u32;
        /// The event may be used as an interprocess event.
        /// Must be specified along with [`EventFlags::DISABLE_TIMING`].
        const INTERPROCESS = driver::CUevent_flags::CU_EVENT_INTERPROCESS as u32;
    }
}
bitflags::bitflags! {
/// Flags accepted by [`Event::record`], [`Event::record_borrowed`] and [`Event::record_on`].
///
/// The bits are forwarded unchanged to `cudaEventRecordWithFlags`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct EventRecordFlags: u32 {
/// Mirrors `cudaEventRecordDefault`: the default record behaviour.
const DEFAULT = runtime::cudaEventRecordDefault;
/// Mirrors `cudaEventRecordExternal`.
/// NOTE(review): per the CUDA runtime documentation this captures the record as an external
/// event node during stream capture; confirm before relying on it.
const EXTERNAL = runtime::cudaEventRecordExternal;
}
}
/// An owned CUDA event together with the [`Context`] it was created in.
///
/// Values are created through [`Context::create_event`] or
/// [`Context::create_event_with_flags`]; the underlying handle is destroyed when the value is
/// dropped.
#[derive(Debug)]
pub struct Event {
/// Raw runtime-API event handle; released with `cudaEventDestroy` in `Drop`.
handle: runtime::cudaEvent_t,
/// Context the event was created in. Holding the `Arc` keeps the `Context` value alive for as
/// long as the event exists.
ctx: Arc<Context>,
}
impl Event {
    /// Records this event on an owned [`Stream`].
    ///
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what
    /// is captured by an event.
    ///
    /// # Errors
    ///
    /// * Returns the error converted from `CUDA_ERROR_INVALID_CONTEXT` if `stream` does not belong
    ///   to the same context as this event.
    /// * Fails if the owning context cannot be bound or if the CUDA call reports an error.
    pub fn record(&self, stream: &Stream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        // SAFETY: the raw handle is only forwarded to the CUDA call while `stream` is borrowed.
        // NOTE(review): confirm this satisfies the safety contract of `Stream::as_raw`.
        self.record_raw(unsafe { stream.as_raw() }, flags)
    }

    /// Records this event on a [`BorrowedStream`].
    ///
    /// Behaves like [`Event::record`], including the same-context check.
    ///
    /// # Errors
    ///
    /// * Returns the error converted from `CUDA_ERROR_INVALID_CONTEXT` if `stream` does not belong
    ///   to the same context as this event.
    /// * Fails if the owning context cannot be bound or if the CUDA call reports an error.
    pub fn record_borrowed(&self, stream: &BorrowedStream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.record_raw(stream.as_raw(), flags)
    }

    /// Records this event on the stream referred to by a [`StreamBinding`].
    ///
    /// Behaves like [`Event::record`], including the same-context check.
    ///
    /// # Errors
    ///
    /// * Returns the error converted from `CUDA_ERROR_INVALID_CONTEXT` if `stream` does not belong
    ///   to the same context as this event.
    /// * Fails if the owning context cannot be bound or if the CUDA call reports an error.
    pub fn record_on(&self, stream: &StreamBinding, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.record_raw(stream.as_raw(), flags)
    }

    /// Binds the owning context and calls `cudaEventRecordWithFlags` with the raw stream handle.
    ///
    /// Shared implementation behind the public `record*` methods, which perform the
    /// same-context check before calling this.
    fn record_raw(&self, stream: runtime::cudaStream_t, flags: EventRecordFlags) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: `self.handle` stays valid until `Drop` destroys it, and every caller in this
        // impl obtains `stream` from a stream wrapper it is currently borrowing.
        unsafe {
            try_cuda!(runtime::cudaEventRecordWithFlags(
                self.as_raw(),
                stream,
                flags.bits(),
            ))?;
        }
        Ok(())
    }

    /// Queries the status of all work currently captured by this event.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what
    /// is captured by an event.
    ///
    /// Returns `Ok(true)` if all captured work has been completed, or `Ok(false)` if any captured
    /// work is incomplete. The runtime's "not ready" status is mapped to `Ok(false)`; it is never
    /// surfaced as an error.
    ///
    /// For the purposes of Unified Memory, a return value of `true` is equivalent to having
    /// called [`Event::synchronize`].
    ///
    /// # Errors
    ///
    /// * Fails if the owning context cannot be bound.
    /// * This function may also return error codes from previous, asynchronous launches.
    /// * This function may also return [`ErrorCode::NotInitialized`],
    ///   [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to
    ///   initialize internal CUDA RT state.
    /// * As specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in
    ///   such case.
    pub fn query(&self) -> Result<bool> {
        // Bind the owning context first, exactly as `synchronize`, `record_raw` and
        // `elapsed_time_since` do, so the runtime call never runs against whatever context
        // happens to be current on the calling thread.
        self.ctx.bind()?;
        // SAFETY: `self.handle` stays valid until `Drop` destroys it.
        let error = unsafe { runtime::cudaEventQuery(self.as_raw()) };
        match error {
            runtime::cudaError_t::CUDA_SUCCESS => Ok(true),
            runtime::cudaError_t::CUDA_ERROR_NOT_READY => Ok(false),
            _ => Err(error.into()),
        }
    }

    /// Waits until the completion of all work currently captured in this event.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what
    /// is captured by an event.
    ///
    /// Waiting for an event created with [`EventFlags::BLOCKING_SYNC`] causes the calling CPU
    /// thread to block until the event has been completed by the device.
    /// Without [`EventFlags::BLOCKING_SYNC`], the CPU thread will busy-wait until the event has
    /// been completed by the device.
    ///
    /// # Errors
    ///
    /// * Fails if the owning context cannot be bound.
    /// * This function may also return error codes from previous, asynchronous launches.
    /// * This function may also return [`ErrorCode::NotInitialized`],
    ///   [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to
    ///   initialize internal CUDA RT state.
    /// * As specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in
    ///   such case.
    pub fn synchronize(&self) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: `self.handle` stays valid until `Drop` destroys it.
        unsafe {
            try_cuda!(runtime::cudaEventSynchronize(self.as_raw()))?;
        }
        Ok(())
    }

    /// Computes the elapsed time from `start` to this event (in milliseconds with a resolution of
    /// around 0.5 microseconds).
    ///
    /// Note this API is not guaranteed to return the latest errors for pending work.
    /// As such this API is intended to serve as an elapsed time calculation only and polling for
    /// completion on the events to be compared should be done with [`Event::query`] instead.
    ///
    /// If either event was last recorded in a non-default stream, the resulting time may be
    /// greater than expected, even if both used the same stream handle.
    /// This happens because the
    /// [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) operation takes place
    /// asynchronously and there is no guarantee that the measured latency is actually just
    /// between the two events.
    /// Any number of other different stream operations could execute in between the two measured
    /// events, thus altering the timing in a significant way.
    ///
    /// # Errors
    ///
    /// * Returns the error converted from `CUDA_ERROR_INVALID_CONTEXT` if the two events belong
    ///   to different contexts, and fails if the owning context cannot be bound.
    /// * If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has not been called
    ///   on either event, then [`ErrorCode::InvalidHandle`] is returned.
    /// * If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has been called on
    ///   both events but one or both of them has not yet been completed (that is,
    ///   [`Event::query`] would return `Ok(false)` on at least one of the events),
    ///   [`ErrorCode::NotReady`] is returned.
    /// * If either event was created with [`EventFlags::DISABLE_TIMING`], then this function will
    ///   return [`ErrorCode::InvalidHandle`].
    /// * This function may also return error codes from previous, asynchronous launches.
    /// * This function may also return [`ErrorCode::NotInitialized`],
    ///   [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to
    ///   initialize internal CUDA RT state.
    /// * As specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in
    ///   such case.
    pub fn elapsed_time_since(&self, start: &Event) -> Result<f32> {
        if self.context() != start.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.ctx.bind()?;
        let mut milliseconds = 0.0f32;
        // SAFETY: both handles stay valid while their `Event`s are borrowed, and the output
        // pointer refers to a live local.
        unsafe {
            try_cuda!(runtime::cudaEventElapsedTime(
                &raw mut milliseconds,
                start.as_raw(),
                self.as_raw(),
            ))?;
        }
        Ok(milliseconds)
    }

    /// Returns the context this event was created in.
    pub fn context(&self) -> &Context {
        &self.ctx
    }

    /// Returns the underlying raw CUDA event handle.
    ///
    /// # Safety
    ///
    /// The handle remains owned by this [`Event`], which destroys it when dropped. The caller
    /// must not destroy the handle and must not use it after this `Event` has been dropped.
    pub const unsafe fn as_raw(&self) -> runtime::cudaEvent_t {
        self.handle
    }
}
// SAFETY: NOTE(review): these impls assume that the CUDA runtime permits an event handle to be
// used from any host thread, and that sharing the `Arc<Context>` across threads is sound.
// Confirm both against the CUDA documentation and the `Context` implementation.
// `Event` exposes no `&mut self` methods besides `Drop`.
unsafe impl Send for Event {}
unsafe impl Sync for Event {}
impl Drop for Event {
    /// Destroys the underlying CUDA event.
    ///
    /// Errors cannot be propagated out of `drop`, so failures are reported on stderr in debug
    /// builds and silently ignored otherwise.
    fn drop(&mut self) {
        // Best effort: the destroy call below is still attempted when binding fails.
        // `cfg!` (rather than `#[cfg]`) keeps `err` used in release builds, avoiding an
        // unused-variable warning while compiling the print away.
        match self.ctx.bind() {
            Err(err) if cfg!(debug_assertions) => {
                eprintln!("failed to bind context before destroying event: {err}");
            }
            _ => {}
        }

        // SAFETY: `self.handle` is owned exclusively by this value and is never used again once
        // `drop` returns.
        let destroyed = unsafe { try_cuda!(runtime::cudaEventDestroy(self.handle)) };
        match destroyed {
            Err(err) if cfg!(debug_assertions) => {
                eprintln!("failed to destroy CUDA event: {err}");
            }
            _ => {}
        }
    }
}
impl Context {
pub fn create_event(self: &Arc<Self>) -> Result<Event> {
self.create_event_with_flags(EventFlags::DEFAULT)
}
/// Creates an event object for the current device with the specified flags.
/// Valid flags include:
///
/// * [`EventFlags::DEFAULT`]: Default event creation flag.
/// * [`EventFlags::BLOCKING_SYNC`]: Specifies that event should use blocking synchronization.
/// A host thread that uses [`Event::synchronize`] to wait on an event created with this flag will block until the event actually completes.
/// * [`EventFlags::DISABLE_TIMING`]: Specifies that the created event does not need to record timing data.
/// Events created with this flag specified and [`EventFlags::BLOCKING_SYNC`] not specified will provide the best performance when used with [`Stream::wait_event`] and [`Event::query`].
/// * [`EventFlags::INTERPROCESS`]: Specifies that the created event may be used as an interprocess event by [`sys::cudaIpcGetEventHandle`](singe_cuda_sys::runtime::cudaIpcGetEventHandle).
/// [`EventFlags::INTERPROCESS`] must be specified along with [`EventFlags::DISABLE_TIMING`].
///
/// Note:
///
/// * Note that this function may also return error codes from previous, asynchronous launches.
/// * Note that this function may also return [`ErrorCode::NotInitialized`], [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to initialize internal CUDA RT state.
/// * Note that as specified by [`Stream::add_callback`] no CUDA function may be called from callback.
/// [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in such case.
pub fn create_event_with_flags(self: &Arc<Self>, flags: EventFlags) -> Result<Event> {
self.bind()?;
let mut handle = ptr::null_mut();
unsafe {
try_cuda!(runtime::cudaEventCreateWithFlags(
&raw mut handle,
flags.bits()
))?;
}
if handle.is_null() {
return Err(Error::NullHandle);
}
Ok(Event {
handle,
ctx: Arc::clone(self),
})
}
}