Skip to main content

singe_cuda/
event.rs

1use std::{cmp::Ordering, mem::ManuallyDrop, ptr, sync::Arc};
2
3use singe_cuda_sys::{driver, runtime};
4
5use crate::{
6    context::Context,
7    error::{Error, Result},
8    stream::{BorrowedStream, Stream, StreamBinding},
9    try_ffi,
10};
11
12bitflags::bitflags! {
13    /// Flags for CUDA event creation ([`Context::create_event_with_flags`]).
14    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
15    pub struct EventFlags: u32 {
16        const DEFAULT = driver::CUevent_flags::CU_EVENT_DEFAULT as _;
17        const BLOCKING_SYNC = driver::CUevent_flags::CU_EVENT_BLOCKING_SYNC as _;
18        const DISABLE_TIMING = driver::CUevent_flags::CU_EVENT_DISABLE_TIMING as _;
19        const INTERPROCESS = driver::CUevent_flags::CU_EVENT_INTERPROCESS as _;
20    }
21}
22
bitflags::bitflags! {
    /// Flags accepted by [`Event::record`], [`Event::record_borrowed`] and
    /// [`Event::record_on`].
    ///
    /// The bits are forwarded unchanged as the `flags` argument of
    /// `cudaEventRecordWithFlags`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct EventRecordFlags: u32 {
        /// Mirrors `cudaEventRecordDefault`.
        const DEFAULT = runtime::cudaEventRecordDefault;
        /// Mirrors `cudaEventRecordExternal`.
        ///
        /// NOTE(review): presumably only meaningful while the stream is being
        /// captured into a graph — confirm against the CUDA runtime docs.
        const EXTERNAL = runtime::cudaEventRecordExternal;
    }
}
31
/// Owned CUDA event handle paired with the context it belongs to.
///
/// The handle is destroyed with `cudaEventDestroy` when the value is dropped,
/// unless ownership is released first through [`Event::into_raw`]. Equality
/// and ordering are based on the handle address and on the identity of the
/// context `Arc`, not on any event state.
#[derive(Debug)]
pub struct Event {
    /// Raw CUDA event handle; checked to be non-null in [`Event::from_raw`].
    handle: runtime::cudaEvent_t,
    /// Context that is bound before every CUDA call made through this event.
    ctx: Arc<Context>,
}
37
38impl PartialEq for Event {
39    fn eq(&self, other: &Self) -> bool {
40        self.handle == other.handle && Arc::ptr_eq(&self.ctx, &other.ctx)
41    }
42}
43
// `PartialEq for Event` compares a raw handle address and `Arc` pointer
// identity, both of which are reflexive, so equality is a full equivalence
// relation.
impl Eq for Event {}
45
46impl PartialOrd for Event {
47    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
48        Some(self.cmp(other))
49    }
50}
51
52impl Ord for Event {
53    fn cmp(&self, other: &Self) -> Ordering {
54        let lhs = (Arc::as_ptr(&self.ctx) as usize, self.handle as usize);
55        let rhs = (Arc::as_ptr(&other.ctx) as usize, other.handle as usize);
56        lhs.cmp(&rhs)
57    }
58}
59
60impl Event {
61    /// Wraps an existing CUDA event handle.
62    ///
63    /// # Safety
64    ///
65    /// `handle` must be a valid CUDA event owned by `ctx`, and ownership of the
66    /// handle is transferred to the returned [`Event`]. The handle must not be
67    /// destroyed elsewhere after calling this function.
68    pub unsafe fn from_raw(handle: runtime::cudaEvent_t, ctx: Arc<Context>) -> Result<Self> {
69        if handle.is_null() {
70            return Err(Error::NullHandle);
71        }
72
73        Ok(Self { handle, ctx })
74    }
75
76    pub fn record(&self, stream: &Stream, flags: EventRecordFlags) -> Result<()> {
77        if stream.context() != self.context() {
78            return Err(Error::StreamContextMismatch);
79        }
80        self.record_raw(stream.as_raw(), flags)
81    }
82
83    pub fn record_borrowed(&self, stream: &BorrowedStream, flags: EventRecordFlags) -> Result<()> {
84        if stream.context() != self.context() {
85            return Err(Error::StreamContextMismatch);
86        }
87        self.record_raw(stream.as_raw(), flags)
88    }
89
90    pub fn record_on(&self, stream: &StreamBinding, flags: EventRecordFlags) -> Result<()> {
91        if stream.context() != self.context() {
92            return Err(Error::StreamContextMismatch);
93        }
94        self.record_raw(stream.as_raw(), flags)
95    }
96
97    fn record_raw(&self, stream: runtime::cudaStream_t, flags: EventRecordFlags) -> Result<()> {
98        self.ctx.bind()?;
99        unsafe {
100            try_ffi!(runtime::cudaEventRecordWithFlags(
101                self.as_raw(),
102                stream,
103                flags.bits(),
104            ))?;
105        }
106        Ok(())
107    }
108
109    /// Queries the status of all work currently captured by event.
110    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what is captured by an event.
111    ///
112    /// Returns `true` if all captured work has been completed, or `false` if any captured work is incomplete.
113    ///
114    /// For the purposes of Unified Memory, a return value of `true` is equivalent to having called [`Event::synchronize`].
115    ///
116    /// # Errors
117    ///
118    /// Returns an error if CUDA cannot query the event. CUDA may also report
119    /// errors from previous asynchronous launches, internal runtime
120    /// initialization errors such as [`crate::error::Status::NotInitialized`],
121    /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`], and
122    /// callback diagnostics such as [`crate::error::Status::NotPermitted`].
123    pub fn query(&self) -> Result<bool> {
124        self.ctx.bind()?;
125        let error = unsafe { runtime::cudaEventQuery(self.as_raw()) };
126        match error {
127            runtime::cudaError_t::CUDA_SUCCESS => Ok(true),
128            runtime::cudaError_t::CUDA_ERROR_NOT_READY => Ok(false),
129            _ => Err(error.into()),
130        }
131    }
132
133    /// Waits until the completion of all work currently captured in event.
134    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what is captured by an event.
135    ///
136    /// Waiting for an event created with [`EventFlags::BLOCKING_SYNC`] causes the calling CPU thread to block until the event has been completed by the device.
137    /// Without [`EventFlags::BLOCKING_SYNC`], the CPU thread will busy-wait until the event has been completed by the device.
138    ///
139    /// # Errors
140    ///
141    /// Returns an error if CUDA cannot wait for the event. CUDA may also report
142    /// errors from previous asynchronous launches, internal runtime
143    /// initialization errors such as [`crate::error::Status::NotInitialized`],
144    /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`], and
145    /// callback diagnostics such as [`crate::error::Status::NotPermitted`].
146    pub fn synchronize(&self) -> Result<()> {
147        self.ctx.bind()?;
148        unsafe {
149            try_ffi!(runtime::cudaEventSynchronize(self.as_raw()))?;
150        }
151        Ok(())
152    }
153
154    /// Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds).
155    /// Note this call is not guaranteed to return the latest errors for pending work.
156    /// Use it only for elapsed-time calculation; poll for completion on the events to be compared with [`Event::query`] instead.
157    ///
158    /// If either event was last recorded in a non-default stream, the resulting time may be greater than expected, even if both used the same stream handle.
159    /// This happens because the [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) operation takes place asynchronously and there is no guarantee that the measured latency is actually just between the two events.
160    /// Any number of other different stream operations could execute in between the two measured events, thus altering the timing in a significant way.
161    ///
162    /// If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has not been called on either event, then [`crate::error::Status::InvalidHandle`] is returned.
163    /// If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has been called on both events but one or both of them has not yet been completed (that is, [`Event::query`] would return [`crate::error::Status::NotReady`] on at least one of the events), [`crate::error::Status::NotReady`] is returned.
164    /// If either event was created with [`EventFlags::DISABLE_TIMING`], this returns [`crate::error::Status::InvalidHandle`].
165    ///
166    /// # Errors
167    ///
168    /// Returns an error if the events belong to different contexts, either
169    /// event has not been recorded, either event is incomplete, timing was
170    /// disabled on either event, or CUDA rejects the elapsed-time query. CUDA
171    /// may also report errors from previous asynchronous launches, internal
172    /// runtime initialization errors such as [`crate::error::Status::NotInitialized`],
173    /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`], and
174    /// callback diagnostics such as [`crate::error::Status::NotPermitted`].
175    pub fn elapsed_time_since(&self, start: &Event) -> Result<f32> {
176        if self.context() != start.context() {
177            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
178        }
179
180        self.ctx.bind()?;
181        let mut milliseconds = 0.0f32;
182        unsafe {
183            try_ffi!(runtime::cudaEventElapsedTime(
184                &raw mut milliseconds,
185                start.as_raw(),
186                self.as_raw(),
187            ))?;
188        }
189        Ok(milliseconds)
190    }
191
192    pub fn context(&self) -> &Context {
193        &self.ctx
194    }
195
196    pub const fn as_raw(&self) -> runtime::cudaEvent_t {
197        self.handle
198    }
199
200    /// Consumes the event and returns the raw CUDA event handle without
201    /// destroying it.
202    ///
203    /// The caller becomes responsible for eventually destroying the returned
204    /// handle with CUDA.
205    pub fn into_raw(self) -> runtime::cudaEvent_t {
206        let event = ManuallyDrop::new(self);
207        event.handle
208    }
209}
210
// CUDA events are synchronization handles. Recording/waiting uses CUDA's event
// semantics and does not mutate Rust-owned state through shared references.
//
// SAFETY: the raw `handle` pointer is what prevents the automatic `Send`/`Sync`
// impls. No `&self` method writes to `handle` or `ctx`; the handle is only
// passed by value to CUDA calls.
// NOTE(review): this additionally relies on the CUDA runtime allowing the same
// event to be used from several host threads, and on `Context` itself being
// safe to share across threads — confirm both.
unsafe impl Send for Event {}
unsafe impl Sync for Event {}
215
216impl Drop for Event {
217    fn drop(&mut self) {
218        if let Err(err) = self.ctx.bind() {
219            #[cfg(debug_assertions)]
220            eprintln!("failed to bind context before destroying event: {err}");
221        }
222        unsafe {
223            if let Err(err) = try_ffi!(runtime::cudaEventDestroy(self.handle)) {
224                #[cfg(debug_assertions)]
225                eprintln!("failed to destroy CUDA event: {err}");
226            }
227        }
228    }
229}
230
231impl Context {
232    pub fn create_event(self: &Arc<Self>) -> Result<Event> {
233        self.create_event_with_flags(EventFlags::DEFAULT)
234    }
235
236    /// Creates an event object for the current device with the specified flags.
237    /// Valid flags include:
238    ///
239    /// * [`EventFlags::DEFAULT`]: default event creation flag.
240    /// * [`EventFlags::BLOCKING_SYNC`]: the event uses blocking synchronization.
241    ///   A host thread that uses [`Event::synchronize`] to wait on an event created with this flag will block until the event actually completes.
242    /// * [`EventFlags::DISABLE_TIMING`]: the created event does not record timing data.
243    ///   Events created with this flag specified and [`EventFlags::BLOCKING_SYNC`] not specified will provide the best performance when used with [`Stream::wait_event`] and [`Event::query`].
244    /// * [`EventFlags::INTERPROCESS`]: the created event may be used as an interprocess event by [`sys::cudaIpcGetEventHandle`](singe_cuda_sys::runtime::cudaIpcGetEventHandle).
245    ///   [`EventFlags::INTERPROCESS`] must be specified along with [`EventFlags::DISABLE_TIMING`].
246    ///
247    /// # Errors
248    ///
249    /// Returns an error if the context cannot be bound, the flag combination is
250    /// invalid, CUDA cannot create the event, or CUDA returns a null event
251    /// handle. CUDA may also report errors from previous asynchronous launches,
252    /// internal runtime initialization errors such as
253    /// [`crate::error::Status::NotInitialized`], [`crate::error::Status::CallRequiresNewerDriver`],
254    /// or [`crate::error::Status::NoDevice`], and callback diagnostics such as
255    /// [`crate::error::Status::NotPermitted`].
256    pub fn create_event_with_flags(self: &Arc<Self>, flags: EventFlags) -> Result<Event> {
257        self.bind()?;
258        let mut handle = ptr::null_mut();
259        unsafe {
260            try_ffi!(runtime::cudaEventCreateWithFlags(
261                &raw mut handle,
262                flags.bits()
263            ))?;
264        }
265        if handle.is_null() {
266            return Err(Error::NullHandle);
267        }
268        unsafe { Event::from_raw(handle, Arc::clone(self)) }
269    }
270}