// singe-cuda 0.1.0-alpha.3

#[allow(unused_imports)]
use crate::error::ErrorCode;

use std::{ptr, sync::Arc};

use singe_cuda_sys::{driver, runtime};

use crate::{
    context::Context,
    error::{Error, Result},
    stream::{BorrowedStream, Stream, StreamBinding},
    try_cuda,
};

bitflags::bitflags! {
    /// Flags for CUDA event creation ([`Context::create_event_with_flags`]).
    ///
    /// Each constant mirrors the driver-level `CUevent_flags` value of the same name.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct EventFlags: u32 {
        /// Default event creation flag.
        const DEFAULT = driver::CUevent_flags::CU_EVENT_DEFAULT as _;
        /// The event should use blocking synchronization: a host thread waiting on it with
        /// [`Event::synchronize`] blocks until the event actually completes.
        const BLOCKING_SYNC = driver::CUevent_flags::CU_EVENT_BLOCKING_SYNC as _;
        /// The event does not need to record timing data.
        const DISABLE_TIMING = driver::CUevent_flags::CU_EVENT_DISABLE_TIMING as _;
        /// The event may be used as an interprocess event.
        /// Must be specified along with [`EventFlags::DISABLE_TIMING`].
        const INTERPROCESS = driver::CUevent_flags::CU_EVENT_INTERPROCESS as _;
    }
}

bitflags::bitflags! {
    /// Flags for recording an event on a stream
    /// ([`Event::record`], [`Event::record_borrowed`], [`Event::record_on`]).
    ///
    /// The bits are passed unchanged to
    /// [`sys::cudaEventRecordWithFlags`](singe_cuda_sys::runtime::cudaEventRecordWithFlags).
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct EventRecordFlags: u32 {
        /// Default event record flag (`cudaEventRecordDefault`).
        const DEFAULT = runtime::cudaEventRecordDefault;
        /// `cudaEventRecordExternal`. Per the CUDA runtime documentation, the event is captured
        /// in the graph as an external event node when performing stream capture.
        const EXTERNAL = runtime::cudaEventRecordExternal;
    }
}

/// An owned CUDA event together with the [`Context`] it was created in.
///
/// Events are created with [`Context::create_event`] or [`Context::create_event_with_flags`].
/// The underlying CUDA event is destroyed when the `Event` is dropped.
#[derive(Debug)]
pub struct Event {
    // Raw CUDA event handle; destroyed in `Drop`.
    handle: runtime::cudaEvent_t,
    // Context the event was created in; holding the `Arc` keeps it alive for the event's lifetime.
    ctx: Arc<Context>,
}

impl Event {
    /// Records this event on an owned [`Stream`].
    ///
    /// Thin wrapper over
    /// [`sys::cudaEventRecordWithFlags`](singe_cuda_sys::runtime::cudaEventRecordWithFlags).
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_CONTEXT` without calling into CUDA if `stream` belongs to a
    /// different [`Context`] than this event. Otherwise returns any error produced while binding
    /// the event's context or by the CUDA call itself.
    pub fn record(&self, stream: &Stream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        // SAFETY: the raw stream handle is only forwarded to the CUDA call below, while `stream`
        // is still borrowed by this function.
        self.record_raw(unsafe { stream.as_raw() }, flags)
    }

    /// Records this event on a [`BorrowedStream`].
    ///
    /// Behaves like [`Event::record`], including the context check.
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_CONTEXT` without calling into CUDA if `stream` belongs to a
    /// different [`Context`] than this event. Otherwise returns any error produced while binding
    /// the event's context or by the CUDA call itself.
    pub fn record_borrowed(&self, stream: &BorrowedStream, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.record_raw(stream.as_raw(), flags)
    }

    /// Records this event on the stream referred to by a [`StreamBinding`].
    ///
    /// Behaves like [`Event::record`], including the context check.
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_CONTEXT` without calling into CUDA if `stream` belongs to a
    /// different [`Context`] than this event. Otherwise returns any error produced while binding
    /// the event's context or by the CUDA call itself.
    pub fn record_on(&self, stream: &StreamBinding, flags: EventRecordFlags) -> Result<()> {
        if stream.context() != self.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }
        self.record_raw(stream.as_raw(), flags)
    }

    /// Binds this event's context and records the event on the raw `stream` handle.
    ///
    /// Callers are expected to have verified that `stream` belongs to this event's context.
    fn record_raw(&self, stream: runtime::cudaStream_t, flags: EventRecordFlags) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: `self.as_raw()` yields the handle owned by this `Event`, which is alive for the
        // duration of the call; `stream` is supplied by the checked public wrappers.
        unsafe {
            try_cuda!(runtime::cudaEventRecordWithFlags(
                self.as_raw(),
                stream,
                flags.bits(),
            ))?;
        }
        Ok(())
    }

    /// Queries the status of all work currently captured by event.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what is captured by an event.
    ///
    /// Returns `Ok(true)` if all captured work has been completed, or `Ok(false)` if any captured work is incomplete
    /// (the CUDA "not ready" status is not reported as an error).
    ///
    /// For the purposes of Unified Memory, a return value of `true` is equivalent to having called [`Event::synchronize`].
    ///
    /// The event's context is bound before the query is issued, as for the other operations on [`Event`].
    ///
    /// # Errors
    ///
    /// Returns an error if binding the context fails, or if CUDA reports any status other than success or not-ready.
    ///
    /// Note:
    ///
    /// * Note that this function may also return error codes from previous, asynchronous launches.
    /// * Note that this function may also return [`ErrorCode::NotInitialized`], [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to initialize internal CUDA RT state.
    /// * Note that as specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in such case.
    pub fn query(&self) -> Result<bool> {
        // Bind first so the query runs against this event's context, consistent with
        // `synchronize`, `record_*` and `elapsed_time_since`.
        self.ctx.bind()?;
        // SAFETY: the handle is owned by this `Event` and stays valid for the duration of the call.
        let error = unsafe { runtime::cudaEventQuery(self.as_raw()) };
        match error {
            runtime::cudaError_t::CUDA_SUCCESS => Ok(true),
            // "Not ready" is the expected answer while work is pending, not a failure.
            runtime::cudaError_t::CUDA_ERROR_NOT_READY => Ok(false),
            _ => Err(error.into()),
        }
    }

    /// Waits until the completion of all work currently captured in event.
    /// See [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) for details on what is captured by an event.
    ///
    /// Waiting for an event created with [`EventFlags::BLOCKING_SYNC`] causes the calling CPU thread to block until the event has been completed by the device.
    /// Without [`EventFlags::BLOCKING_SYNC`], the CPU thread will busy-wait until the event has been completed by the device.
    ///
    /// # Errors
    ///
    /// Returns an error if binding the context fails or if the CUDA call reports a failure.
    ///
    /// Note:
    ///
    /// * Note that this function may also return error codes from previous, asynchronous launches.
    /// * Note that this function may also return [`ErrorCode::NotInitialized`], [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to initialize internal CUDA RT state.
    /// * Note that as specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in such case.
    pub fn synchronize(&self) -> Result<()> {
        self.ctx.bind()?;
        // SAFETY: the handle is owned by this `Event` and stays valid for the duration of the call.
        unsafe {
            try_cuda!(runtime::cudaEventSynchronize(self.as_raw()))?;
        }
        Ok(())
    }

    /// Computes the elapsed time from `start` to this event (in milliseconds with a resolution of around 0.5 microseconds).
    /// Note this API is not guaranteed to return the latest errors for pending work.
    /// As such this API is intended to serve as an elapsed time calculation only and polling for completion on the events to be compared should be done with [`Event::query`] instead.
    ///
    /// If either event was last recorded in a non-default stream, the resulting time may be greater than expected, even if both used the same stream handle.
    /// This happens because the [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) operation takes place asynchronously and there is no guarantee that the measured latency is actually just between the two events.
    /// Any number of other different stream operations could execute in between the two measured events, thus altering the timing in a significant way.
    ///
    /// # Errors
    ///
    /// If the two events belong to different contexts, `CUDA_ERROR_INVALID_CONTEXT` is returned without calling into CUDA.
    /// If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has not been called on either event, then [`ErrorCode::InvalidHandle`] is returned.
    /// If [`sys::cudaEventRecord`](singe_cuda_sys::runtime::cudaEventRecord) has been called on both events but one or both of them has not yet been completed (that is, [`Event::query`] would return `Ok(false)` on at least one of the events), [`ErrorCode::NotReady`] is returned.
    /// If either event was created with [`EventFlags::DISABLE_TIMING`], then this function will return [`ErrorCode::InvalidHandle`].
    ///
    /// Note:
    ///
    /// * Note that this function may also return error codes from previous, asynchronous launches.
    /// * Note that this function may also return [`ErrorCode::NotInitialized`], [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to initialize internal CUDA RT state.
    /// * Note that as specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in such case.
    pub fn elapsed_time_since(&self, start: &Event) -> Result<f32> {
        if self.context() != start.context() {
            return Err(runtime::cudaError_t::CUDA_ERROR_INVALID_CONTEXT.into());
        }

        self.ctx.bind()?;
        let mut milliseconds = 0.0f32;
        // SAFETY: both handles are owned by live `Event`s borrowed for this call, and
        // `milliseconds` is a valid, writable `f32` for the output pointer.
        unsafe {
            try_cuda!(runtime::cudaEventElapsedTime(
                &raw mut milliseconds,
                start.as_raw(),
                self.as_raw(),
            ))?;
        }
        Ok(milliseconds)
    }

    /// Returns the context this event is associated with.
    pub fn context(&self) -> &Context {
        &self.ctx
    }

    /// Returns the raw CUDA event handle.
    ///
    /// # Safety
    ///
    /// The handle remains owned by this `Event` and is destroyed when the `Event` is dropped.
    /// The caller must not destroy it and must not use it after the `Event` has been dropped.
    pub const unsafe fn as_raw(&self) -> runtime::cudaEvent_t {
        self.handle
    }
}

// SAFETY: NOTE(review): `Event` holds a raw CUDA event handle plus an `Arc<Context>`.
// Moving it across threads is assumed sound because the handle is not tied to the creating
// thread and `Context` is itself shareable — confirm against the CUDA threading guarantees.
unsafe impl Send for Event {}

// SAFETY: NOTE(review): all methods take `&self` and only pass the handle to CUDA calls;
// this presumes those calls tolerate concurrent use of the same event handle — confirm.
unsafe impl Sync for Event {}

impl Drop for Event {
    /// Destroys the underlying CUDA event on a best-effort basis.
    ///
    /// Failures cannot be propagated from `drop`, so they are reported on stderr in builds with
    /// debug assertions enabled and silently ignored otherwise.
    fn drop(&mut self) {
        // Destruction is still attempted if binding fails, so the handle is not leaked just
        // because the context could not be made current.
        if let Err(err) = self.ctx.bind() {
            // `cfg!` (rather than `#[cfg]`) keeps `err` used in release builds as well.
            if cfg!(debug_assertions) {
                eprintln!("failed to bind context before destroying event: {err}");
            }
        }
        // SAFETY: the handle is owned by this `Event` (the type is not `Clone`), and `drop`
        // runs at most once, so the event is destroyed exactly once here.
        unsafe {
            if let Err(err) = try_cuda!(runtime::cudaEventDestroy(self.handle)) {
                if cfg!(debug_assertions) {
                    eprintln!("failed to destroy CUDA event: {err}");
                }
            }
        }
    }
}

impl Context {
    pub fn create_event(self: &Arc<Self>) -> Result<Event> {
        self.create_event_with_flags(EventFlags::DEFAULT)
    }

    /// Creates an event object for the current device with the specified flags.
    /// Valid flags include:
    ///
    /// * [`EventFlags::DEFAULT`]: Default event creation flag.
    /// * [`EventFlags::BLOCKING_SYNC`]: Specifies that event should use blocking synchronization.
    ///   A host thread that uses [`Event::synchronize`] to wait on an event created with this flag will block until the event actually completes.
    /// * [`EventFlags::DISABLE_TIMING`]: Specifies that the created event does not need to record timing data.
    ///   Events created with this flag specified and [`EventFlags::BLOCKING_SYNC`] not specified will provide the best performance when used with [`Stream::wait_event`] and [`Event::query`].
    /// * [`EventFlags::INTERPROCESS`]: Specifies that the created event may be used as an interprocess event by [`sys::cudaIpcGetEventHandle`](singe_cuda_sys::runtime::cudaIpcGetEventHandle).
    ///   [`EventFlags::INTERPROCESS`] must be specified along with [`EventFlags::DISABLE_TIMING`].
    ///
    /// Note:
    ///
    /// * Note that this function may also return error codes from previous, asynchronous launches.
    /// * Note that this function may also return [`ErrorCode::NotInitialized`], [`ErrorCode::CallRequiresNewerDriver`] or [`ErrorCode::NoDevice`] if this call tries to initialize internal CUDA RT state.
    /// * Note that as specified by [`Stream::add_callback`] no CUDA function may be called from callback.
    ///   [`ErrorCode::NotPermitted`] may, but is not guaranteed to, be returned as a diagnostic in such case.
    pub fn create_event_with_flags(self: &Arc<Self>, flags: EventFlags) -> Result<Event> {
        self.bind()?;
        let mut handle = ptr::null_mut();
        unsafe {
            try_cuda!(runtime::cudaEventCreateWithFlags(
                &raw mut handle,
                flags.bits()
            ))?;
        }
        if handle.is_null() {
            return Err(Error::NullHandle);
        }
        Ok(Event {
            handle,
            ctx: Arc::clone(self),
        })
    }
}