singe-cuda 0.1.0-alpha.5

#[allow(unused_imports)]
use crate::error::Status;

use std::{
    ffi::CString,
    fmt::{self, Display, Formatter},
    ptr,
};

use num_enum::{IntoPrimitive, TryFromPrimitive};
use singe_core::impl_enum_conversion;
use singe_cuda_sys::{driver, runtime};

use crate::{
    dim::Dim3,
    error::{Error, Result},
    event::Event,
    memory::{MemoryAccessDescriptor, MemoryCopyKind, MemoryPoolProps},
    module::{KernelLaunchArgs, LaunchConfig},
    stream::Stream,
    try_ffi,
    types::{DeviceFunction, DevicePtr},
    view::{ByteBuffer, ByteBufferMut},
};

use raw::{
    HostNodeParams, Memcpy1DNodeParams, Memcpy3DNodeParams, MemcpyFromSymbolNodeParams,
    MemcpyToSymbolNodeParams,
};

/// Identifiers for [`GraphKernelNodeAttribute`] values used by CUDA graph kernel nodes.
///
/// Every discriminant is copied from the matching
/// `runtime::cudaLaunchAttributeID` constant, and the enum is `#[repr(u32)]`
/// with `TryFromPrimitive` / `IntoPrimitive` derived for `u32` conversions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum GraphKernelNodeAttributeId {
    /// Identifies [`GraphKernelNodeAttribute::Cooperative`].
    Cooperative = runtime::cudaLaunchAttributeID::cudaLaunchAttributeCooperative as _,
    /// Identifies [`GraphKernelNodeAttribute::ClusterDimension`].
    ClusterDimension = runtime::cudaLaunchAttributeID::cudaLaunchAttributeClusterDimension as _,
    /// Identifies [`GraphKernelNodeAttribute::Priority`].
    Priority = runtime::cudaLaunchAttributeID::cudaLaunchAttributePriority as _,
    /// Identifies [`GraphKernelNodeAttribute::PreferredSharedMemoryCarveout`].
    /// The value is a percentage in the range `0..=100` describing the preferred
    /// shared-memory carveout for the launch. This is a hint, and the driver
    /// may choose a different configuration if required.
    PreferredSharedMemoryCarveout =
        runtime::cudaLaunchAttributeID::cudaLaunchAttributePreferredSharedMemoryCarveout as _,
}

// Conversion glue between `GraphKernelNodeAttributeId` and
// `runtime::cudaLaunchAttributeID`; the kernel-node attribute accessors in this
// file pass the id to CUDA with `id.into()`.
// NOTE(review): the exact impls generated live in `singe_core` — confirm there.
impl_enum_conversion!(
    u32,
    runtime::cudaLaunchAttributeID,
    GraphKernelNodeAttributeId
);

/// A kernel-node attribute together with its value.
///
/// Each variant pairs with the [`GraphKernelNodeAttributeId`] of the same name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GraphKernelNodeAttribute {
    /// Cooperative-launch flag; written to and read from CUDA as a zero/non-zero integer.
    Cooperative(bool),
    /// Cluster dimensions, transferred to CUDA as separate `x`, `y`, and `z` components.
    ClusterDimension(Dim3),
    /// Priority value for the kernel node.
    Priority(i32),
    /// Preferred shared-memory carveout; see
    /// [`GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout`] for the value range.
    PreferredSharedMemoryCarveout(u32),
}

/// Address and size reported for a memory-allocation graph node.
///
/// Produced by [`GraphNode::mem_alloc_node_info`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct MemoryAllocationNodeInfo {
    // Kept private; read it through `MemoryAllocationNodeInfo::ptr`.
    ptr: DevicePtr,
    /// Size of the allocation in bytes.
    pub byte_size: usize,
}

bitflags::bitflags! {
    /// Flags accepted when instantiating a [`Graph`] into an executable graph.
    ///
    /// Bit values are taken from `driver::CUgraphInstantiate_flags`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphInstantiateFlags: u64 {
        /// Automatically free unfreed graph memory allocations before the graph is relaunched.
        const AUTO_FREE_ON_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH as _;
        /// Mirrors `CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD`.
        const UPLOAD = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD as _;
        /// Configure the graph for launch from the device.
        const DEVICE_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH as _;
        /// Mirrors `CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY`.
        const USE_NODE_PRIORITY = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY as _;
    }
}

bitflags::bitflags! {
    /// Flags mirroring `driver::CUgraphDebugDot_flags`.
    ///
    /// Each constant reuses the bit value of the driver flag it names; the
    /// `*_NODE_PARAMS` flags are named after the node kind they refer to.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphDebugDotFlags: u32 {
        const VERBOSE = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE as _;
        const RUNTIME_TYPES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES as _;
        const KERNEL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS as _;
        const MEMCPY_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS as _;
        const MEMSET_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS as _;
        const HOST_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS as _;
        const EVENT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_SIGNAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_WAIT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS as _;
        const KERNEL_NODE_ATTRIBUTES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES as _;
        const HANDLES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES as _;
        const MEMORY_ALLOC_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS as _;
        const MEMORY_FREE_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS as _;
        const BATCH_MEM_OP_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS as _;
        const EXTRA_TOPOLOGY_INFO = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO as _;
        const CONDITIONAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS as _;
    }
}

/// Kind of a graph node, as reported by [`GraphNode::node_type`].
///
/// Discriminants are copied from `driver::CUgraphNodeType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum GraphNodeType {
    Kernel = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL as _,
    Memcpy = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMCPY as _,
    Memset = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMSET as _,
    Host = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_HOST as _,
    Graph = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_GRAPH as _,
    Empty = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EMPTY as _,
    WaitEvent = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_WAIT_EVENT as _,
    EventRecord = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EVENT_RECORD as _,
    ExternalSemaphoresSignal = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL as _,
    ExternalSemaphoresWait = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT as _,
    MemoryAlloc = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_ALLOC as _,
    MemoryFree = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_FREE as _,
    BatchMemOp = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_BATCH_MEM_OP as _,
    Conditional = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_CONDITIONAL as _,
}

// Conversion glue between `GraphNodeType` and `runtime::cudaGraphNodeType`;
// `GraphNode::node_type` relies on it through `kind.into()`.
// NOTE(review): discriminants above come from `driver::CUgraphNodeType` while
// this targets the runtime type — presumably both share the same values; confirm.
impl_enum_conversion!(u32, runtime::cudaGraphNodeType, GraphNodeType);

/// Dependency type carried by a graph edge (see [`GraphEdgeData`]).
///
/// Stored as `u8`; discriminants are copied from `driver::CUgraphDependencyType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u8)]
pub enum GraphDependencyType {
    /// Mirrors `CU_GRAPH_DEPENDENCY_TYPE_DEFAULT`.
    Default = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT as _,
    /// Mirrors `CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC`.
    Programmatic = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC as _,
}

impl From<driver::CUgraphDependencyType> for GraphDependencyType {
    fn from(value: driver::CUgraphDependencyType) -> Self {
        match value {
            driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT => Self::Default,
            driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC => {
                Self::Programmatic
            }
        }
    }
}

impl From<GraphDependencyType> for driver::CUgraphDependencyType {
    fn from(value: GraphDependencyType) -> Self {
        match value {
            GraphDependencyType::Default => Self::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT,
            GraphDependencyType::Programmatic => Self::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC,
        }
    }
}

impl Display for GraphNodeType {
    /// Writes a fixed `cudaGraphNodeType…` style name for the variant.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Pick the static name first, then emit it with a single write.
        let name = match self {
            Self::Kernel => "cudaGraphNodeTypeKernel",
            Self::Memcpy => "cudaGraphNodeTypeMemcpy",
            Self::Memset => "cudaGraphNodeTypeMemset",
            Self::Host => "cudaGraphNodeTypeHost",
            Self::Graph => "cudaGraphNodeTypeGraph",
            Self::Empty => "cudaGraphNodeTypeEmpty",
            Self::WaitEvent => "cudaGraphNodeTypeWaitEvent",
            Self::EventRecord => "cudaGraphNodeTypeEventRecord",
            Self::ExternalSemaphoresSignal => "cudaGraphNodeTypeExternalSemaphoresSignal",
            Self::ExternalSemaphoresWait => "cudaGraphNodeTypeExternalSemaphoresWait",
            Self::MemoryAlloc => "cudaGraphNodeTypeMemAlloc",
            Self::MemoryFree => "cudaGraphNodeTypeMemFree",
            Self::BatchMemOp => "cudaGraphNodeTypeBatchMemOp",
            Self::Conditional => "cudaGraphNodeTypeConditional",
        };
        f.write_str(name)
    }
}

/// Result codes mirroring `driver::CUgraphExecUpdateResult`.
///
/// Each variant reuses the numeric value of the driver constant it is
/// assigned from; the `Display` impl prints that constant's name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum GraphExecUpdateResult {
    Success = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_SUCCESS as _,
    Error = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR as _,
    ErrorTopologyChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED as _,
    ErrorNodeTypeChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED as _,
    ErrorFunctionChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED as _,
    ErrorParametersChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED as _,
    ErrorNotSupported =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED as _,
    ErrorUnsupportedFunctionChange =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE
            as _,
    ErrorAttributesChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED as _,
}

// Conversion glue between `GraphExecUpdateResult` and
// `driver::CUgraphExecUpdateResult`.
// NOTE(review): this invocation omits the leading primitive-type argument used
// by the other `impl_enum_conversion!` calls in this file — presumably a
// different macro arm; confirm in `singe_core`.
impl_enum_conversion!(driver::CUgraphExecUpdateResult, GraphExecUpdateResult);

impl Display for GraphExecUpdateResult {
    /// Writes the name of the driver constant this variant mirrors.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Success => "CU_GRAPH_EXEC_UPDATE_SUCCESS",
            Self::Error => "CU_GRAPH_EXEC_UPDATE_ERROR",
            Self::ErrorTopologyChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED",
            Self::ErrorNodeTypeChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED",
            Self::ErrorFunctionChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED",
            Self::ErrorParametersChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED",
            Self::ErrorNotSupported => "CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED",
            Self::ErrorUnsupportedFunctionChange => {
                "CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE"
            }
            Self::ErrorAttributesChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED",
        };
        f.write_str(name)
    }
}

/// Copyable wrapper around a raw `cudaGraphNode_t` handle.
///
/// Equality and hashing are derived from the handle field.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphNode {
    // Raw node handle; exposed read-only through `GraphNode::as_raw`.
    handle: runtime::cudaGraphNode_t,
}

/// Per-edge data attached to a graph dependency.
///
/// Converts to and from `runtime::cudaGraphEdgeData`; the `Default` impl uses
/// port `0` on both ends and [`GraphDependencyType::Default`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphEdgeData {
    /// Mirrors `cudaGraphEdgeData::from_port`.
    pub from_port: u8,
    /// Mirrors `cudaGraphEdgeData::to_port`.
    pub to_port: u8,
    /// Mirrors `cudaGraphEdgeData::type_`.
    pub dependency_type: GraphDependencyType,
}

/// A neighbouring node together with the data of the edge that connects it.
///
/// Returned by [`GraphNode::dependencies`] and [`GraphNode::dependent_nodes`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphDependency {
    /// The node on the other end of the edge.
    pub node: GraphNode,
    /// Data associated with the edge.
    pub data: GraphEdgeData,
}

/// An edge between two graph nodes with its associated edge data.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphEdge {
    /// Node the edge starts at.
    pub from: GraphNode,
    /// Node the edge ends at.
    pub to: GraphNode,
    /// Data associated with the edge.
    pub data: GraphEdgeData,
}

/// Three-component position that converts into `runtime::cudaPos`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Position {
    /// Becomes `cudaPos::x`.
    pub x: usize,
    /// Becomes `cudaPos::y`.
    pub y: usize,
    /// Becomes `cudaPos::z`.
    pub z: usize,
}

/// Three-component extent that converts into `runtime::cudaExtent`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Extent {
    /// Becomes `cudaExtent::width`.
    pub width: usize,
    /// Becomes `cudaExtent::height`.
    pub height: usize,
    /// Becomes `cudaExtent::depth`.
    pub depth: usize,
}

// TODO: maybe remove?
pub mod raw {
    use std::ptr;

    use singe_cuda_sys::{driver, runtime};

    use crate::{memory::MemoryCopyKind, types::HostFunction};

    use super::{ArrayHandle, Extent, Position};

    /// Raw pitched-pointer description that converts into `runtime::cudaPitchedPtr`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct PitchedPtr {
        // Private so the pointer can only be supplied through the unsafe constructor.
        ptr: *mut (),
        /// Becomes `cudaPitchedPtr::pitch`.
        pub pitch: usize,
        /// Becomes `cudaPitchedPtr::xsize`.
        pub x_size: usize,
        /// Becomes `cudaPitchedPtr::ysize`.
        pub y_size: usize,
    }

    impl PitchedPtr {
        /// Builds a pitched pointer description around a raw device or mapped
        /// host pointer.
        ///
        /// # Safety
        ///
        /// When CUDA evaluates a graph node built from this value, `ptr` must be
        /// valid for every row described by `pitch`, `x_size`, and `y_size`.
        pub const unsafe fn new(ptr: *mut (), pitch: usize, x_size: usize, y_size: usize) -> Self {
            Self {
                y_size,
                x_size,
                pitch,
                ptr,
            }
        }

        /// Returns the raw pointer this value was constructed with.
        pub const fn ptr(self) -> *mut () {
            let Self { ptr, .. } = self;
            ptr
        }
    }

    /// Parameters for a 3D memcpy node; converts into `runtime::cudaMemcpy3DParms`.
    ///
    /// A `None` array handle is converted to a null array pointer.
    #[derive(Debug, Clone, Copy)]
    pub struct Memcpy3DNodeParams {
        /// Becomes `srcArray` (null when `None`).
        pub src_array: Option<ArrayHandle>,
        /// Becomes `srcPos`.
        pub src_pos: Position,
        /// Becomes `srcPtr`.
        pub src_ptr: PitchedPtr,
        /// Becomes `dstArray` (null when `None`).
        pub dst_array: Option<ArrayHandle>,
        /// Becomes `dstPos`.
        pub dst_pos: Position,
        /// Becomes `dstPtr`.
        pub dst_ptr: PitchedPtr,
        /// Becomes `extent`.
        pub extent: Extent,
        /// Becomes `kind`.
        pub kind: MemoryCopyKind,
    }

    /// Raw parameters for a memcpy-to-symbol node.
    // NOTE(review): the consumer of this struct is outside this view; the field
    // notes below follow the field names only — confirm against the caller.
    #[derive(Debug, Clone, Copy)]
    pub struct MemcpyToSymbolNodeParams {
        /// Raw pointer identifying the destination symbol.
        pub symbol: *const (),
        /// Raw source pointer.
        pub src: *const (),
        /// Presumably the number of bytes to copy.
        pub count: usize,
        /// Presumably a byte offset relative to the symbol.
        pub offset: usize,
        /// Copy kind.
        pub kind: MemoryCopyKind,
    }

    /// Raw parameters for a memcpy-from-symbol node.
    // NOTE(review): the consumer of this struct is outside this view; the field
    // notes below follow the field names only — confirm against the caller.
    #[derive(Debug, Clone, Copy)]
    pub struct MemcpyFromSymbolNodeParams {
        /// Raw destination pointer.
        pub dst: *mut (),
        /// Raw pointer identifying the source symbol.
        pub symbol: *const (),
        /// Presumably the number of bytes to copy.
        pub count: usize,
        /// Presumably a byte offset relative to the symbol.
        pub offset: usize,
        /// Copy kind.
        pub kind: MemoryCopyKind,
    }

    /// Parameters for a host callback node; converts into
    /// `driver::CUDA_HOST_NODE_PARAMS`.
    #[derive(Debug, Clone, Copy)]
    pub struct HostNodeParams {
        /// Host function; becomes `fn_` via `HostFunction::as_raw`.
        pub func: HostFunction,
        /// Raw user-data pointer; becomes `userData`.
        pub user_data: *mut (),
    }

    impl HostNodeParams {
        /// Pairs a host function with the raw user-data pointer handed to it.
        ///
        /// # Safety
        ///
        /// Until no graph execution can invoke the callback any more,
        /// `user_data` must stay valid for `func` as required by CUDA's
        /// host-node callback rules.
        pub const unsafe fn new(func: HostFunction, user_data: *mut ()) -> Self {
            Self { user_data, func }
        }
    }

    /// Raw parameters for a one-dimensional memcpy node.
    #[derive(Debug, Clone, Copy)]
    pub struct Memcpy1DNodeParams {
        /// Raw destination pointer.
        pub dst: *mut (),
        /// Raw source pointer.
        pub src: *const (),
        /// Number of bytes both pointers must be valid for (see `Memcpy1DNodeParams::new`).
        pub count: usize,
        /// Copy kind.
        pub kind: MemoryCopyKind,
    }

    impl Memcpy1DNodeParams {
        /// Bundles raw source/destination pointers into 1D memcpy node parameters.
        ///
        /// # Safety
        ///
        /// When CUDA evaluates a graph node built from this value, both `dst`
        /// and `src` must be valid for `count` bytes as dictated by `kind`.
        pub const unsafe fn new(
            dst: *mut (),
            src: *const (),
            count: usize,
            kind: MemoryCopyKind,
        ) -> Self {
            Self {
                kind,
                count,
                src,
                dst,
            }
        }
    }

    impl From<PitchedPtr> for runtime::cudaPitchedPtr {
        fn from(value: PitchedPtr) -> Self {
            Self {
                ptr: value.ptr().cast(),
                pitch: value.pitch as _,
                xsize: value.x_size as _,
                ysize: value.y_size as _,
            }
        }
    }

    impl From<&Memcpy3DNodeParams> for runtime::cudaMemcpy3DParms {
        /// Converts every field into its runtime counterpart; absent array
        /// handles become null pointers.
        fn from(value: &Memcpy3DNodeParams) -> Self {
            let src_array = match value.src_array {
                Some(array) => ArrayHandle::as_raw(array),
                None => ptr::null_mut(),
            };
            let dst_array = match value.dst_array {
                Some(array) => ArrayHandle::as_raw(array),
                None => ptr::null_mut(),
            };

            Self {
                srcArray: src_array,
                srcPos: value.src_pos.into(),
                srcPtr: value.src_ptr.into(),
                dstArray: dst_array,
                dstPos: value.dst_pos.into(),
                dstPtr: value.dst_ptr.into(),
                extent: value.extent.into(),
                kind: value.kind.into(),
            }
        }
    }

    impl From<&HostNodeParams> for driver::CUDA_HOST_NODE_PARAMS {
        fn from(value: &HostNodeParams) -> Self {
            Self {
                fn_: value.func.as_raw(),
                userData: value.user_data.cast(),
            }
        }
    }
}

/// Parameters describing a memory-allocation node.
///
/// Borrows its access descriptors for `'a`.
#[derive(Debug, Clone)]
pub struct MemAllocNodeParams<'a> {
    /// Properties of the memory pool used for the allocation.
    pub pool_props: MemoryPoolProps,
    /// Access descriptors for the allocation.
    pub access_descs: &'a [MemoryAccessDescriptor],
    /// Requested allocation size in bytes.
    pub byte_size: usize,
}

impl Default for GraphEdgeData {
    /// Edge data with both ports set to `0` and the default dependency type.
    fn default() -> Self {
        const DEFAULT_PORT: u8 = 0;

        Self {
            dependency_type: GraphDependencyType::Default,
            from_port: DEFAULT_PORT,
            to_port: DEFAULT_PORT,
        }
    }
}

impl From<runtime::cudaGraphEdgeData> for GraphEdgeData {
    /// Copies the ports and decodes the raw dependency type.
    ///
    /// A raw type value that matches no [`GraphDependencyType`] variant is
    /// mapped to [`GraphDependencyType::Default`] rather than reported.
    fn from(value: runtime::cudaGraphEdgeData) -> Self {
        let dependency_type = match GraphDependencyType::try_from(value.type_) {
            Ok(known) => known,
            Err(_) => GraphDependencyType::Default,
        };

        Self {
            from_port: value.from_port,
            to_port: value.to_port,
            dependency_type,
        }
    }
}

impl From<GraphEdgeData> for runtime::cudaGraphEdgeData {
    fn from(value: GraphEdgeData) -> Self {
        Self {
            from_port: value.from_port,
            to_port: value.to_port,
            type_: value.dependency_type.into(),
            reserved: [0; 5],
        }
    }
}

impl From<Position> for runtime::cudaPos {
    fn from(value: Position) -> Self {
        Self {
            x: value.x as _,
            y: value.y as _,
            z: value.z as _,
        }
    }
}

impl From<Extent> for runtime::cudaExtent {
    fn from(value: Extent) -> Self {
        Self {
            width: value.width as _,
            height: value.height as _,
            depth: value.depth as _,
        }
    }
}

impl GraphNode {
    /// Wraps a raw node handle without checking it.
    ///
    /// # Safety
    ///
    /// The methods on [`GraphNode`] pass the handle straight to CUDA, so the
    /// caller is presumably expected to supply a handle that is valid for
    /// those calls.
    const unsafe fn from_raw(handle: runtime::cudaGraphNode_t) -> Self {
        Self { handle }
    }

    /// Queries CUDA for the type of this node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails when CUDA cannot report the node type or when an earlier
    /// asynchronous launch reported an error. If this call initializes internal
    /// runtime state, CUDA may also return initialization-related errors such as
    /// [`Status::NotInitialized`], [`Status::CallRequiresNewerDriver`], or
    /// [`Status::NoDevice`]. Callbacks must not call CUDA functions; see
    /// [`Stream::add_callback`].
    pub fn node_type(self) -> Result<GraphNodeType> {
        // Placeholder value; it is passed as the output slot of the query below.
        let mut raw_type = runtime::cudaGraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL;
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetType(self.handle, &raw mut raw_type))?;
        }
        let node_type = raw_type.into();
        Ok(node_type)
    }

    /// Returns this node's dependencies.
    ///
    /// The query runs in two passes: the first asks CUDA only for the number of
    /// dependencies, the second fills buffers of exactly that capacity. The
    /// count reported by the second pass is clamped to the allocated capacity
    /// before the buffers' lengths are set, so an unexpectedly larger count can
    /// never expose uninitialized elements.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA cannot query the dependencies, a previous
    /// asynchronous launch reports an error, or CUDA reports runtime
    /// initialization diagnostics.
    pub fn dependencies(self) -> Result<Vec<GraphDependency>> {
        let mut count = 0;
        // Pass 1: null output buffers, only `count` is written.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetDependencies(
                self.handle,
                ptr::null_mut(),
                ptr::null_mut(),
                &raw mut count,
            ))?;
        }

        let capacity = count as usize;
        if capacity == 0 {
            return Ok(Vec::new());
        }

        let mut handles = Vec::with_capacity(capacity);
        let mut edge_data = Vec::with_capacity(capacity);
        // Pass 2: both buffers have room for `capacity` elements, which is the
        // value `count` still holds when it is passed in.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetDependencies(
                self.handle,
                handles.as_mut_ptr(),
                edge_data.as_mut_ptr(),
                &raw mut count,
            ))?;

            // SAFETY: `filled <= capacity`, so `set_len` stays within the
            // allocation; this relies on CUDA having written the first
            // `filled` entries of each buffer during the call above.
            let filled = (count as usize).min(capacity);
            handles.set_len(filled);
            edge_data.set_len(filled);
        }

        Ok(handles
            .into_iter()
            .zip(edge_data)
            .map(|(handle, data)| GraphDependency {
                node: Self { handle },
                data: data.into(),
            })
            .collect())
    }

    /// Returns this node's dependent nodes.
    ///
    /// The query runs in two passes: the first asks CUDA only for the number of
    /// dependent nodes, the second fills buffers of exactly that capacity. The
    /// count reported by the second pass is clamped to the allocated capacity
    /// before the buffers' lengths are set, so an unexpectedly larger count can
    /// never expose uninitialized elements.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA cannot query the dependent nodes, a previous
    /// asynchronous launch reports an error, or CUDA reports runtime
    /// initialization diagnostics.
    pub fn dependent_nodes(self) -> Result<Vec<GraphDependency>> {
        let mut count = 0;
        // Pass 1: null output buffers, only `count` is written.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
                self.handle,
                ptr::null_mut(),
                ptr::null_mut(),
                &raw mut count,
            ))?;
        }

        let capacity = count as usize;
        if capacity == 0 {
            return Ok(Vec::new());
        }

        let mut handles = Vec::with_capacity(capacity);
        let mut edge_data = Vec::with_capacity(capacity);
        // Pass 2: both buffers have room for `capacity` elements, which is the
        // value `count` still holds when it is passed in.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
                self.handle,
                handles.as_mut_ptr(),
                edge_data.as_mut_ptr(),
                &raw mut count,
            ))?;

            // SAFETY: `filled <= capacity`, so `set_len` stays within the
            // allocation; this relies on CUDA having written the first
            // `filled` entries of each buffer during the call above.
            let filled = (count as usize).min(capacity);
            handles.set_len(filled);
            edge_data.set_len(filled);
        }

        Ok(handles
            .into_iter()
            .zip(edge_data)
            .map(|(handle, data)| GraphDependency {
                node: Self { handle },
                data: data.into(),
            })
            .collect())
    }

    /// Returns the event of this event record node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if this is not an event-record node, CUDA cannot query
    /// the event, CUDA returns a null event handle, a previous asynchronous
    /// launch reports an error, or CUDA reports runtime initialization
    /// diagnostics.
    pub fn event_record_node_event(self) -> Result<runtime::cudaEvent_t> {
        let mut event = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphEventRecordNodeGetEvent(
                self.handle,
                &raw mut event,
            ))?;
        }
        if event.is_null() {
            return Err(Error::NullHandle);
        }
        Ok(event)
    }

    /// Returns the event of this event wait node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if this is not an event-wait node, CUDA cannot query the
    /// event, CUDA returns a null event handle, a previous asynchronous launch
    /// reports an error, or CUDA reports runtime initialization diagnostics.
    pub fn event_wait_node_event(self) -> Result<runtime::cudaEvent_t> {
        let mut event = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphEventWaitNodeGetEvent(
                self.handle,
                &raw mut event,
            ))?;
        }
        if event.is_null() {
            return Err(Error::NullHandle);
        }
        Ok(event)
    }

    /// Returns a handle to the embedded graph in a child graph node.
    /// This does not clone the graph.
    /// Changes to the returned graph are reflected in the node, and the child
    /// node retains ownership of the embedded graph handle.
    /// The returned [`Graph`] is a borrowed wrapper and must not outlive the
    /// child graph node it came from.
    ///
    /// Allocation and free nodes cannot be added to the returned graph.
    /// Attempting to do so returns an error.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if this is not a child-graph node, CUDA cannot query the
    /// child graph, CUDA returns a null graph handle, a previous asynchronous
    /// launch reports an error, or CUDA reports runtime initialization
    /// diagnostics.
    pub fn child_graph(self) -> Result<Graph> {
        let mut graph = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphChildGraphNodeGetGraph(
                self.handle,
                &raw mut graph,
            ))?;
        }
        if graph.is_null() {
            return Err(Error::NullHandle);
        }
        Ok(unsafe { Graph::from_raw_borrowed(graph) })
    }

    /// Reads the raw `cudaMemcpy3DParms` of this memcpy node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails if this is not a memcpy node, if CUDA cannot query the parameters,
    /// if a previous asynchronous launch reports an error, or if CUDA reports
    /// runtime initialization diagnostics.
    pub fn memcpy_node_params(self) -> Result<runtime::cudaMemcpy3DParms> {
        // Start from the default value; it is passed as the output slot below.
        let mut node_params = runtime::cudaMemcpy3DParms::default();
        unsafe {
            try_ffi!(runtime::cudaGraphMemcpyNodeGetParams(
                self.handle,
                &raw mut node_params,
            ))?;
        }
        Ok(node_params)
    }

    /// Reads the raw `CUDA_MEMSET_NODE_PARAMS` of this memset node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails if this is not a memset node, if CUDA cannot query the parameters,
    /// if a previous asynchronous launch reports an error, or if CUDA reports
    /// runtime initialization diagnostics.
    pub fn memset_node_params(self) -> Result<driver::CUDA_MEMSET_NODE_PARAMS> {
        // Start from the default value; it is passed as the output slot below.
        let mut node_params = driver::CUDA_MEMSET_NODE_PARAMS::default();
        unsafe {
            try_ffi!(runtime::cudaGraphMemsetNodeGetParams(
                self.handle,
                &raw mut node_params,
            ))?;
        }
        Ok(node_params)
    }

    /// Reads the raw `CUDA_HOST_NODE_PARAMS` of this host node.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails if this is not a host node, if CUDA cannot query the parameters,
    /// if a previous asynchronous launch reports an error, or if CUDA reports
    /// runtime initialization diagnostics.
    pub fn host_node_params(self) -> Result<driver::CUDA_HOST_NODE_PARAMS> {
        // Start from the default value; it is passed as the output slot below.
        let mut node_params = driver::CUDA_HOST_NODE_PARAMS::default();
        unsafe {
            try_ffi!(runtime::cudaGraphHostNodeGetParams(
                self.handle,
                &raw mut node_params,
            ))?;
        }
        Ok(node_params)
    }

    /// Returns the address and byte size of a memory allocation node.
    ///
    /// The node's parameters are queried from CUDA, but only the `dptr` and
    /// `bytesize` fields are copied into the returned
    /// [`MemoryAllocationNodeInfo`]; the pool properties and access descriptors
    /// held by the raw parameters are not exposed.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails if this is not a memory-allocation node, if CUDA cannot query the
    /// parameters, if a previous asynchronous launch reports an error, or if
    /// CUDA reports runtime initialization diagnostics.
    pub fn mem_alloc_node_info(self) -> Result<MemoryAllocationNodeInfo> {
        let mut raw_params = runtime::cudaMemAllocNodeParams::default();
        unsafe {
            try_ffi!(runtime::cudaGraphMemAllocNodeGetParams(
                self.handle,
                &raw mut raw_params,
            ))?;
        }

        let ptr = DevicePtr::new(raw_params.dptr as _);
        let byte_size = raw_params.bytesize as usize;
        Ok(MemoryAllocationNodeInfo { ptr, byte_size })
    }

    /// Returns the device address that this memory free node releases.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Fails if this is not a memory-free node, if CUDA cannot query the
    /// pointer, if a previous asynchronous launch reports an error, or if CUDA
    /// reports runtime initialization diagnostics.
    ///
    /// # Safety
    ///
    /// The node must still be a valid memory-free node in a live graph, and the
    /// returned pointer must not be used after the graph frees it.
    pub unsafe fn mem_free_node_ptr(self) -> Result<DevicePtr> {
        // Named `freed` so the local does not shadow the `ptr` module.
        let mut freed = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphMemFreeNodeGetParams(
                self.handle,
                &raw mut freed as *mut _,
            ))?;
        }
        Ok(DevicePtr::new(freed as _))
    }

    /// Queries a single kernel node attribute and decodes its value.
    ///
    /// The raw attribute value is a union; only the member that corresponds to
    /// `id` is read.
    ///
    /// # Errors
    ///
    /// Fails if this is not a kernel node, if CUDA cannot query the attribute,
    /// or if a previous asynchronous launch reports an error.
    pub fn kernel_node_attribute(
        self,
        id: GraphKernelNodeAttributeId,
    ) -> Result<GraphKernelNodeAttribute> {
        let mut raw = runtime::cudaLaunchAttributeValue::default();
        unsafe {
            try_ffi!(runtime::cudaGraphKernelNodeGetAttribute(
                self.handle,
                id.into(),
                &raw mut raw,
            ))?;
        }

        // Each arm reads exactly the union member selected by `id`.
        let attribute = match id {
            GraphKernelNodeAttributeId::Cooperative => {
                let flag = unsafe { *raw.cooperative.as_ref() };
                GraphKernelNodeAttribute::Cooperative(flag != 0)
            }
            GraphKernelNodeAttributeId::ClusterDimension => {
                let dim = unsafe { raw.clusterDim.as_ref() };
                GraphKernelNodeAttribute::ClusterDimension(Dim3::new(dim.x, dim.y, dim.z))
            }
            GraphKernelNodeAttributeId::Priority => {
                let priority = unsafe { *raw.priority.as_ref() };
                GraphKernelNodeAttribute::Priority(priority)
            }
            GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout => {
                let carveout = unsafe { *raw.sharedMemCarveout.as_ref() };
                GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(carveout)
            }
        };
        Ok(attribute)
    }

    /// Sets a kernel node attribute.
    ///
    /// # Errors
    ///
    /// Returns an error if this is not a kernel node, CUDA rejects the
    /// attribute update, or a previous asynchronous launch reports an error.
    pub fn set_kernel_node_attribute(&mut self, attribute: GraphKernelNodeAttribute) -> Result<()> {
        let (id, value) = match attribute {
            GraphKernelNodeAttribute::Cooperative(value) => {
                let mut attr = runtime::cudaLaunchAttributeValue {
                    cooperative: runtime::__BindgenUnionField::new(),
                    ..runtime::cudaLaunchAttributeValue::default()
                };
                unsafe { *attr.cooperative.as_mut() = i32::from(value) };
                (GraphKernelNodeAttributeId::Cooperative, attr)
            }
            GraphKernelNodeAttribute::ClusterDimension(value) => {
                let mut attr = runtime::cudaLaunchAttributeValue {
                    clusterDim: runtime::__BindgenUnionField::new(),
                    ..runtime::cudaLaunchAttributeValue::default()
                };
                unsafe {
                    *attr.clusterDim.as_mut() = runtime::cudaLaunchAttributeValue__bindgen_ty_1 {
                        x: value.x,
                        y: value.y,
                        z: value.z,
                    };
                }
                (GraphKernelNodeAttributeId::ClusterDimension, attr)
            }
            GraphKernelNodeAttribute::Priority(value) => {
                let mut attr = runtime::cudaLaunchAttributeValue {
                    priority: runtime::__BindgenUnionField::new(),
                    ..runtime::cudaLaunchAttributeValue::default()
                };
                unsafe { *attr.priority.as_mut() = value as _ };
                (GraphKernelNodeAttributeId::Priority, attr)
            }
            GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(value) => {
                let mut attr = runtime::cudaLaunchAttributeValue {
                    sharedMemCarveout: runtime::__BindgenUnionField::new(),
                    ..runtime::cudaLaunchAttributeValue::default()
                };
                unsafe { *attr.sharedMemCarveout.as_mut() = value };
                (
                    GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout,
                    attr,
                )
            }
        };

        unsafe {
            try_ffi!(runtime::cudaGraphKernelNodeSetAttribute(
                self.handle,
                id.into(),
                &raw const value,
            ))?;
        }
        Ok(())
    }

    /// Copies kernel node attributes from `other` to this node.
    /// Both nodes must have the same context.
    ///
    /// The call passes this node's handle first and `other`'s handle second to
    /// `cudaGraphKernelNodeCopyAttributes`.
    // NOTE(review): confirm that argument order matches the destination/source
    // order of the CUDA header this crate binds against.
    ///
    /// # Errors
    ///
    /// Fails if CUDA rejects the attribute copy or if a previous asynchronous
    /// launch reported an error.
    pub fn copy_kernel_node_attributes(self, other: Self) -> Result<()> {
        let (first, second) = (self.handle, other.handle);
        unsafe {
            try_ffi!(runtime::cudaGraphKernelNodeCopyAttributes(first, second))?;
        }
        Ok(())
    }

    pub const fn as_raw(self) -> runtime::cudaGraphNode_t {
        self.handle
    }
}

impl MemoryAllocationNodeInfo {
    /// Returns the device pointer stored in this allocation-node info.
    pub const fn ptr(&self) -> DevicePtr {
        let Self { ptr, .. } = self;
        *ptr
    }
}

/// Wrapper around a raw `cudaGraph_t` handle.
///
/// A `Graph` is built either as an owner of its handle (`create`, `try_clone`, `from_raw`) or as
/// a borrowed view of a handle owned elsewhere (`from_raw_borrowed`).
#[derive(Debug)]
pub struct Graph {
    /// Raw CUDA graph handle passed to the `cudaGraph*` calls made through this wrapper.
    handle: runtime::cudaGraph_t,
    /// `true` when the wrapper was constructed as the owner of `handle`, `false` when it was
    /// constructed from a borrowed handle.
    // NOTE(review): presumably consulted on drop to decide whether the graph is destroyed;
    // confirm against the `Drop` impl, which is not visible in this part of the file.
    owns_handle: bool,
}

impl Graph {
    /// Wraps `handle` in a [`Graph`] that is marked as owning it.
    ///
    /// # Safety
    ///
    /// NOTE(review): the precise contract is not spelled out here. Presumably `handle` must be a
    /// valid `cudaGraph_t` that nothing else will destroy, since the result is flagged as the
    /// owner; confirm against the code that consumes `owns_handle`.
    pub(crate) const unsafe fn from_raw(handle: runtime::cudaGraph_t) -> Self {
        let owns_handle = true;
        Self {
            handle,
            owns_handle,
        }
    }

    /// Wraps `handle` in a [`Graph`] that is marked as *not* owning it.
    ///
    /// # Safety
    ///
    /// NOTE(review): the precise contract is not spelled out here. Presumably `handle` must be a
    /// valid `cudaGraph_t` that stays alive for as long as the returned wrapper is used, since
    /// the result is flagged as a non-owner; confirm against the callers.
    pub(crate) const unsafe fn from_raw_borrowed(handle: runtime::cudaGraph_t) -> Self {
        let owns_handle = false;
        Self {
            handle,
            owns_handle,
        }
    }

    /// Creates an empty graph.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn create() -> Result<Self> {
        let mut handle = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphCreate(&raw mut handle, 0))?;
        }
        Ok(Self {
            handle,
            owns_handle: true,
        })
    }

    /// Instantiates this graph as an executable graph with no instantiation flags set.
    ///
    /// This is shorthand for [`Graph::instantiate_with_flags`] called with an empty flag set.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Graph::instantiate_with_flags`].
    pub fn instantiate(&self) -> Result<ExecutableGraph> {
        let no_flags = GraphInstantiateFlags::empty();
        self.instantiate_with_flags(no_flags)
    }

    /// Instantiates this graph as an executable graph, applying `flags`.
    ///
    /// Any structural or intra-node constraints that were not validated earlier are validated
    /// during this call. When instantiation succeeds, the instantiated executable graph is
    /// returned.
    ///
    /// `flags` controls both the instantiation itself and subsequent launches of the result.
    /// The valid flags are:
    ///
    /// * [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`]: for a graph that contains memory
    ///   allocation nodes, any memory allocations that are still unfreed are automatically freed
    ///   before the graph is relaunched.
    ///
    /// * [`GraphInstantiateFlags::DEVICE_LAUNCH`]: configures the graph for launch from the
    ///   device. With this flag the returned executable graph handle can be used to launch the
    ///   graph from both the host and the device.
    ///   The flag is only usable on platforms which support unified addressing, and it cannot be
    ///   combined with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`].
    ///
    /// * [`GraphInstantiateFlags::USE_NODE_PRIORITY`]: during execution the graph uses the
    ///   priorities from the per-node attributes instead of the priority of the launch stream.
    ///   Priorities are only available on kernel nodes and are copied from the stream priority
    ///   during stream capture.
    ///
    /// If the graph contains any allocation or free nodes, at most one executable graph may
    /// exist for that graph at a time. Attempting to instantiate a second executable graph
    /// before the first has been dropped results in an error. The same applies if the graph
    /// contains any device-updatable kernel nodes.
    ///
    /// It is an error if the graph contains kernels which call device-side
    /// [`ExecutableGraph::launch`] from multiple devices.
    ///
    /// Graphs instantiated for launch on the device are subject to extra restrictions that do
    /// not apply to host graphs:
    ///
    /// * The graph's nodes must reside on a single device.
    /// * The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph
    ///   nodes.
    /// * The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
    ///   Operation-specific restrictions are outlined below.
    /// * Kernel nodes:
    ///   + Use of CUDA Dynamic Parallelism is not permitted.
    ///   + Cooperative launches are permitted as long as MPS is not in use.
    /// * Memcpy nodes:
    ///   + Only copies involving device memory and/or pinned device-mapped host memory are
    ///     permitted.
    ///   + Copies involving CUDA arrays are not permitted.
    ///   + Both operands must be accessible from the current device, and the current device must
    ///     match the device of other nodes in the graph.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn instantiate_with_flags(&self, flags: GraphInstantiateFlags) -> Result<ExecutableGraph> {
        let raw_flags = flags.bits();
        let mut exec = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphInstantiateWithFlags(
                &raw mut exec,
                self.handle,
                raw_flags,
            ))?;
        }
        Ok(ExecutableGraph { handle: exec })
    }

    /// Creates a copy of `original_graph`.
    /// All parameters are copied into the cloned graph.
    /// The original graph may be modified after this call without affecting the clone.
    ///
    /// Child graph nodes in the original graph are recursively copied into the clone.
    ///
    /// Cloning is not supported for graphs that contain memory allocation nodes, memory free nodes, or conditional nodes.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn try_clone(&self) -> Result<Self> {
        let mut handle = ptr::null_mut();
        unsafe {
            try_ffi!(runtime::cudaGraphClone(&raw mut handle, self.handle))?;
        }
        Ok(Self {
            handle,
            owns_handle: true,
        })
    }

    /// Adds a single dependency edge running from `from` to `to`.
    ///
    /// This is shorthand for [`Graph::add_dependencies`] with one-element slices.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Graph::add_dependencies`].
    pub fn add_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
        let sources = [from];
        let targets = [to];
        self.add_dependencies(&sources, &targets)
    }

    pub fn add_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
        self.add_dependencies_with_data(from, to, &[])
    }

    /// Elements in `from` and `to` at corresponding indices define each dependency to add.
    /// Each node in `from` and `to` must belong to this graph.
    ///
    /// If `from` and `to` are empty, the call returns without modifying the graph.
    /// Specifying an existing dependency returns an error.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_dependencies_with_data(
        &mut self,
        from: &[GraphNode],
        to: &[GraphNode],
        edge_data: &[GraphEdgeData],
    ) -> Result<()> {
        if from.len() != to.len() {
            return Err(Error::GraphDependencyMismatch);
        }
        if !edge_data.is_empty() && edge_data.len() != from.len() {
            return Err(Error::GraphDependencyMismatch);
        }
        if from.is_empty() {
            return Ok(());
        }

        let from_raw: Vec<_> = from.iter().map(|node| node.handle).collect();
        let to_raw: Vec<_> = to.iter().map(|node| node.handle).collect();
        let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddDependencies(
                self.handle,
                from_raw.as_ptr(),
                to_raw.as_ptr(),
                if edge_data_raw.is_empty() {
                    ptr::null()
                } else {
                    edge_data_raw.as_ptr()
                },
                from_raw.len() as runtime::size_t,
            ))?;
        }
        Ok(())
    }

    /// Removes the single dependency edge running from `from` to `to`.
    ///
    /// This is shorthand for [`Graph::remove_dependencies`] with one-element slices.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Graph::remove_dependencies`].
    pub fn remove_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
        let sources = [from];
        let targets = [to];
        self.remove_dependencies(&sources, &targets)
    }

    pub fn remove_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
        self.remove_dependencies_with_data(from, to, &[])
    }

    /// Elements in `from` and `to` at corresponding indices define each dependency to remove.
    /// Each node in `from` and `to` must belong to this graph.
    ///
    /// If `from` and `to` are empty, the call returns without modifying the graph.
    /// Specifying an edge that does not exist in the graph, with data matching `edge_data`, results in an error.
    /// Passing an empty `edge_data` slice is equivalent to passing default edge data for each edge.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn remove_dependencies_with_data(
        &mut self,
        from: &[GraphNode],
        to: &[GraphNode],
        edge_data: &[GraphEdgeData],
    ) -> Result<()> {
        if from.len() != to.len() {
            return Err(Error::GraphDependencyMismatch);
        }
        if !edge_data.is_empty() && edge_data.len() != from.len() {
            return Err(Error::GraphDependencyMismatch);
        }
        if from.is_empty() {
            return Ok(());
        }

        let from_raw: Vec<_> = from.iter().map(|node| node.handle).collect();
        let to_raw: Vec<_> = to.iter().map(|node| node.handle).collect();
        let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphRemoveDependencies(
                self.handle,
                from_raw.as_ptr(),
                to_raw.as_ptr(),
                if edge_data_raw.is_empty() {
                    ptr::null()
                } else {
                    edge_data_raw.as_ptr()
                },
                from_raw.len() as runtime::size_t,
            ))?;
        }
        Ok(())
    }

    /// Adds every edge in `edges` to this graph.
    ///
    /// Each [`GraphEdge`] is split into its `from`, `to`, and `data` parts, which are then
    /// forwarded together to [`Graph::add_dependencies_with_data`]. An empty `edges` slice
    /// returns `Ok(())` immediately.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Graph::add_dependencies_with_data`].
    pub fn add_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
        if edges.is_empty() {
            return Ok(());
        }

        let edge_count = edges.len();
        let mut sources = Vec::with_capacity(edge_count);
        let mut targets = Vec::with_capacity(edge_count);
        let mut payloads = Vec::with_capacity(edge_count);
        for edge in edges {
            sources.push(edge.from);
            targets.push(edge.to);
            payloads.push(edge.data);
        }
        self.add_dependencies_with_data(&sources, &targets, &payloads)
    }

    /// Removes every edge in `edges` from this graph.
    ///
    /// Each [`GraphEdge`] is split into its `from`, `to`, and `data` parts, which are then
    /// forwarded together to [`Graph::remove_dependencies_with_data`]. An empty `edges` slice
    /// returns `Ok(())` immediately.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Graph::remove_dependencies_with_data`].
    pub fn remove_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
        if edges.is_empty() {
            return Ok(());
        }

        let edge_count = edges.len();
        let mut sources = Vec::with_capacity(edge_count);
        let mut targets = Vec::with_capacity(edge_count);
        let mut payloads = Vec::with_capacity(edge_count);
        for edge in edges {
            sources.push(edge.from);
            targets.push(edge.to);
            payloads.push(edge.data);
        }
        self.remove_dependencies_with_data(&sources, &targets, &payloads)
    }

    /// Creates a node that performs no operation and adds it to the graph with the given dependencies.
    /// The dependency list may be empty, in which case the node is placed at the
    /// graph root. It may not contain duplicate entries.
    ///
    /// An empty node performs no operation during execution, but can be used for transitive ordering.
    /// For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2\*n dependency edges, rather than no empty node and n^2 dependency edges.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation or reports runtime initialization
    /// diagnostics. Callbacks must not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_empty_node(&mut self, dependencies: &[GraphNode]) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddEmptyNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates an event record node and adds it to the graph with the given dependencies and event.
    /// The dependency list may be empty, in which case the node is placed at the
    /// graph root. It may not contain duplicate entries.
    ///
    /// Each graph launch records `event` to capture execution of the node's dependencies.
    ///
    /// These nodes may not be used in loops or conditionals.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_event_record_node(
        &mut self,
        dependencies: &[GraphNode],
        event: &Event,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddEventRecordNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                event.as_raw(),
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates an event wait node and adds it to the graph with the given dependencies and event.
    /// The dependency list may be empty, in which case the node is placed at the
    /// graph root. It may not contain duplicate entries.
    ///
    /// The graph node waits for all work captured in `event`.
    /// See [`sys::cuEventRecord`](singe_cuda_sys::driver::cuEventRecord) for details on what is captured by an event.
    /// Synchronization is performed efficiently on the device when applicable.
    /// `event` may come from a different context or device than the launch stream.
    ///
    /// These nodes may not be used in loops or conditionals.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_event_wait_node(
        &mut self,
        dependencies: &[GraphNode],
        event: &Event,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddEventWaitNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                event.as_raw(),
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates a CPU execution node and adds it to the graph with the given dependencies and host-node parameters.
    /// The dependency list may be empty, in which case the node is placed at the
    /// graph root. It may not contain duplicate entries.
    ///
    /// When the graph is launched, the node invokes the specified CPU function.
    /// Host nodes are not supported under MPS with pre-Volta GPUs.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_host_node(
        &mut self,
        dependencies: &[GraphNode],
        params: &HostNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        let params = params.into();
        unsafe {
            try_ffi!(runtime::cudaGraphAddHostNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                &raw const params,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Adds a kernel execution node to the graph, built from `function`, the launch
    /// configuration, and the kernel parameters, and ordered after the given dependencies.
    ///
    /// `dependencies` may be empty, in which case the node is placed at the graph root. It may
    /// not contain duplicate entries.
    ///
    /// When the graph is launched, the node invokes the kernel on the grid and blocks specified
    /// by [`LaunchConfig`](crate::module::LaunchConfig).
    /// [`LaunchConfig::shared_memory_bytes`](crate::module::LaunchConfig::shared_memory_bytes)
    /// sets the amount of dynamic shared memory available to each thread block.
    /// Kernel parameters are passed with [`KernelParameters`](crate::module::KernelParameters)
    /// or tuples of shared or mutable references.
    ///
    /// Kernels launched using graphs must not use texture and surface references.
    /// Reading or writing through any texture or surface reference is undefined behavior.
    /// This restriction does not apply to texture and surface objects.
    ///
    /// Runtime kernel handles queried via
    /// [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or
    /// [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) may be used.
    /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must
    /// be registered with the same CUDA Runtime instance.
    /// Passing a symbol that belongs to a different runtime instance results in undefined
    /// behavior.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_kernel_node<'a, P>(
        &mut self,
        dependencies: &[GraphNode],
        function: DeviceFunction,
        config: &LaunchConfig,
        params: P,
    ) -> Result<GraphNode>
    where
        P: KernelLaunchArgs<'a>,
    {
        let parents: Vec<runtime::cudaGraphNode_t> =
            dependencies.iter().map(|dep| dep.handle).collect();
        let parent_count = parents.len() as runtime::size_t;
        let mut node = ptr::null_mut();
        // The raw argument pointers are only available inside the callback, so the node
        // parameters are assembled and the node is added there.
        params.with_raw_pointers(|kernel_args| unsafe {
            let node_params = runtime::cudaKernelNodeParams {
                func: function.as_raw().cast(),
                gridDim: config.grid_dim.into(),
                blockDim: config.block_dim.into(),
                sharedMemBytes: config.shared_memory_bytes as _,
                kernelParams: kernel_args.as_mut_ptr().cast(),
                extra: ptr::null_mut(),
            };
            try_ffi!(runtime::cudaGraphAddKernelNode(
                &raw mut node,
                self.handle,
                parents.as_ptr(),
                parent_count,
                &raw const node_params,
            ))?;
            Ok(GraphNode::from_raw(node))
        })
    }

    /// Creates a new 1D memcpy node and adds it to the graph with the given dependencies.
    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
    ///
    /// When the graph is launched, the node copies `count` bytes from `src` to `dst`.
    /// The transfer direction is described by [`MemoryCopyKind`].
    /// [`MemoryCopyKind::Default`] is recommended when unified virtual addressing is available, in which case the transfer direction is inferred from the pointer values.
    /// Launching a memcpy node with `dst` and `src` pointers that do not match the direction of the copy results in undefined behavior.
    ///
    /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_memcpy_node_1d(
        &mut self,
        dependencies: &[GraphNode],
        params: &Memcpy1DNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemcpyNode1D(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                params.dst.cast(),
                params.src.cast(),
                params.count as _,
                params.kind.into(),
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Adds a device-to-device 1D memcpy node built from typed byte buffers.
    ///
    /// The node copies `src.byte_len()` bytes, so `dst` must be at least that large. The node
    /// itself is added through [`Graph::add_memcpy_node_1d`] with
    /// [`MemoryCopyKind::DeviceToDevice`].
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidMemoryAccess`] if `dst` is smaller than `src`. Also returns an
    /// error if CUDA rejects the graph operation, if a previous asynchronous launch reported an
    /// error, or if CUDA reports runtime initialization diagnostics.
    pub fn add_memcpy_node_1d_device_to_device<D, S>(
        &mut self,
        dependencies: &[GraphNode],
        dst: &mut D,
        src: &S,
    ) -> Result<GraphNode>
    where
        D: ByteBufferMut + ?Sized,
        S: ByteBuffer + ?Sized,
    {
        let byte_count = src.byte_len();
        let capacity = dst.byte_len();
        // Refuse to build a node that would write past the end of the destination buffer.
        if capacity < byte_count {
            return Err(Error::InvalidMemoryAccess);
        }
        let node_params = unsafe {
            let dst_ptr = dst.as_byte_mut_ptr().cast();
            let src_ptr = src.as_byte_ptr().cast();
            Memcpy1DNodeParams::new(dst_ptr, src_ptr, byte_count, MemoryCopyKind::DeviceToDevice)
        };
        self.add_memcpy_node_1d(dependencies, &node_params)
    }

    /// Creates a memcpy node and adds it to the graph with the given dependencies.
    /// The dependency list may be empty, in which case the node is placed at the
    /// graph root. It may not contain duplicate entries.
    ///
    /// When the graph is launched, the node performs the memcpy described by `params`.
    /// See [`sys::cudaMemcpy3D`](singe_cuda_sys::runtime::cudaMemcpy3D) for a description of the structure and its restrictions.
    ///
    /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_memcpy_node(
        &mut self,
        dependencies: &[GraphNode],
        params: &Memcpy3DNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        let params = params.into();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemcpyNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                &raw const params,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    pub fn add_memcpy_node_to_symbol(
        &mut self,
        dependencies: &[GraphNode],
        params: &MemcpyToSymbolNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemcpyNodeToSymbol(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                params.symbol.cast(),
                params.src.cast(),
                params.count as _,
                params.offset as _,
                params.kind.into(),
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    pub fn add_memcpy_node_from_symbol(
        &mut self,
        dependencies: &[GraphNode],
        params: &MemcpyFromSymbolNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemcpyNodeFromSymbol(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                params.dst.cast(),
                params.symbol.cast(),
                params.count as _,
                params.offset as _,
                params.kind.into(),
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates a new memset node and adds it to the graph with the given dependencies.
    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
    ///
    /// The element size must be 1, 2, or 4 bytes.
    /// When the graph is launched, the node performs the memset described by `params`.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_memset_node(
        &mut self,
        dependencies: &[GraphNode],
        params: &MemsetNodeParams,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        let params = params.into();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemsetNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                &raw const params,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates a new node which executes an embedded graph, and adds it to the graph with the given dependencies.
    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
    ///
    /// If `child_graph` contains allocation nodes, free nodes, or conditional nodes, this call returns an error.
    ///
    /// The node executes an embedded child graph.
    /// The child graph is cloned in this call.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn add_child_graph_node(
        &mut self,
        dependencies: &[GraphNode],
        child_graph: &Self,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddChildGraphNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                child_graph.handle,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates a new memory free node and adds it to the graph with the given dependencies and address.
    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
    ///
    /// [`Graph::add_mem_free_node`] returns [`Status::InvalidValue`] if the caller attempts to free:
    ///
    /// * an allocation twice in the same graph.
    /// * an address that was not returned by an allocation node.
    /// * an invalid address.
    ///
    /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
    ///
    /// * Nodes and edges of the graph cannot be deleted.
    /// * The graph can only be used in a child node if the ownership is moved to the parent.
    /// * Only one instantiation of the graph may exist at any point in time.
    /// * The graph cannot be cloned.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation or if a previous asynchronous
    /// launch reported an error.
    pub fn add_mem_free_node(
        &mut self,
        dependencies: &[GraphNode],
        ptr: DevicePtr,
    ) -> Result<GraphNode> {
        let mut handle = ptr::null_mut();
        let dependencies_raw: Vec<_> = dependencies.iter().map(|node| node.handle).collect();
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemFreeNode(
                &raw mut handle,
                self.handle,
                dependencies_raw.as_ptr(),
                dependencies_raw.len() as runtime::size_t,
                ptr.as_ptr() as _,
            ))?;
            Ok(GraphNode::from_raw(handle))
        }
    }

    /// Creates a new allocation node and adds it to the graph with the given dependencies and allocation parameters.
    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
    ///
    /// When [`Graph::add_mem_alloc_node`] creates an allocation node, it returns the address of the allocation in [`MemoryAllocationNodeInfo::ptr`].
    /// The allocation's address remains fixed across instantiations and launches.
    ///
    /// If the allocation is freed in the same graph, by creating a free node using [`Graph::add_mem_free_node`], the allocation can be accessed by nodes ordered after the allocation node but before the free node.
    /// These allocations cannot be freed outside the owning graph, and they can only be freed once in the owning graph.
    ///
    /// If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the graph which are ordered after the allocation node, but also by stream operations ordered after the graph's execution but before the allocation is freed.
    ///
    /// Allocations which are not freed in the same graph can be freed by:
    ///
    /// * passing the allocation to [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) or [`DeviceMemory::free`](crate::memory::DeviceMemory::free);
    /// * launching a graph with a free node for that allocation; or
    /// * specifying [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`] during instantiation, which makes each launch behave as though it called [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) for every unfreed allocation.
    ///
    /// It is not possible to free an allocation in both the owning graph and another graph.
    /// If the allocation is freed in the same graph, a free node cannot be added to another graph.
    /// If the allocation is freed in another graph, a free node can no longer be added to the owning graph.
    ///
    /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
    ///
    /// * Nodes and edges of the graph cannot be deleted.
    /// * The graph can only be used in a child node if the ownership is moved to the parent.
    /// * Only one instantiation of the graph may exist at any point in time.
    /// * The graph cannot be cloned.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation or if a previous asynchronous
    /// launch reported an error.
    pub fn add_mem_alloc_node(
        &mut self,
        dependencies: &[GraphNode],
        params: &MemAllocNodeParams<'_>,
    ) -> Result<(GraphNode, DevicePtr)> {
        let raw_dependencies = dependencies
            .iter()
            .map(|dependency| dependency.handle)
            .collect::<Vec<_>>();

        // The raw parameter struct only stores a pointer to the converted descriptors, so this
        // buffer has to stay alive until the CUDA call below has returned.
        let raw_access_descs: Vec<_> = params
            .access_descs
            .iter()
            .map(|&descriptor| descriptor.into())
            .collect();

        let mut raw_params = runtime::cudaMemAllocNodeParams {
            poolProps: params.pool_props.into(),
            accessDescs: raw_access_descs.as_ptr(),
            accessDescCount: raw_access_descs.len() as runtime::size_t,
            bytesize: params.byte_size as _,
            // Read back after the call to obtain the allocation address.
            dptr: 0,
        };

        // Out-parameter that receives the handle of the newly created node.
        let mut node_handle = ptr::null_mut();
        // SAFETY: every pointer handed to CUDA refers to a local (`node_handle`,
        // `raw_dependencies`, `raw_params`, `raw_access_descs`) that outlives the call, and each
        // array pointer is paired with its exact element count.
        unsafe {
            try_ffi!(runtime::cudaGraphAddMemAllocNode(
                &raw mut node_handle,
                self.handle,
                raw_dependencies.as_ptr(),
                raw_dependencies.len() as runtime::size_t,
                &raw mut raw_params,
            ))?;
            // TODO: verify dptr?
            let node = GraphNode::from_raw(node_handle);
            let allocation = DevicePtr::new(raw_params.dptr as *mut ());
            Ok((node, allocation))
        }
    }

    /// Returns this graph's nodes.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn nodes(&self) -> Result<Vec<GraphNode>> {
        unsafe {
            let mut count = 0;
            try_ffi!(runtime::cudaGraphGetNodes(
                self.handle,
                ptr::null_mut(),
                &raw mut count,
            ))?;

            if count == 0 {
                return Ok(Vec::new());
            }

            let mut handles = Vec::with_capacity(count as usize);
            try_ffi!(runtime::cudaGraphGetNodes(
                self.handle,
                handles.as_mut_ptr(),
                &raw mut count,
            ))?;
            handles.set_len(count as usize);

            Ok(handles
                .into_iter()
                .map(|handle| GraphNode { handle })
                .collect())
        }
    }

    /// Returns this graph's root nodes.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn root_nodes(&self) -> Result<Vec<GraphNode>> {
        unsafe {
            let mut count = 0;
            try_ffi!(runtime::cudaGraphGetRootNodes(
                self.handle,
                ptr::null_mut(),
                &raw mut count,
            ))?;

            if count == 0 {
                return Ok(Vec::new());
            }

            let mut handles = Vec::with_capacity(count as usize);
            try_ffi!(runtime::cudaGraphGetRootNodes(
                self.handle,
                handles.as_mut_ptr(),
                &raw mut count,
            ))?;
            handles.set_len(count as usize);

            Ok(handles
                .into_iter()
                .map(|handle| GraphNode { handle })
                .collect())
        }
    }

    /// Returns this graph's dependency edges.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn edges(&self) -> Result<Vec<GraphEdge>> {
        unsafe {
            let mut count = 0;
            try_ffi!(runtime::cudaGraphGetEdges(
                self.handle,
                ptr::null_mut(),
                ptr::null_mut(),
                ptr::null_mut(),
                &raw mut count,
            ))?;

            if count == 0 {
                return Ok(Vec::new());
            }

            let len = count as usize;
            let mut from = Vec::with_capacity(len);
            let mut to = Vec::with_capacity(len);
            let mut edge_data = Vec::with_capacity(len);
            try_ffi!(runtime::cudaGraphGetEdges(
                self.handle,
                from.as_mut_ptr(),
                to.as_mut_ptr(),
                edge_data.as_mut_ptr(),
                &raw mut count,
            ))?;
            let len = count as usize;
            from.set_len(len);
            to.set_len(len);
            edge_data.set_len(len);

            Ok(from
                .into_iter()
                .zip(to)
                .zip(edge_data)
                .map(|((from, to), data)| GraphEdge {
                    from: GraphNode { handle: from },
                    to: GraphNode { handle: to },
                    data: data.into(),
                })
                .collect())
        }
    }

    /// Writes a DOT-formatted description of the graph to `path`.
    /// By default this includes the graph topology, node types, node ID, kernel names, and memcpy direction.
    /// `flags` can request more detailed information about each node type, such as parameter values, kernel attributes, node handles, and function handles.
    ///
    /// # Errors
    ///
    /// Returns an error if `path` contains an interior NUL byte or if CUDA
    /// Runtime cannot write the DOT file.
    pub fn write_dot(&self, path: &str, flags: GraphDebugDotFlags) -> Result<()> {
        // CUDA expects a NUL-terminated C string; a path with an interior NUL fails here.
        let c_path = CString::new(path)?;
        let raw_flags = flags.bits();
        // SAFETY: `c_path` lives until the end of this function, so the pointer passed to CUDA
        // stays valid for the whole call.
        unsafe {
            try_ffi!(runtime::cudaGraphDebugDotPrint(
                self.handle,
                c_path.as_ptr(),
                raw_flags,
            ))?;
        }
        Ok(())
    }

    /// Returns the raw `cudaGraph_t` handle wrapped by this graph.
    ///
    /// The handle value is copied out; this wrapper is not consumed and keeps managing the
    /// handle.
    pub const fn as_raw(&self) -> runtime::cudaGraph_t {
        self.handle
    }
}

impl Drop for Graph {
    /// Destroys the underlying CUDA graph when this wrapper owns the handle.
    ///
    /// Errors cannot be propagated out of `drop`, so a failed destroy is reported on stderr in
    /// debug builds and ignored in release builds.
    fn drop(&mut self) {
        // Non-owning wrappers must leave the handle alone.
        if !self.owns_handle {
            return;
        }
        // SAFETY: this wrapper owns `self.handle` (checked above) and `drop` runs at most once,
        // so the handle is destroyed exactly once here.
        let outcome = unsafe { try_ffi!(runtime::cudaGraphDestroy(self.handle)) };
        // `cfg!` is used instead of `#[cfg]` on the statement so that `err` is referenced in
        // every build profile; the attribute form left it unused (and warned) in release builds.
        match outcome {
            Err(err) if cfg!(debug_assertions) => {
                eprintln!("failed to destroy cuda graph: {err}");
            }
            _ => {}
        }
    }
}

/// An instantiated CUDA graph, wrapping a raw `cudaGraphExec_t` handle.
///
/// The handle is destroyed with `cudaGraphExecDestroy` when this value is dropped.
#[derive(Debug)]
pub struct ExecutableGraph {
    /// Raw executable-graph handle passed to the CUDA runtime calls made by this type.
    handle: runtime::cudaGraphExec_t,
}

impl ExecutableGraph {
    /// Returns the flags that were passed to instantiation for the given executable graph.
    /// [`GraphInstantiateFlags::UPLOAD`] is not returned because it does not affect the resulting executable graph.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn flags(&self) -> Result<GraphInstantiateFlags> {
        // Out-parameter that CUDA fills with the raw flag bits.
        let mut raw_bits = 0;
        // SAFETY: `raw_bits` is a live local used as the out-parameter for the call.
        unsafe {
            try_ffi!(runtime::cudaGraphExecGetFlags(
                self.handle,
                &raw mut raw_bits,
            ))?;
        }
        let flags = GraphInstantiateFlags::from_bits_retain(raw_bits);
        Ok(flags)
    }

    /// Executes this executable graph in `stream`.
    /// Only one instance of this executable graph may be executing at a time.
    /// Each launch is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
    /// To execute a graph concurrently, it must be instantiated multiple times into multiple executable graphs.
    ///
    /// If any allocations created by this executable graph remain unfreed from a previous launch and the graph was not instantiated with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`], the launch fails with [`Status::InvalidValue`].
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn launch(&self, stream: &Stream) -> Result<()> {
        let exec_handle = self.handle;
        let raw_stream = stream.as_raw();
        // SAFETY: only plain handles are passed; both come from wrapper objects that are borrowed
        // for the duration of the call.
        unsafe {
            try_ffi!(runtime::cudaGraphLaunch(exec_handle, raw_stream))?;
        }
        Ok(())
    }

    /// Uploads this executable graph to the device in `stream` without executing it.
    /// Uploads of the same executable graph are serialized.
    /// Each upload is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
    /// Uses memory cached by `stream` to back the allocations owned by this executable graph.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics.
    pub fn upload(&self, stream: &Stream) -> Result<()> {
        let exec_handle = self.handle;
        let raw_stream = stream.as_raw();
        // SAFETY: only plain handles are passed; both come from wrapper objects that are borrowed
        // for the duration of the call.
        unsafe {
            try_ffi!(runtime::cudaGraphUpload(exec_handle, raw_stream))?;
        }
        Ok(())
    }

    /// Updates this executable graph with the node parameters in a topologically identical `graph`.
    ///
    /// Limitations:
    ///
    /// * Kernel nodes:
    ///   + The owning context of the kernel function cannot change.
    ///   + A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP.
    ///   + A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
    ///     update calls.
    ///   + A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
    ///   + If the graph was instantiated with [`GraphInstantiateFlags::USE_NODE_PRIORITY`], the priority attribute cannot change.
    ///     Equality
    ///     is checked on the originally requested priority values, before they are clamped to the device's supported range.
    ///   + If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
    ///     If no such calls were
    ///     present at instantiation, these updates cannot be performed at all.
    ///   + Neither the source graph nor this executable graph may contain device-updatable kernel nodes.
    /// * Memset and memcpy nodes:
    ///   + The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
    ///   + The source/destination memory must be allocated from the same contexts as the original source/destination memory.
    ///   + For 2D memsets, only address and assigned value may be updated.
    ///   + For 1D memsets, updating dimensions is also allowed, but may fail if the resulting operation does not map onto the work resources
    ///     already allocated for the node.
    /// * Additional memcpy node restrictions:
    ///   + Changing either the source or destination memory type, such as [`MemoryType::Device`](crate::types::MemoryType::Device) or [`MemoryType::Array`](crate::types::MemoryType::Array), is not supported.
    /// * Conditional nodes:
    ///   + Changing node parameters is not supported.
    ///   + Changing parameters of nodes within the conditional body graph is subject to the rules above.
    ///   + Conditional handle flags and default values are updated as part of the graph update.
    ///
    /// CUDA may add further restrictions in future releases.
    /// [`ExecutableGraph::update`] sets the update result to [`GraphExecUpdateResult::ErrorTopologyChanged`] under the following conditions:
    ///
    /// * The count of nodes directly in the executable graph and the source graph differ.
    /// * The source graph has more exit nodes.
    /// * A node in the source graph has a different number of dependencies than the paired node from the executable graph.
    /// * A node in the source graph has a dependency that does not match the corresponding dependency of the paired node from the executable graph.
    ///   The dependencies are paired based on edge order and
    ///   a dependency does not match when the nodes are already paired based on other edges examined in the graph.
    ///
    /// [`ExecutableGraph::update`] sets the update result to:
    ///
    /// * [`GraphExecUpdateResult::Error`] if passed an invalid value.
    /// * [`GraphExecUpdateResult::ErrorTopologyChanged`] if the graph topology changed.
    /// * [`GraphExecUpdateResult::ErrorNodeTypeChanged`] if the type of a node changed.
    /// * [`GraphExecUpdateResult::ErrorFunctionChanged`] if the kernel function of a node changed (CUDA driver before 11.2).
    /// * [`GraphExecUpdateResult::ErrorUnsupportedFunctionChange`] if the kernel function changed in an unsupported way.
    /// * [`GraphExecUpdateResult::ErrorParametersChanged`] if any parameters to a node changed in a way that is not supported.
    /// * [`GraphExecUpdateResult::ErrorAttributesChanged`] if any attributes of a node changed in a way that is not supported.
    /// * [`GraphExecUpdateResult::ErrorNotSupported`] if something about a node is unsupported, like the node's type or configuration.
    ///
    /// If the update fails for a reason not listed above, the result is [`GraphExecUpdateResult::Error`].
    /// If the update succeeds, the result is [`GraphExecUpdateResult::Success`].
    ///
    /// [`ExecutableGraph::update`] succeeds when the update was performed successfully.
    /// It returns [`Status::GraphExecUpdateFailure`] if the graph update was not performed because it included changes which violated constraints specific to instantiated graph update.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph update, if the update violates instantiated graph
    /// update constraints, or if a previous asynchronous launch reported an error. CUDA may also
    /// return initialization-related errors such as [`Status::NotInitialized`],
    /// [`Status::CallRequiresNewerDriver`], or [`Status::NoDevice`] if this call initializes
    /// internal runtime state. Callbacks must not call CUDA functions; see
    /// [`Stream::add_callback`].
    pub fn update(&mut self, graph: &Graph) -> Result<ExecutableGraphUpdate> {
        let mut result_info = runtime::cudaGraphExecUpdateResultInfo::default();
        unsafe {
            try_ffi!(runtime::cudaGraphExecUpdate(
                self.handle,
                graph.handle,
                &raw mut result_info,
            ))?;
        }
        Ok(result_info.into())
    }

    /// Sets the parameters of a kernel node in this executable graph.
    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
    ///
    /// `node` must not have been removed from the original graph.
    /// All node parameters may change, but the following restrictions apply to function updates:
    ///
    /// * The owning device of the kernel function cannot change.
    /// * A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP
    /// * A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
    ///   update calls.
    /// * If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
    ///   If no such calls were
    ///   present at instantiation, these updates cannot be performed at all.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// If `node` is a device-updatable kernel node, the next upload or launch of this executable graph will overwrite any previous device-side updates.
    /// Additionally, applying host updates to a device-updatable kernel node while it is being updated from the device results in undefined behavior.
    /// This can also be used with a runtime kernel handle queried through [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) and then passed as a raw pointer.
    /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must be registered with the same CUDA Runtime instance.
    /// Passing a symbol that belongs to a different runtime instance results in undefined behavior.
    /// The only type that can be reliably passed to a different runtime instance is the runtime kernel handle type itself.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_kernel_node_params<'a, P>(
        &mut self,
        node: GraphNode,
        function: DeviceFunction,
        config: &LaunchConfig,
        params: P,
    ) -> Result<()>
    where
        P: KernelLaunchArgs<'a>,
    {
        let exec_handle = self.handle;
        let node_handle = node.handle;
        // `with_raw_pointers` hands the kernel arguments to the callback in raw form, so the raw
        // parameter struct is both built and consumed inside it.
        params.with_raw_pointers(|raw_arguments| unsafe {
            let node_params = runtime::cudaKernelNodeParams {
                func: function.as_raw().cast(),
                gridDim: config.grid_dim.into(),
                blockDim: config.block_dim.into(),
                sharedMemBytes: config.shared_memory_bytes as _,
                kernelParams: raw_arguments.as_mut_ptr().cast(),
                // The packed `extra` argument form is not used; arguments go via `kernelParams`.
                extra: ptr::null_mut(),
            };
            try_ffi!(runtime::cudaGraphExecKernelNodeSetParams(
                exec_handle,
                node_handle,
                &raw const node_params,
            ))?;
            Ok(())
        })
    }

    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
    /// `node` must remain in the graph which was used to instantiate this executable graph.
    /// Changed edges to and from `node` are ignored.
    ///
    /// The source and destination must be allocated from the same contexts as the original source and destination memory.
    /// The instantiation-time memory operands must be 1-dimensional.
    /// Zero-length operations are not supported.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Returns [`Status::InvalidValue`] if the memory operands' mappings changed or the original memory operands are multidimensional.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_memcpy_node_1d_params(
        &mut self,
        node: GraphNode,
        params: &Memcpy1DNodeParams,
    ) -> Result<()> {
        let exec_handle = self.handle;
        let node_handle = node.handle;
        // SAFETY: the call only receives handles and the raw operands stored in `params`; this
        // function does not dereference them itself.
        unsafe {
            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams1D(
                exec_handle,
                node_handle,
                params.dst.cast(),
                params.src.cast(),
                params.count as _,
                params.kind.into(),
            ))?;
        }
        Ok(())
    }

    /// Updates a memcpy node to copy between typed device byte buffers.
    ///
    /// The node copies `src.byte_len()` bytes. `dst` must have at least that
    /// many bytes.
    ///
    /// # Errors
    ///
    /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
    /// operation, if a previous asynchronous launch reported an error, or if CUDA
    /// reports runtime initialization diagnostics.
    pub fn set_memcpy_node_1d_device_to_device<D, S>(
        &mut self,
        node: GraphNode,
        dst: &mut D,
        src: &S,
    ) -> Result<()>
    where
        D: ByteBufferMut + ?Sized,
        S: ByteBuffer + ?Sized,
    {
        // The copy length is dictated by the source; refuse destinations that cannot hold it.
        let byte_count = src.byte_len();
        if byte_count > dst.byte_len() {
            return Err(Error::InvalidMemoryAccess);
        }
        // SAFETY: both pointers come from the borrowed buffers, and the size check above ensures
        // `dst` reports at least `byte_count` bytes.
        let params = unsafe {
            let dst_ptr = dst.as_byte_mut_ptr().cast();
            let src_ptr = src.as_byte_ptr().cast();
            Memcpy1DNodeParams::new(
                dst_ptr,
                src_ptr,
                byte_count,
                MemoryCopyKind::DeviceToDevice,
            )
        };
        self.set_memcpy_node_1d_params(node, &params)
    }

    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
    /// `node` must remain in the graph which was used to instantiate this executable graph.
    /// Changed edges to and from `node` are ignored.
    ///
    /// The source and destination memory in `params` must be allocated from the same contexts as the original source and destination memory.
    /// Both the instantiation-time memory operands and the memory operands in `params` must be 1-dimensional.
    /// Zero-length operations are not supported.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Returns [`Status::InvalidValue`] if the memory operands' mappings changed or either the original or new memory operands are multidimensional.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_memcpy_node_params(
        &mut self,
        node: GraphNode,
        params: &Memcpy3DNodeParams,
    ) -> Result<()> {
        // Convert to the raw CUDA parameter struct; CUDA only receives a pointer to this local.
        let raw_params = params.into();
        let node_handle = node.handle;
        // SAFETY: `raw_params` is a live local for the duration of the call and is passed
        // read-only.
        unsafe {
            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams(
                self.handle,
                node_handle,
                &raw const raw_params,
            ))?;
        }
        Ok(())
    }

    /// Updates a memcpy-to-symbol node in this executable graph with the given `params`.
    ///
    /// The fields of `params` are forwarded to `cudaGraphExecMemcpyNodeSetParamsToSymbol`.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying CUDA call reports a failure.
    pub fn set_memcpy_node_to_symbol_params(
        &mut self,
        node: GraphNode,
        params: &MemcpyToSymbolNodeParams,
    ) -> Result<()> {
        let exec_handle = self.handle;
        let node_handle = node.handle;
        // SAFETY: the call only receives handles and the raw operands stored in `params`; this
        // function does not dereference them itself.
        unsafe {
            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsToSymbol(
                exec_handle,
                node_handle,
                params.symbol.cast(),
                params.src.cast(),
                params.count as _,
                params.offset as _,
                params.kind.into(),
            ))?;
        }
        Ok(())
    }

    /// Updates a memcpy-from-symbol node in this executable graph with the given `params`.
    ///
    /// The fields of `params` are forwarded to `cudaGraphExecMemcpyNodeSetParamsFromSymbol`.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying CUDA call reports a failure.
    pub fn set_memcpy_node_from_symbol_params(
        &mut self,
        node: GraphNode,
        params: &MemcpyFromSymbolNodeParams,
    ) -> Result<()> {
        let exec_handle = self.handle;
        let node_handle = node.handle;
        // SAFETY: the call only receives handles and the raw operands stored in `params`; this
        // function does not dereference them itself.
        unsafe {
            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsFromSymbol(
                exec_handle,
                node_handle,
                params.dst.cast(),
                params.symbol.cast(),
                params.count as _,
                params.offset as _,
                params.kind.into(),
            ))?;
        }
        Ok(())
    }

    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
    /// `node` must remain in the graph which was used to instantiate this executable graph.
    /// Changed edges to and from `node` are ignored.
    ///
    /// Zero-sized operations are not supported.
    ///
    /// The new destination pointer in `params` must be to the same kind of allocation as the original destination pointer and have the same context association and device mapping as the original destination pointer.
    ///
    /// Both the value and pointer address may be updated.
    /// Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
    /// Specifically, for 2D memsets, all dimension changes are rejected.
    /// For 1D memsets, changes in height are explicitly rejected and other changes are opportunistically allowed if the resulting work maps onto the work resources already allocated for the node.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_memset_node_params(
        &mut self,
        node: GraphNode,
        params: &MemsetNodeParams,
    ) -> Result<()> {
        // Convert to the raw CUDA parameter struct; CUDA only receives a pointer to this local.
        let raw_params = params.into();
        let node_handle = node.handle;
        // SAFETY: `raw_params` is a live local for the duration of the call and is passed
        // read-only.
        unsafe {
            try_ffi!(runtime::cudaGraphExecMemsetNodeSetParams(
                self.handle,
                node_handle,
                &raw const raw_params,
            ))?;
        }
        Ok(())
    }

    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
    /// `node` must remain in the graph which was used to instantiate this executable graph.
    /// Changed edges to and from `node` are ignored.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_host_node_params(&mut self, node: GraphNode, params: &HostNodeParams) -> Result<()> {
        // Convert to the raw CUDA parameter struct; CUDA only receives a pointer to this local.
        let raw_params = params.into();
        let node_handle = node.handle;
        // SAFETY: `raw_params` is a live local for the duration of the call and is passed
        // read-only.
        unsafe {
            try_ffi!(runtime::cudaGraphExecHostNodeSetParams(
                self.handle,
                node_handle,
                &raw const raw_params,
            ))?;
        }
        Ok(())
    }

    /// Sets the event of an event record node in this executable graph.
    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_event_record_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
        let node_handle = node.handle;
        let raw_event = event.as_raw();
        // SAFETY: only plain handles are passed; no Rust-owned memory is shared with CUDA here.
        unsafe {
            try_ffi!(runtime::cudaGraphExecEventRecordNodeSetEvent(
                self.handle,
                node_handle,
                raw_event,
            ))?;
        }
        Ok(())
    }

    /// Updates the work represented by `node` in this executable graph as though the nodes contained in `node`'s graph had the parameters contained in `child_graph`'s nodes at instantiation.
    /// `node` must remain in the graph which was used to instantiate this executable graph.
    /// Changed edges to and from `node` are ignored.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// The topology of `child_graph`, as well as the node insertion order, must match that of the graph contained in `node`.
    /// See [`ExecutableGraph::update`] for a list of restrictions on what can be updated in an instantiated graph.
    /// The update is recursive, so child graph nodes contained within the top-level child graph are also updated.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_child_graph_node(&mut self, node: GraphNode, child_graph: &Graph) -> Result<()> {
        let node_handle = node.handle;
        let child_handle = child_graph.handle;
        // SAFETY: only plain handles are passed; no Rust-owned memory is shared with CUDA here.
        unsafe {
            try_ffi!(runtime::cudaGraphExecChildGraphNodeSetParams(
                self.handle,
                node_handle,
                child_handle,
            ))?;
        }
        Ok(())
    }

    /// Sets the event of an event wait node in this executable graph.
    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn set_event_wait_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
        let node_handle = node.handle;
        let raw_event = event.as_raw();
        // SAFETY: only plain handles are passed; no Rust-owned memory is shared with CUDA here.
        unsafe {
            try_ffi!(runtime::cudaGraphExecEventWaitNodeSetEvent(
                self.handle,
                node_handle,
                raw_event,
            ))?;
        }
        Ok(())
    }

    /// Sets `node` to be either enabled or disabled.
    /// Disabled nodes are functionally equivalent to empty nodes until they are reenabled.
    /// Existing node parameters are not affected by disabling/enabling the node.
    ///
    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
    ///
    /// `node` must not have been removed from the original graph.
    ///
    /// The modifications only affect future launches of this executable graph.
    /// Already enqueued or running launches of this executable graph are not affected by this call.
    /// The original `node` is also not modified by this call.
    ///
    /// Currently only kernel, memset and memcpy nodes are supported.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    fn set_node_enabled(&mut self, node: GraphNode, enabled: bool) -> Result<()> {
        // CUDA takes the state as an unsigned integer: 1 for enabled, 0 for disabled.
        let enabled_flag = u32::from(enabled);
        let node_handle = node.handle;
        // SAFETY: only plain handles and an integer are passed; no Rust-owned memory is shared
        // with CUDA here.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeSetEnabled(
                self.handle,
                node_handle,
                enabled_flag,
            ))?;
        }
        Ok(())
    }

    /// Enables `node` in this executable graph.
    ///
    /// Delegates to the private `set_node_enabled` helper with `enabled = true`, which calls
    /// `cudaGraphNodeSetEnabled`.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying CUDA call reports a failure.
    pub fn enable_node(&mut self, node: GraphNode) -> Result<()> {
        self.set_node_enabled(node, true)
    }

    /// Disables `node` in this executable graph.
    ///
    /// Delegates to the private `set_node_enabled` helper with `enabled = false`, which calls
    /// `cudaGraphNodeSetEnabled`.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying CUDA call reports a failure.
    pub fn disable_node(&mut self, node: GraphNode) -> Result<()> {
        self.set_node_enabled(node, false)
    }

    /// Returns whether `node` is enabled.
    ///
    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
    ///
    /// `node` must not have been removed from the original graph.
    ///
    /// Currently only kernel, memset and memcpy nodes are supported.
    ///
    /// Graph objects are not threadsafe.
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
    /// not call CUDA functions; see [`Stream::add_callback`].
    pub fn is_node_enabled(&self, node: GraphNode) -> Result<bool> {
        let node_handle = node.handle;
        // Out-parameter that CUDA fills with the node state as an integer.
        let mut raw_state = 0;
        // SAFETY: `raw_state` is a live local used as the out-parameter; the other arguments are
        // plain handles.
        unsafe {
            try_ffi!(runtime::cudaGraphNodeGetEnabled(
                self.handle,
                node_handle,
                &raw mut raw_state,
            ))?;
        }
        // Any non-zero value is treated as "enabled".
        let is_enabled = raw_state != 0;
        Ok(is_enabled)
    }

    /// Returns the raw `cudaGraphExec_t` handle wrapped by this executable graph.
    ///
    /// The handle value is copied out; this wrapper is not consumed and keeps managing the
    /// handle.
    pub const fn as_raw(&self) -> runtime::cudaGraphExec_t {
        self.handle
    }
}

impl Drop for ExecutableGraph {
    /// Destroys the wrapped CUDA executable graph.
    ///
    /// `drop` cannot return an error, so a failed destroy is printed to stderr when debug
    /// assertions are enabled and silently ignored otherwise.
    fn drop(&mut self) {
        // SAFETY: the handle is passed to CUDA by value; this wrapper is being dropped, so it
        // is not used through `self` again. Handle validity is assumed from the wrapper type.
        let outcome = unsafe { try_ffi!(runtime::cudaGraphExecDestroy(self.handle)) };
        if cfg!(debug_assertions) {
            // `cfg!` (rather than `#[cfg]` on the statement) keeps this branch type-checked in
            // every profile, so `err` is never an unused binding when debug assertions are off.
            if let Err(err) = outcome {
                eprintln!("failed to destroy cuda graph exec: {err}");
            }
        }
    }
}

/// Rust-side view of CUDA's `cudaGraphExecUpdateResultInfo`.
///
/// Built through the `From<runtime::cudaGraphExecUpdateResultInfo>` impl, which converts the
/// result code and turns null node pointers into `None`. See the CUDA documentation of
/// `cudaGraphExecUpdateResultInfo` for the meaning CUDA assigns to each field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExecutableGraphUpdate {
    /// Update result, converted from the raw `result` field.
    pub result: GraphExecUpdateResult,
    /// Node wrapped from the raw `errorNode` pointer, or `None` when that pointer is null.
    pub error_node: Option<GraphNode>,
    /// Node wrapped from the raw `errorFromNode` pointer, or `None` when that pointer is null.
    pub error_from_node: Option<GraphNode>,
}

impl From<runtime::cudaGraphExecUpdateResultInfo> for ExecutableGraphUpdate {
    fn from(value: runtime::cudaGraphExecUpdateResultInfo) -> Self {
        Self {
            result: value.result.into(),
            error_node: if value.errorNode.is_null() {
                None
            } else {
                Some(GraphNode {
                    handle: value.errorNode,
                })
            },
            error_from_node: if value.errorFromNode.is_null() {
                None
            } else {
                Some(GraphNode {
                    handle: value.errorFromNode,
                })
            },
        }
    }
}

/// Parameters for a CUDA graph memset node.
///
/// Converted field-for-field into `driver::CUDA_MEMSET_NODE_PARAMS` by the
/// `From<&MemsetNodeParams>` impl. All fields are public; [`MemsetNodeParams::new`] fills in
/// `pitch`, `value` and `height` with fixed defaults that callers may overwrite afterwards.
#[derive(Debug, Clone, Copy)]
pub struct MemsetNodeParams {
    /// Destination device pointer (forwarded as `dst`).
    pub dst: DevicePtr,
    /// Forwarded as `pitch`; `new` initialises it to 0.
    // NOTE(review): presumably the destination row pitch, relevant only when `height > 1` —
    // confirm against the CUDA documentation.
    pub pitch: usize,
    /// Forwarded as `value`; `new` initialises it to 0.
    pub value: u32,
    /// Size of one element in bytes (forwarded as `elementSize`).
    pub element_size: u32,
    /// Width in elements (forwarded as `width`); `new` requires `dst` to be writable for
    /// `element_size * width` bytes.
    pub width: usize,
    /// Forwarded as `height`; `new` initialises it to 1.
    // NOTE(review): presumably the number of rows — confirm against the CUDA documentation.
    pub height: usize,
}

impl MemsetNodeParams {
    /// Builds memset node parameters from a destination, an element size and a width.
    ///
    /// The remaining fields start out as `pitch = 0`, `value = 0` and `height = 1`. They are
    /// public, so callers can overwrite them on the returned value.
    ///
    /// # Safety
    ///
    /// When the graph executes, `dst` must be valid for writes of `element_size * width`
    /// bytes. A caller that changes `height` or `pitch` after construction is responsible
    /// for meeting whatever additional requirements CUDA places on those values.
    pub const unsafe fn new(dst: DevicePtr, element_size: u32, width: usize) -> Self {
        Self {
            // Caller-supplied values.
            dst,
            element_size,
            width,
            // Fixed defaults; see the method docs.
            pitch: 0,
            value: 0,
            height: 1,
        }
    }
}

impl From<&MemsetNodeParams> for driver::CUDA_MEMSET_NODE_PARAMS {
    fn from(value: &MemsetNodeParams) -> Self {
        Self {
            dst: value.dst.as_ptr() as _,
            pitch: value.pitch as _,
            value: value.value,
            elementSize: value.element_size,
            width: value.width as _,
            height: value.height as _,
        }
    }
}

/// Copyable newtype around a raw `runtime::cudaArray_t` handle.
///
/// `#[repr(transparent)]` gives it the same layout as the wrapped handle. Because the type
/// is `Copy` it cannot implement `Drop`, so it never destroys the array it refers to.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct ArrayHandle(runtime::cudaArray_t);

impl ArrayHandle {
    pub const unsafe fn from_raw(handle: runtime::cudaArray_t) -> Self {
        Self(handle)
    }

    pub const fn as_raw(self) -> runtime::cudaArray_t {
        self.0
    }
}