Skip to main content

singe_cuda/graph/
mod.rs

1pub mod raw;
2
3use std::{
4    any::Any,
5    ffi::CString,
6    fmt::{self, Display, Formatter},
7    hash::{Hash, Hasher},
8    marker::PhantomData,
9    mem::ManuallyDrop,
10    ops::Deref,
11    ptr,
12    sync::{
13        Arc,
14        atomic::{AtomicU64, Ordering},
15    },
16};
17
18use num_enum::{IntoPrimitive, TryFromPrimitive};
19use singe_core::{impl_enum_conversion, impl_enum_display};
20use singe_cuda_sys::{driver, runtime};
21
22use crate::{
23    context::Context,
24    dim::Dim3,
25    error::{Error, Result},
26    event::Event,
27    graph::raw::{HostNodeParams, MemoryCopyFromSymbolNodeParams, MemoryCopyToSymbolNodeParams},
28    memory::{DeviceMemory, MemoryAccessDescriptor, MemoryCopyKind, MemoryPoolProps},
29    module::{KernelLaunchArgs, LaunchConfig},
30    stream::Stream,
31    try_ffi,
32    types::{DeviceFunction, DevicePtr},
33    view::{ByteBuffer, ByteBufferMut, DeviceRepr},
34};
35use raw::{MemoryCopy1DNodeParams, MemoryCopy3DNodeParams};
36
/// Identifiers for [`GraphKernelNodeAttribute`] values used by CUDA graph kernel nodes.
///
/// Every discriminant is taken from the matching
/// `runtime::cudaLaunchAttributeID` constant, and the enum is `#[repr(u32)]`,
/// so a value can be turned into (and parsed back from) the raw `u32` id via
/// the `num_enum` derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphKernelNodeAttributeId {
    /// Identifies [`GraphKernelNodeAttribute::Cooperative`].
    Cooperative = runtime::cudaLaunchAttributeID::cudaLaunchAttributeCooperative as _,
    /// Identifies [`GraphKernelNodeAttribute::ClusterDimension`].
    ClusterDimension = runtime::cudaLaunchAttributeID::cudaLaunchAttributeClusterDimension as _,
    /// Identifies [`GraphKernelNodeAttribute::Priority`].
    Priority = runtime::cudaLaunchAttributeID::cudaLaunchAttributePriority as _,
    /// Identifies [`GraphKernelNodeAttribute::PreferredSharedMemoryCarveout`].
    /// The value is a percentage in the range `0..=100` describing the preferred
    /// shared-memory carveout for the launch. This is a hint, and the driver
    /// may choose a different configuration if required.
    PreferredSharedMemoryCarveout =
        runtime::cudaLaunchAttributeID::cudaLaunchAttributePreferredSharedMemoryCarveout as _,
}

// Conversion glue between `GraphKernelNodeAttributeId`, `u32`, and
// `runtime::cudaLaunchAttributeID`. The exact impls are produced by
// `singe_core::impl_enum_conversion!`, which is not visible in this file;
// the kernel-attribute methods below rely on `id.into()` yielding the raw id.
impl_enum_conversion!(
    u32,
    runtime::cudaLaunchAttributeID,
    GraphKernelNodeAttributeId
);
61
/// A kernel-node attribute together with its value.
///
/// Each variant corresponds to the [`GraphKernelNodeAttributeId`] variant of
/// the same name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum GraphKernelNodeAttribute {
    /// Cooperative-launch flag; passed to CUDA as `0`/`1` and read back as
    /// "non-zero means `true`".
    Cooperative(bool),
    /// Cluster dimension, given as its `x`, `y`, and `z` components.
    ClusterDimension(Dim3),
    /// Priority value, passed through to CUDA unchanged.
    Priority(i32),
    /// Preferred shared-memory carveout (documented on the id enum as a
    /// percentage in `0..=100`).
    PreferredSharedMemoryCarveout(u32),
}
70
/// Result of querying a memory-allocation node (see
/// [`GraphNode::mem_alloc_node_info`]).
///
/// Besides the allocation itself, it carries the graph identity and context of
/// the node it was obtained from.
#[derive(Debug, Clone)]
pub struct MemoryAllocationNodeInfo {
    /// Device address reported by the allocation node.
    ptr: DevicePtr,
    /// Size of the allocation in bytes.
    pub byte_size: usize,
    /// Identity of the owning graph, if the node was associated with one.
    graph_id: Option<GraphId>,
    /// Shared handle to the owning graph's inner state; never read through
    /// this field (hence the underscore) — presumably held only to keep the
    /// graph alive, confirm against `GraphInner`.
    _graph: Option<Arc<GraphInner>>,
    /// Context of the originating node, if any.
    ctx: Option<Arc<Context>>,
}
79
bitflags::bitflags! {
    /// Bit flags for graph instantiation.
    ///
    /// Each constant takes its bit value from the driver enum
    /// `CUgraphInstantiate_flags`; see the CUDA documentation for the meaning
    /// of the individual flags.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphInstantiateFlags: u64 {
        const AUTO_FREE_ON_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH as _;
        const UPLOAD = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD as _;
        const DEVICE_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH as _;
        const USE_NODE_PRIORITY = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY as _;
    }
}
89
bitflags::bitflags! {
    /// Bit flags selecting what a graph debug DOT dump contains.
    ///
    /// Each constant takes its bit value from the driver enum
    /// `CUgraphDebugDot_flags`; the Rust names spell out the abbreviations
    /// used by the driver (`EXT_SEMAS` -> `EXTERNAL_SEMAPHORE`,
    /// `MEM_ALLOC` -> `MEMORY_ALLOC`, `TOPO` -> `TOPOLOGY`, ...).
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphDebugDotFlags: u32 {
        const VERBOSE = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE as _;
        const RUNTIME_TYPES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES as _;
        const KERNEL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS as _;
        const MEMCPY_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS as _;
        const MEMSET_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS as _;
        const HOST_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS as _;
        const EVENT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_SIGNAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_WAIT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS as _;
        const KERNEL_NODE_ATTRIBUTES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES as _;
        const HANDLES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES as _;
        const MEMORY_ALLOC_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS as _;
        const MEMORY_FREE_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS as _;
        const BATCH_MEM_OP_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS as _;
        const EXTRA_TOPOLOGY_INFO = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO as _;
        const CONDITIONAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS as _;
    }
}
111
/// Kind of a CUDA graph node, as returned by [`GraphNode::node_type`].
///
/// Discriminants are taken from the corresponding `driver::CUgraphNodeType`
/// constants; the enum is `#[repr(u32)]` and `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphNodeType {
    Kernel = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL as _,
    Memcpy = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMCPY as _,
    Memset = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMSET as _,
    Host = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_HOST as _,
    Graph = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_GRAPH as _,
    Empty = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EMPTY as _,
    WaitEvent = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_WAIT_EVENT as _,
    EventRecord = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EVENT_RECORD as _,
    ExternalSemaphoresSignal = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL as _,
    ExternalSemaphoresWait = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT as _,
    MemoryAlloc = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_ALLOC as _,
    MemoryFree = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_FREE as _,
    BatchMemOp = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_BATCH_MEM_OP as _,
    Conditional = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_CONDITIONAL as _,
}

// Conversion glue between `GraphNodeType`, `u32`, and
// `runtime::cudaGraphNodeType`, produced by `singe_core::impl_enum_conversion!`
// (macro body not visible here). `GraphNode::node_type` relies on it to turn
// the raw value reported by CUDA into this enum.
impl_enum_conversion!(u32, runtime::cudaGraphNodeType, GraphNodeType);
133
/// Type of a dependency edge between two graph nodes.
///
/// Stored as a `u8` (`#[repr(u8)]`), which is how the value appears in the
/// `type_` field of `runtime::cudaGraphEdgeData`; discriminants are taken
/// from `driver::CUgraphDependencyType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u8)]
#[non_exhaustive]
pub enum GraphDependencyType {
    /// Corresponds to `CU_GRAPH_DEPENDENCY_TYPE_DEFAULT`.
    Default = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT as _,
    /// Corresponds to `CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC`.
    Programmatic = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC as _,
}
141
142impl From<driver::CUgraphDependencyType> for GraphDependencyType {
143    fn from(value: driver::CUgraphDependencyType) -> Self {
144        match value {
145            driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT => Self::Default,
146            driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC => {
147                Self::Programmatic
148            }
149        }
150    }
151}
152
153impl From<GraphDependencyType> for driver::CUgraphDependencyType {
154    fn from(value: GraphDependencyType) -> Self {
155        match value {
156            GraphDependencyType::Default => Self::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT,
157            GraphDependencyType::Programmatic => Self::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC,
158        }
159    }
160}
161
// Human-readable names for `GraphNodeType`, one string per variant, fed to
// `singe_core::impl_enum_display!` (presumably generating the `Display` impl;
// the macro body is not visible here). The strings follow the CUDA runtime
// `cudaGraphNodeType*` naming pattern.
//
// NOTE(review): the CUDA runtime spells the external-semaphore enumerators
// `cudaGraphNodeTypeExtSemaphoreSignal` / `cudaGraphNodeTypeExtSemaphoreWait`,
// and has no `cudaGraphNodeTypeBatchMemOp`. Confirm whether the spellings used
// below are intentional before anything starts matching on this output.
impl_enum_display!(GraphNodeType, {
    Self::Kernel => "cudaGraphNodeTypeKernel",
    Self::Memcpy => "cudaGraphNodeTypeMemcpy",
    Self::Memset => "cudaGraphNodeTypeMemset",
    Self::Host => "cudaGraphNodeTypeHost",
    Self::Graph => "cudaGraphNodeTypeGraph",
    Self::Empty => "cudaGraphNodeTypeEmpty",
    Self::WaitEvent => "cudaGraphNodeTypeWaitEvent",
    Self::EventRecord => "cudaGraphNodeTypeEventRecord",
    Self::ExternalSemaphoresSignal => "cudaGraphNodeTypeExternalSemaphoresSignal",
    Self::ExternalSemaphoresWait => "cudaGraphNodeTypeExternalSemaphoresWait",
    Self::MemoryAlloc => "cudaGraphNodeTypeMemAlloc",
    Self::MemoryFree => "cudaGraphNodeTypeMemFree",
    Self::BatchMemOp => "cudaGraphNodeTypeBatchMemOp",
    Self::Conditional => "cudaGraphNodeTypeConditional",
});
178
/// Outcome of updating an executable graph.
///
/// Discriminants are taken from the corresponding
/// `driver::CUgraphExecUpdateResult` constants; see the CUDA documentation for
/// the precise meaning of each failure reason.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphExecUpdateResult {
    Success = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_SUCCESS as _,
    Error = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR as _,
    ErrorTopologyChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED as _,
    ErrorNodeTypeChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED as _,
    ErrorFunctionChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED as _,
    ErrorParametersChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED as _,
    ErrorNotSupported =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED as _,
    ErrorUnsupportedFunctionChange =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE
            as _,
    ErrorAttributesChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED as _,
}

// Conversion glue between `GraphExecUpdateResult` and the raw
// `driver::CUgraphExecUpdateResult`, produced by
// `singe_core::impl_enum_conversion!` (two-argument form; macro body not
// visible here).
impl_enum_conversion!(driver::CUgraphExecUpdateResult, GraphExecUpdateResult);
203
// Human-readable names for `GraphExecUpdateResult`: each variant is shown as
// the driver constant it is defined from. Fed to
// `singe_core::impl_enum_display!` (presumably generating the `Display` impl;
// the macro body is not visible here).
impl_enum_display!(GraphExecUpdateResult, {
    Self::Success => "CU_GRAPH_EXEC_UPDATE_SUCCESS",
    Self::Error => "CU_GRAPH_EXEC_UPDATE_ERROR",
    Self::ErrorTopologyChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED",
    Self::ErrorNodeTypeChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED",
    Self::ErrorFunctionChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED",
    Self::ErrorParametersChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED",
    Self::ErrorNotSupported => "CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED",
    Self::ErrorUnsupportedFunctionChange => "CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE",
    Self::ErrorAttributesChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED",
});
215
/// Token referring to a node of a CUDA graph.
///
/// Cloning copies the raw handle and shares the graph/context references; it
/// does not create a new CUDA node. Equality and hashing consider only the
/// raw handle and the graph identity.
#[derive(Debug, Clone)]
pub struct GraphNode {
    /// Raw CUDA node handle.
    handle: runtime::cudaGraphNode_t,
    /// Identity of the owning graph; `None` for nodes built with
    /// [`GraphNode::from_raw`].
    graph_id: Option<GraphId>,
    /// Shared reference to the owning graph's inner state, when known.
    graph: Option<Arc<GraphInner>>,
    /// Context that is bound before each CUDA call made through this node,
    /// when present.
    ctx: Option<Arc<Context>>,
}
223
224impl PartialEq for GraphNode {
225    fn eq(&self, other: &Self) -> bool {
226        self.handle == other.handle && self.graph_id == other.graph_id
227    }
228}
229impl Eq for GraphNode {}
230
231impl Hash for GraphNode {
232    fn hash<H: Hasher>(&self, state: &mut H) {
233        self.handle.hash(state);
234        self.graph_id.hash(state);
235    }
236}
237
/// Extra data attached to a dependency edge.
///
/// Converts to and from `runtime::cudaGraphEdgeData` field by field; the
/// default value is port `0` on both ends with
/// [`GraphDependencyType::Default`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphEdgeData {
    /// Port on the upstream node (`from_port` of the raw struct).
    pub from_port: u8,
    /// Port on the downstream node (`to_port` of the raw struct).
    pub to_port: u8,
    /// Kind of dependency the edge represents.
    pub dependency_type: GraphDependencyType,
}
244
/// One entry of a node's dependency list: the node at the other end of the
/// edge plus the edge's data.
///
/// Returned by [`GraphNode::dependencies`] and
/// [`GraphNode::dependent_nodes`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphDependency {
    /// The node at the other end of the edge.
    pub node: GraphNode,
    /// Data describing the edge itself.
    pub data: GraphEdgeData,
}
250
/// A directed edge of a graph: source node, destination node, and edge data.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphEdge {
    /// Node the edge starts from.
    pub from: GraphNode,
    /// Node the edge points to.
    pub to: GraphNode,
    /// Data describing the edge itself.
    pub data: GraphEdgeData,
}
257
/// Plain counters summarising a graph's topology.
///
/// All fields default to `0`. The per-type counters line up by name with the
/// [`GraphNodeType`] variants; how the summary is computed is not shown in
/// this part of the file.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct GraphTopologySummary {
    /// Total number of nodes.
    pub nodes: usize,
    /// Number of root nodes.
    pub root_nodes: usize,
    /// Number of edges.
    pub edges: usize,
    // Per-node-type counters follow.
    pub kernel_nodes: usize,
    pub memcpy_nodes: usize,
    pub memset_nodes: usize,
    pub host_nodes: usize,
    pub child_graph_nodes: usize,
    pub empty_nodes: usize,
    pub wait_event_nodes: usize,
    pub event_record_nodes: usize,
    pub external_semaphores_signal_nodes: usize,
    pub external_semaphores_wait_nodes: usize,
    pub memory_alloc_nodes: usize,
    pub memory_free_nodes: usize,
    pub batch_mem_op_nodes: usize,
    pub conditional_nodes: usize,
}
278
/// A three-component position; converts into `runtime::cudaPos`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Position {
    pub x: usize,
    pub y: usize,
    pub z: usize,
}
285
/// A three-component extent; converts into `runtime::cudaExtent`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Extent {
    pub width: usize,
    pub height: usize,
    pub depth: usize,
}
292
/// Parameters describing a memory allocation node.
///
/// The access descriptors are borrowed, so the value cannot outlive the slice
/// it was built from. Where these parameters are consumed is not shown in this
/// part of the file.
#[derive(Debug, Clone)]
pub struct MemoryAllocationNodeParams<'a> {
    /// Properties of the memory pool to allocate from.
    pub pool_props: MemoryPoolProps,
    /// Access descriptors for the allocation.
    pub access_descs: &'a [MemoryAccessDescriptor],
    /// Requested allocation size in bytes.
    pub byte_size: usize,
}
299
300impl Default for GraphEdgeData {
301    fn default() -> Self {
302        Self {
303            from_port: 0,
304            to_port: 0,
305            dependency_type: GraphDependencyType::Default,
306        }
307    }
308}
309
310impl From<runtime::cudaGraphEdgeData> for GraphEdgeData {
311    fn from(value: runtime::cudaGraphEdgeData) -> Self {
312        Self {
313            from_port: value.from_port,
314            to_port: value.to_port,
315            dependency_type: GraphDependencyType::try_from(value.type_)
316                .unwrap_or(GraphDependencyType::Default),
317        }
318    }
319}
320
321impl From<GraphEdgeData> for runtime::cudaGraphEdgeData {
322    fn from(value: GraphEdgeData) -> Self {
323        Self {
324            from_port: value.from_port,
325            to_port: value.to_port,
326            type_: value.dependency_type.into(),
327            reserved: [0; 5],
328        }
329    }
330}
331
332impl From<Position> for runtime::cudaPos {
333    fn from(value: Position) -> Self {
334        Self {
335            x: value.x as _,
336            y: value.y as _,
337            z: value.z as _,
338        }
339    }
340}
341
342impl From<Extent> for runtime::cudaExtent {
343    fn from(value: Extent) -> Self {
344        Self {
345            width: value.width as _,
346            height: value.height as _,
347            depth: value.depth as _,
348        }
349    }
350}
351
352impl GraphNode {
353    /// Wraps an existing CUDA graph node handle.
354    ///
355    /// The returned node is not associated with any [`Graph`] identity, so
356    /// graph and executable-graph methods cannot validate that it belongs to
357    /// the target graph before calling CUDA.
358    ///
359    /// # Safety
360    ///
361    /// `handle` must be a valid CUDA graph node handle. The caller must ensure
362    /// the node remains valid for every operation using the returned token and
363    /// that it belongs to the graph or executable graph passed to those
364    /// operations.
365    pub const unsafe fn from_raw(handle: runtime::cudaGraphNode_t) -> Self {
366        Self {
367            handle,
368            graph_id: None,
369            graph: None,
370            ctx: None,
371        }
372    }
373
374    fn from_raw_in_graph(
375        handle: runtime::cudaGraphNode_t,
376        graph_id: GraphId,
377        graph: Arc<GraphInner>,
378        ctx: Option<Arc<Context>>,
379    ) -> Self {
380        Self {
381            handle,
382            graph_id: Some(graph_id),
383            graph: Some(graph),
384            ctx,
385        }
386    }
387
388    fn from_raw_like(handle: runtime::cudaGraphNode_t, node: &Self) -> Self {
389        Self {
390            handle,
391            graph_id: node.graph_id,
392            graph: node.graph.clone(),
393            ctx: node.ctx.clone(),
394        }
395    }
396
397    fn bind_context(&self) -> Result<()> {
398        if let Some(ctx) = &self.ctx {
399            ctx.bind()?;
400        }
401        Ok(())
402    }
403
404    /// Returns the node type.
405    ///
406    /// Graph objects are not threadsafe.
407    ///
408    /// # Errors
409    ///
410    /// Returns an error if CUDA cannot query the node type or if a previous asynchronous launch
411    /// reported an error. CUDA may also return initialization-related errors such as
412    /// [`crate::error::Status::NotInitialized`], [`crate::error::Status::CallRequiresNewerDriver`], or
413    /// [`crate::error::Status::NoDevice`] if this call initializes internal runtime state. Callbacks must not
414    /// call CUDA functions; see [`Stream::add_callback`].
415    pub fn node_type(&self) -> Result<GraphNodeType> {
416        self.bind_context()?;
417        let mut kind = runtime::cudaGraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL;
418        unsafe {
419            try_ffi!(runtime::cudaGraphNodeGetType(self.as_raw(), &raw mut kind))?;
420        }
421        Ok(kind.into())
422    }
423
424    /// Returns this node's dependencies.
425    ///
426    /// Graph objects are not threadsafe.
427    ///
428    /// # Errors
429    ///
430    /// Returns an error if CUDA cannot query the dependencies, a previous
431    /// asynchronous launch reports an error, or CUDA reports runtime
432    /// initialization diagnostics.
433    pub fn dependencies(&self) -> Result<Vec<GraphDependency>> {
434        self.bind_context()?;
435        unsafe {
436            let mut count = 0;
437            try_ffi!(runtime::cudaGraphNodeGetDependencies(
438                self.as_raw(),
439                ptr::null_mut(),
440                ptr::null_mut(),
441                &raw mut count,
442            ))?;
443
444            if count == 0 {
445                return Ok(Vec::new());
446            }
447
448            let mut handles = Vec::with_capacity(count as usize);
449            let mut edge_data = Vec::with_capacity(count as usize);
450            try_ffi!(runtime::cudaGraphNodeGetDependencies(
451                self.as_raw(),
452                handles.as_mut_ptr(),
453                edge_data.as_mut_ptr(),
454                &raw mut count,
455            ))?;
456            handles.set_len(count as usize);
457            edge_data.set_len(count as usize);
458
459            Ok(handles
460                .into_iter()
461                .zip(edge_data)
462                .map(|(handle, data)| GraphDependency {
463                    node: Self::from_raw_like(handle, self),
464                    data: data.into(),
465                })
466                .collect())
467        }
468    }
469
470    /// Returns this node's dependent nodes.
471    ///
472    /// Graph objects are not threadsafe.
473    ///
474    /// # Errors
475    ///
476    /// Returns an error if CUDA cannot query the dependent nodes, a previous
477    /// asynchronous launch reports an error, or CUDA reports runtime
478    /// initialization diagnostics.
479    pub fn dependent_nodes(&self) -> Result<Vec<GraphDependency>> {
480        self.bind_context()?;
481        unsafe {
482            let mut count = 0;
483            try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
484                self.as_raw(),
485                ptr::null_mut(),
486                ptr::null_mut(),
487                &raw mut count,
488            ))?;
489
490            if count == 0 {
491                return Ok(Vec::new());
492            }
493
494            let mut handles = Vec::with_capacity(count as usize);
495            let mut edge_data = Vec::with_capacity(count as usize);
496            try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
497                self.as_raw(),
498                handles.as_mut_ptr(),
499                edge_data.as_mut_ptr(),
500                &raw mut count,
501            ))?;
502            handles.set_len(count as usize);
503            edge_data.set_len(count as usize);
504
505            Ok(handles
506                .into_iter()
507                .zip(edge_data)
508                .map(|(handle, data)| GraphDependency {
509                    node: Self::from_raw_like(handle, self),
510                    data: data.into(),
511                })
512                .collect())
513        }
514    }
515
516    /// Returns the event of this event record node.
517    ///
518    /// Graph objects are not threadsafe.
519    ///
520    /// # Errors
521    ///
522    /// Returns an error if this is not an event-record node, CUDA cannot query
523    /// the event, CUDA returns a null event handle, a previous asynchronous
524    /// launch reports an error, or CUDA reports runtime initialization
525    /// diagnostics.
526    pub fn event_record_node_event(&self) -> Result<runtime::cudaEvent_t> {
527        self.bind_context()?;
528        let mut event = ptr::null_mut();
529        unsafe {
530            try_ffi!(runtime::cudaGraphEventRecordNodeGetEvent(
531                self.as_raw(),
532                &raw mut event,
533            ))?;
534        }
535        if event.is_null() {
536            return Err(Error::NullHandle);
537        }
538        Ok(event)
539    }
540
541    /// Returns the event of this event wait node.
542    ///
543    /// Graph objects are not threadsafe.
544    ///
545    /// # Errors
546    ///
547    /// Returns an error if this is not an event-wait node, CUDA cannot query the
548    /// event, CUDA returns a null event handle, a previous asynchronous launch
549    /// reports an error, or CUDA reports runtime initialization diagnostics.
550    pub fn event_wait_node_event(&self) -> Result<runtime::cudaEvent_t> {
551        self.bind_context()?;
552        let mut event = ptr::null_mut();
553        unsafe {
554            try_ffi!(runtime::cudaGraphEventWaitNodeGetEvent(
555                self.as_raw(),
556                &raw mut event,
557            ))?;
558        }
559        if event.is_null() {
560            return Err(Error::NullHandle);
561        }
562        Ok(event)
563    }
564
565    /// Returns a borrowed handle to the embedded graph in a child graph node.
566    /// This does not clone the graph.
567    /// Changes to the returned graph are reflected in the node, and the child
568    /// node retains ownership of the embedded graph handle.
569    /// The returned [`BorrowedGraph`] is tied to this node borrow and does not
570    /// destroy the embedded graph when dropped.
571    ///
572    /// Allocation and free nodes cannot be added to the returned graph.
573    /// Attempting to do so returns an error.
574    ///
575    /// Graph objects are not threadsafe.
576    ///
577    /// # Errors
578    ///
579    /// Returns an error if this is not a child-graph node, CUDA cannot query the
580    /// child graph, CUDA returns a null graph handle, a previous asynchronous
581    /// launch reports an error, or CUDA reports runtime initialization
582    /// diagnostics.
583    pub fn child_graph(&self) -> Result<BorrowedGraph<'_>> {
584        self.bind_context()?;
585        let mut graph = ptr::null_mut();
586        unsafe {
587            try_ffi!(runtime::cudaGraphChildGraphNodeGetGraph(
588                self.as_raw(),
589                &raw mut graph,
590            ))?;
591        }
592        if graph.is_null() {
593            return Err(Error::NullHandle);
594        }
595        unsafe { BorrowedGraph::from_raw_in_context(graph, self.ctx.clone()) }
596    }
597
598    /// Returns the parameters of this memcpy node.
599    ///
600    /// Graph objects are not threadsafe.
601    ///
602    /// # Errors
603    ///
604    /// Returns an error if this is not a memcpy node, CUDA cannot query the
605    /// parameters, a previous asynchronous launch reports an error, or CUDA
606    /// reports runtime initialization diagnostics.
607    pub fn memcpy_node_params(&self) -> Result<runtime::cudaMemcpy3DParms> {
608        self.bind_context()?;
609        let mut params = runtime::cudaMemcpy3DParms::default();
610        unsafe {
611            try_ffi!(runtime::cudaGraphMemcpyNodeGetParams(
612                self.as_raw(),
613                &raw mut params,
614            ))?;
615        }
616        Ok(params)
617    }
618
619    /// Returns the parameters of this memset node.
620    ///
621    /// Graph objects are not threadsafe.
622    ///
623    /// # Errors
624    ///
625    /// Returns an error if this is not a memset node, CUDA cannot query the
626    /// parameters, a previous asynchronous launch reports an error, or CUDA
627    /// reports runtime initialization diagnostics.
628    pub fn memset_node_params(&self) -> Result<driver::CUDA_MEMSET_NODE_PARAMS> {
629        self.bind_context()?;
630        let mut params = driver::CUDA_MEMSET_NODE_PARAMS::default();
631        unsafe {
632            try_ffi!(runtime::cudaGraphMemsetNodeGetParams(
633                self.as_raw(),
634                &raw mut params,
635            ))?;
636        }
637        Ok(params)
638    }
639
640    /// Returns the parameters of this host node.
641    ///
642    /// Graph objects are not threadsafe.
643    ///
644    /// # Errors
645    ///
646    /// Returns an error if this is not a host node, CUDA cannot query the
647    /// parameters, a previous asynchronous launch reports an error, or CUDA
648    /// reports runtime initialization diagnostics.
649    pub fn host_node_params(&self) -> Result<driver::CUDA_HOST_NODE_PARAMS> {
650        self.bind_context()?;
651        let mut params = driver::CUDA_HOST_NODE_PARAMS::default();
652        unsafe {
653            try_ffi!(runtime::cudaGraphHostNodeGetParams(
654                self.as_raw(),
655                &raw mut params,
656            ))?;
657        }
658        Ok(params)
659    }
660
661    /// Returns the parameters of a memory allocation node.
662    /// The `poolProps` and `accessDescs` values in the returned parameters are owned by the node.
663    /// This memory remains valid until the node is destroyed.
664    /// The returned parameters must not be modified.
665    ///
666    /// Graph objects are not threadsafe.
667    ///
668    /// # Errors
669    ///
670    /// Returns an error if this is not a memory-allocation node, CUDA cannot
671    /// query the parameters, a previous asynchronous launch reports an error,
672    /// or CUDA reports runtime initialization diagnostics.
673    pub fn mem_alloc_node_info(&self) -> Result<MemoryAllocationNodeInfo> {
674        self.bind_context()?;
675        let mut params = runtime::cudaMemAllocNodeParams::default();
676        unsafe {
677            try_ffi!(runtime::cudaGraphMemAllocNodeGetParams(
678                self.as_raw(),
679                &raw mut params,
680            ))?;
681        }
682        Ok(MemoryAllocationNodeInfo::from_raw(
683            unsafe { DevicePtr::new(params.dptr as _) },
684            params.bytesize as usize,
685            self.graph_id,
686            self.graph.clone(),
687            self.ctx.clone(),
688        ))
689    }
690
691    /// Returns the address of this memory free node.
692    ///
693    /// Graph objects are not threadsafe.
694    ///
695    /// # Errors
696    ///
697    /// Returns an error if this is not a memory-free node, CUDA cannot query the
698    /// pointer, a previous asynchronous launch reports an error, or CUDA reports
699    /// runtime initialization diagnostics.
700    ///
701    /// # Safety
702    ///
703    /// The node must still be a valid memory-free node in a live graph, and the
704    /// returned pointer must not be used after the graph frees it.
705    pub unsafe fn mem_free_node_ptr(&self) -> Result<DevicePtr> {
706        self.bind_context()?;
707        let mut ptr = ptr::null_mut();
708        unsafe {
709            try_ffi!(runtime::cudaGraphMemFreeNodeGetParams(
710                self.as_raw(),
711                &raw mut ptr as *mut _,
712            ))?;
713        }
714        Ok(unsafe { DevicePtr::new(ptr as _) })
715    }
716
717    /// Returns the requested kernel node attribute.
718    ///
719    /// # Errors
720    ///
721    /// Returns an error if this is not a kernel node, CUDA cannot query the
722    /// attribute, or a previous asynchronous launch reports an error.
723    pub fn kernel_node_attribute(
724        self,
725        id: GraphKernelNodeAttributeId,
726    ) -> Result<GraphKernelNodeAttribute> {
727        self.bind_context()?;
728        let mut value = runtime::cudaLaunchAttributeValue::default();
729        unsafe {
730            try_ffi!(runtime::cudaGraphKernelNodeGetAttribute(
731                self.as_raw(),
732                id.into(),
733                &raw mut value,
734            ))?;
735
736            Ok(match id {
737                GraphKernelNodeAttributeId::Cooperative => {
738                    GraphKernelNodeAttribute::Cooperative(*value.cooperative.as_ref() != 0)
739                }
740                GraphKernelNodeAttributeId::ClusterDimension => {
741                    let dim = value.clusterDim.as_ref();
742                    GraphKernelNodeAttribute::ClusterDimension(Dim3::new(dim.x, dim.y, dim.z))
743                }
744                GraphKernelNodeAttributeId::Priority => {
745                    GraphKernelNodeAttribute::Priority(*value.priority.as_ref())
746                }
747                GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout => {
748                    GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(
749                        *value.sharedMemCarveout.as_ref(),
750                    )
751                }
752            })
753        }
754    }
755
756    /// Sets a kernel node attribute.
757    ///
758    /// # Errors
759    ///
760    /// Returns an error if this is not a kernel node, CUDA rejects the
761    /// attribute update, or a previous asynchronous launch reports an error.
762    pub fn set_kernel_node_attribute(&mut self, attribute: GraphKernelNodeAttribute) -> Result<()> {
763        self.bind_context()?;
764        let (id, value) = match attribute {
765            GraphKernelNodeAttribute::Cooperative(value) => {
766                let mut attr = runtime::cudaLaunchAttributeValue {
767                    cooperative: runtime::__BindgenUnionField::new(),
768                    ..runtime::cudaLaunchAttributeValue::default()
769                };
770                unsafe { *attr.cooperative.as_mut() = i32::from(value) };
771                (GraphKernelNodeAttributeId::Cooperative, attr)
772            }
773            GraphKernelNodeAttribute::ClusterDimension(value) => {
774                let mut attr = runtime::cudaLaunchAttributeValue {
775                    clusterDim: runtime::__BindgenUnionField::new(),
776                    ..runtime::cudaLaunchAttributeValue::default()
777                };
778                unsafe {
779                    *attr.clusterDim.as_mut() = runtime::cudaLaunchAttributeValue__bindgen_ty_1 {
780                        x: value.x,
781                        y: value.y,
782                        z: value.z,
783                    };
784                }
785                (GraphKernelNodeAttributeId::ClusterDimension, attr)
786            }
787            GraphKernelNodeAttribute::Priority(value) => {
788                let mut attr = runtime::cudaLaunchAttributeValue {
789                    priority: runtime::__BindgenUnionField::new(),
790                    ..runtime::cudaLaunchAttributeValue::default()
791                };
792                unsafe { *attr.priority.as_mut() = value as _ };
793                (GraphKernelNodeAttributeId::Priority, attr)
794            }
795            GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(value) => {
796                let mut attr = runtime::cudaLaunchAttributeValue {
797                    sharedMemCarveout: runtime::__BindgenUnionField::new(),
798                    ..runtime::cudaLaunchAttributeValue::default()
799                };
800                unsafe { *attr.sharedMemCarveout.as_mut() = value };
801                (
802                    GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout,
803                    attr,
804                )
805            }
806        };
807
808        unsafe {
809            try_ffi!(runtime::cudaGraphKernelNodeSetAttribute(
810                self.as_raw(),
811                id.into(),
812                &raw const value,
813            ))?;
814        }
815        Ok(())
816    }
817
818    /// Copies attributes from `src` to this node.
819    /// Both nodes must have the same context.
820    ///
821    /// # Errors
822    ///
823    /// Returns an error if CUDA rejects the attribute copy or if a previous asynchronous launch
824    /// reported an error.
825    pub fn copy_kernel_node_attributes(self, other: Self) -> Result<()> {
826        if let (Some(ctx), Some(other_ctx)) = (self.context(), other.context())
827            && ctx != other_ctx
828        {
829            return Err(Error::GraphContextMismatch);
830        }
831        self.bind_context()?;
832        other.bind_context()?;
833        unsafe {
834            try_ffi!(runtime::cudaGraphKernelNodeCopyAttributes(
835                self.as_raw(),
836                other.handle
837            ))?;
838        }
839        Ok(())
840    }
841
842    pub const fn as_raw(&self) -> runtime::cudaGraphNode_t {
843        self.handle
844    }
845
846    pub(crate) fn graph_raw(&self) -> Option<runtime::cudaGraph_t> {
847        self.graph.as_ref().map(|graph| graph.handle)
848    }
849
850    pub fn context(&self) -> Option<&Context> {
851        self.ctx.as_deref()
852    }
853}
854
855impl MemoryAllocationNodeInfo {
856    pub const fn ptr(&self) -> DevicePtr {
857        self.ptr
858    }
859
860    pub fn context(&self) -> Option<&Context> {
861        self.ctx.as_deref()
862    }
863
864    fn from_raw_in_graph(
865        ptr: DevicePtr,
866        byte_size: usize,
867        graph_id: GraphId,
868        graph: Arc<GraphInner>,
869        ctx: Option<Arc<Context>>,
870    ) -> Self {
871        Self::from_raw(ptr, byte_size, Some(graph_id), Some(graph), ctx)
872    }
873
874    fn from_raw(
875        ptr: DevicePtr,
876        byte_size: usize,
877        graph_id: Option<GraphId>,
878        graph: Option<Arc<GraphInner>>,
879        ctx: Option<Arc<Context>>,
880    ) -> Self {
881        Self {
882            ptr,
883            byte_size,
884            graph_id,
885            _graph: graph,
886            ctx,
887        }
888    }
889}
890
891impl PartialEq for MemoryAllocationNodeInfo {
892    fn eq(&self, other: &Self) -> bool {
893        self.ptr == other.ptr
894            && self.byte_size == other.byte_size
895            && self.graph_id == other.graph_id
896    }
897}
898
899impl Eq for MemoryAllocationNodeInfo {}
900
901impl Hash for MemoryAllocationNodeInfo {
902    fn hash<H: Hasher>(&self, state: &mut H) {
903        self.ptr.hash(state);
904        self.byte_size.hash(state);
905        self.graph_id.hash(state);
906    }
907}
908
909impl GraphTopologySummary {
910    fn record_node_type(&mut self, node_type: GraphNodeType) {
911        match node_type {
912            GraphNodeType::Kernel => self.kernel_nodes += 1,
913            GraphNodeType::Memcpy => self.memcpy_nodes += 1,
914            GraphNodeType::Memset => self.memset_nodes += 1,
915            GraphNodeType::Host => self.host_nodes += 1,
916            GraphNodeType::Graph => self.child_graph_nodes += 1,
917            GraphNodeType::Empty => self.empty_nodes += 1,
918            GraphNodeType::WaitEvent => self.wait_event_nodes += 1,
919            GraphNodeType::EventRecord => self.event_record_nodes += 1,
920            GraphNodeType::ExternalSemaphoresSignal => {
921                self.external_semaphores_signal_nodes += 1;
922            }
923            GraphNodeType::ExternalSemaphoresWait => {
924                self.external_semaphores_wait_nodes += 1;
925            }
926            GraphNodeType::MemoryAlloc => self.memory_alloc_nodes += 1,
927            GraphNodeType::MemoryFree => self.memory_free_nodes += 1,
928            GraphNodeType::BatchMemOp => self.batch_mem_op_nodes += 1,
929            GraphNodeType::Conditional => self.conditional_nodes += 1,
930        }
931    }
932}
933
934#[derive(Debug)]
935pub struct Graph {
936    inner: Arc<GraphInner>,
937    id: GraphId,
938    ctx: Option<Arc<Context>>,
939    retained: Vec<RetainedAllocation>,
940}
941
/// Opaque numeric identifier attached to a graph wrapper.
///
/// Identifiers are compared by value and can be used as hash keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphId(u64);
944
945#[derive(Debug)]
946pub struct RawGraph {
947    inner: Arc<GraphInner>,
948}
949
950#[derive(Debug)]
951struct GraphInner {
952    handle: runtime::cudaGraph_t,
953    owns_handle: bool,
954}
955
956// CUDA graph handles can be retained and destroyed from any host thread after
957// binding the associated context. Mutating graph APIs require `&mut Graph`.
958unsafe impl Send for GraphInner {}
959unsafe impl Sync for GraphInner {}
960
/// Type-erased shared handle that keeps an allocation alive.
///
/// Cloning bumps the reference count of the underlying `Arc`.
#[derive(Clone)]
struct RetainedAllocation(Arc<dyn Any + Send + Sync>);

/// Prints only the current strong reference count, since the erased payload has no
/// `Debug` bound.
impl fmt::Debug for RetainedAllocation {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let strong_count = Arc::strong_count(&self.0);
        let mut out = f.debug_struct("RetainedAllocation");
        out.field("strong_count", &strong_count);
        out.finish()
    }
}
971
972#[derive(Debug)]
973pub struct BorrowedGraph<'node> {
974    graph: Graph,
975    _node: PhantomData<&'node GraphNode>,
976}
977
978/// Device memory whose allocation is retained by CUDA graph objects.
979///
980/// `GraphBuffer` values are created through [`Graph::create_buffer`],
981/// [`Graph::zeroes_buffer`], or [`Graph::buffer_from_slice`]. Graph and
982/// executable-graph APIs that accept `GraphBuffer` retain the underlying
983/// allocation so graph replay cannot outlive the device pointers baked into
984/// CUDA graph nodes.
985#[derive(Debug)]
986pub struct GraphBuffer<T: DeviceRepr> {
987    memory: Arc<DeviceMemory<T>>,
988    ctx: Option<Arc<Context>>,
989}
990
991impl<T> GraphBuffer<T>
992where
993    T: DeviceRepr + Send + Sync,
994{
995    fn from_memory(memory: DeviceMemory<T>, ctx: Option<Arc<Context>>) -> Self {
996        Self {
997            memory: Arc::new(memory),
998            ctx,
999        }
1000    }
1001
1002    fn retained(&self) -> RetainedAllocation {
1003        let memory: Arc<DeviceMemory<T>> = Arc::clone(&self.memory);
1004        RetainedAllocation(memory)
1005    }
1006
1007    pub fn len(&self) -> usize {
1008        self.memory.len()
1009    }
1010
1011    pub fn is_empty(&self) -> bool {
1012        self.memory.is_empty()
1013    }
1014
1015    pub fn byte_len(&self) -> usize {
1016        self.memory.byte_len()
1017    }
1018
1019    pub fn context(&self) -> Option<&Context> {
1020        self.ctx.as_deref()
1021    }
1022
1023    pub fn as_ptr(&self) -> *const T {
1024        self.memory.as_ptr()
1025    }
1026
1027    pub fn as_mut_ptr(&mut self) -> *mut T {
1028        self.memory.as_mut_ptr()
1029    }
1030
1031    /// Copies a host slice into this graph-retained device buffer.
1032    ///
1033    /// This updates the stable allocation used by graph-buffer node APIs. The
1034    /// caller is still responsible for ordering this copy against graph launches
1035    /// that read or write the same allocation.
1036    ///
1037    /// # Errors
1038    ///
1039    /// Returns an error if `host_slice` does not have the same length as this
1040    /// buffer or if CUDA rejects the copy.
1041    pub fn copy_from_host(&mut self, host_slice: &[T]) -> Result<()> {
1042        if let Some(ctx) = &self.ctx {
1043            ctx.bind()?;
1044        }
1045        if host_slice.len() != self.len() {
1046            return Err(Error::InvalidMemoryAccess);
1047        }
1048        if self.is_empty() {
1049            return Ok(());
1050        }
1051        unsafe {
1052            DeviceMemory::<T>::copy(
1053                self.as_mut_ptr(),
1054                host_slice.as_ptr(),
1055                self.len(),
1056                MemoryCopyKind::HostToDevice,
1057            )
1058        }
1059    }
1060
1061    /// Copies this graph-retained device buffer into a host slice.
1062    ///
1063    /// # Errors
1064    ///
1065    /// Returns an error if `host_slice` does not have the same length as this
1066    /// buffer or if CUDA rejects the copy.
1067    pub fn copy_to_host(&self, host_slice: &mut [T]) -> Result<()> {
1068        if let Some(ctx) = &self.ctx {
1069            ctx.bind()?;
1070        }
1071        if host_slice.len() != self.len() {
1072            return Err(Error::InvalidMemoryAccess);
1073        }
1074        if self.is_empty() {
1075            return Ok(());
1076        }
1077        unsafe {
1078            DeviceMemory::<T>::copy(
1079                host_slice.as_mut_ptr(),
1080                self.as_ptr(),
1081                self.len(),
1082                MemoryCopyKind::DeviceToHost,
1083            )
1084        }
1085    }
1086
1087    /// Copies another graph-retained buffer into this buffer.
1088    ///
1089    /// # Errors
1090    ///
1091    /// Returns an error if the buffers have different lengths or if CUDA
1092    /// rejects the copy.
1093    pub fn copy_from_buffer(&mut self, src: &Self) -> Result<()> {
1094        if let (Some(dst_ctx), Some(src_ctx)) = (&self.ctx, &src.ctx)
1095            && dst_ctx.as_ref() != src_ctx.as_ref()
1096        {
1097            return Err(Error::GraphContextMismatch);
1098        }
1099        if let Some(ctx) = &self.ctx {
1100            ctx.bind()?;
1101        }
1102        if src.len() != self.len() {
1103            return Err(Error::InvalidMemoryAccess);
1104        }
1105        if self.is_empty() {
1106            return Ok(());
1107        }
1108        unsafe {
1109            DeviceMemory::<T>::copy(
1110                self.as_mut_ptr(),
1111                src.as_ptr(),
1112                self.len(),
1113                MemoryCopyKind::DeviceToDevice,
1114            )
1115        }
1116    }
1117
1118    pub fn copy_to_host_vec(&self) -> Result<Vec<T>> {
1119        if let Some(ctx) = &self.ctx {
1120            ctx.bind()?;
1121        }
1122        if self.is_empty() {
1123            return Ok(Vec::new());
1124        }
1125
1126        let mut host = Vec::<T>::with_capacity(self.len());
1127        unsafe {
1128            DeviceMemory::<T>::copy(
1129                host.as_mut_ptr(),
1130                self.as_ptr(),
1131                self.len(),
1132                MemoryCopyKind::DeviceToHost,
1133            )?;
1134            host.set_len(self.len());
1135        }
1136        Ok(host)
1137    }
1138}
1139
1140impl RawGraph {
1141    /// Wraps an existing CUDA graph handle and takes ownership of it.
1142    ///
1143    /// # Safety
1144    ///
1145    /// `handle` must be a valid CUDA graph handle. Ownership of `handle` is
1146    /// transferred to the returned [`RawGraph`], and the handle must not be
1147    /// destroyed elsewhere after calling this function.
1148    pub unsafe fn from_raw(handle: runtime::cudaGraph_t) -> Result<Self> {
1149        if handle.is_null() {
1150            return Err(Error::NullHandle);
1151        }
1152
1153        Ok(Self {
1154            inner: Arc::new(GraphInner {
1155                handle,
1156                owns_handle: true,
1157            }),
1158        })
1159    }
1160
1161    /// Creates an empty raw graph without a Singe context association.
1162    ///
1163    /// Prefer [`Context::create_graph`] for ordinary Singe code. Raw graphs do
1164    /// not model context association, so the caller must keep CUDA context,
1165    /// node, executable update, upload, and launch relationships coherent.
1166    ///
1167    /// # Safety
1168    ///
1169    /// The returned graph has no modeled CUDA context association. The caller
1170    /// must ensure every node, kernel, memory operand, child graph, executable
1171    /// update, upload, and launch is used with the correct CUDA context.
1172    pub unsafe fn create() -> Result<Self> {
1173        let mut handle = ptr::null_mut();
1174        unsafe {
1175            try_ffi!(runtime::cudaGraphCreate(&raw mut handle, 0))?;
1176        }
1177        unsafe { Self::from_raw(handle) }
1178    }
1179
1180    pub fn as_raw(&self) -> runtime::cudaGraph_t {
1181        self.inner.handle
1182    }
1183
1184    /// Consumes the graph and returns the raw CUDA graph handle without
1185    /// destroying it.
1186    ///
1187    /// The caller becomes responsible for eventually destroying the returned
1188    /// handle with CUDA.
1189    pub fn into_raw(self) -> runtime::cudaGraph_t {
1190        let inner = Arc::try_unwrap(self.inner)
1191            .unwrap_or_else(|_| panic!("cannot take raw graph handle while it is still shared"));
1192        let inner = ManuallyDrop::new(inner);
1193        inner.handle
1194    }
1195}
1196
1197static NEXT_GRAPH_ID: AtomicU64 = AtomicU64::new(1);
1198
1199impl GraphId {
1200    pub fn generate() -> Self {
1201        Self(NEXT_GRAPH_ID.fetch_add(1, Ordering::Relaxed))
1202    }
1203
1204    pub fn as_u64(self) -> u64 {
1205        self.0
1206    }
1207}
1208
1209impl Display for GraphId {
1210    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1211        self.0.fmt(f)
1212    }
1213}
1214
1215impl Graph {
1216    fn bind_context(&self) -> Result<()> {
1217        if let Some(ctx) = &self.ctx {
1218            ctx.bind()?;
1219        }
1220        Ok(())
1221    }
1222
1223    /// Wraps an existing CUDA graph handle associated with `ctx` and takes
1224    /// ownership of it.
1225    ///
1226    /// # Safety
1227    ///
1228    /// `handle` must be a valid CUDA graph handle associated with `ctx`.
1229    /// Ownership of `handle` is transferred to the returned [`Graph`], and the
1230    /// handle must not be destroyed elsewhere after calling this function.
1231    pub unsafe fn from_raw_in_context(
1232        handle: runtime::cudaGraph_t,
1233        ctx: Arc<Context>,
1234    ) -> Result<Self> {
1235        if handle.is_null() {
1236            return Err(Error::NullHandle);
1237        }
1238
1239        Ok(Self {
1240            inner: Arc::new(GraphInner {
1241                handle,
1242                owns_handle: true,
1243            }),
1244            id: GraphId::generate(),
1245            ctx: Some(ctx),
1246            retained: Vec::new(),
1247        })
1248    }
1249
1250    unsafe fn from_raw_borrowed_in_context(
1251        handle: runtime::cudaGraph_t,
1252        ctx: Option<Arc<Context>>,
1253    ) -> Self {
1254        Self {
1255            inner: Arc::new(GraphInner {
1256                handle,
1257                owns_handle: false,
1258            }),
1259            id: GraphId::generate(),
1260            ctx,
1261            retained: Vec::new(),
1262        }
1263    }
1264
1265    pub(crate) fn create_in_context(ctx: Arc<Context>) -> Result<Self> {
1266        ctx.bind()?;
1267        let mut handle = ptr::null_mut();
1268        unsafe {
1269            try_ffi!(runtime::cudaGraphCreate(&raw mut handle, 0))?;
1270        }
1271        Ok(Self {
1272            inner: Arc::new(GraphInner {
1273                handle,
1274                owns_handle: true,
1275            }),
1276            id: GraphId::generate(),
1277            ctx: Some(ctx),
1278            retained: Vec::new(),
1279        })
1280    }
1281
1282    fn retain_buffer<T>(&mut self, buffer: &GraphBuffer<T>)
1283    where
1284        T: DeviceRepr + Send + Sync,
1285    {
1286        self.retained.push(buffer.retained());
1287    }
1288
1289    fn check_buffer_context<T>(&self, buffer: &GraphBuffer<T>) -> Result<()>
1290    where
1291        T: DeviceRepr + Send + Sync,
1292    {
1293        if let (Some(graph_ctx), Some(buffer_ctx)) = (&self.ctx, buffer.context())
1294            && graph_ctx.as_ref() != buffer_ctx
1295        {
1296            return Err(Error::GraphContextMismatch);
1297        }
1298        Ok(())
1299    }
1300
1301    fn check_buffer_contexts<T>(&self, dst: &GraphBuffer<T>, src: &GraphBuffer<T>) -> Result<()>
1302    where
1303        T: DeviceRepr + Send + Sync,
1304    {
1305        self.check_buffer_context(dst)?;
1306        self.check_buffer_context(src)?;
1307        Ok(())
1308    }
1309
1310    /// Allocates graph-retained device memory.
1311    ///
1312    /// The returned buffer can be used with graph-buffer node APIs. Any graph or
1313    /// executable graph that records the buffer retains the underlying device
1314    /// allocation for replay.
1315    ///
1316    /// # Errors
1317    ///
1318    /// Returns an error if CUDA cannot allocate device memory, the requested
1319    /// byte count overflows, or CUDA reports runtime initialization diagnostics.
1320    pub fn create_buffer<T>(&mut self, length: usize) -> Result<GraphBuffer<T>>
1321    where
1322        T: DeviceRepr + Send + Sync,
1323    {
1324        self.bind_context()?;
1325        let buffer = GraphBuffer::from_memory(DeviceMemory::create(length)?, self.ctx.clone());
1326        self.retain_buffer(&buffer);
1327        Ok(buffer)
1328    }
1329
1330    /// Allocates graph-retained device memory initialized to zero bytes.
1331    ///
1332    /// # Errors
1333    ///
1334    /// Returns an error if CUDA cannot allocate or initialize device memory, the
1335    /// requested byte count overflows, or CUDA reports runtime initialization
1336    /// diagnostics.
1337    pub fn zeroes_buffer<T>(&mut self, length: usize) -> Result<GraphBuffer<T>>
1338    where
1339        T: DeviceRepr + Send + Sync,
1340    {
1341        self.bind_context()?;
1342        let buffer = GraphBuffer::from_memory(DeviceMemory::zeroes(length)?, self.ctx.clone());
1343        self.retain_buffer(&buffer);
1344        Ok(buffer)
1345    }
1346
1347    /// Allocates graph-retained device memory initialized from a host slice.
1348    ///
1349    /// # Errors
1350    ///
1351    /// Returns an error if CUDA cannot allocate or copy device memory, the
1352    /// requested byte count overflows, or CUDA reports runtime initialization
1353    /// diagnostics.
1354    pub fn buffer_from_slice<T>(&mut self, values: &[T]) -> Result<GraphBuffer<T>>
1355    where
1356        T: DeviceRepr + Send + Sync,
1357    {
1358        self.bind_context()?;
1359        let buffer = GraphBuffer::from_memory(DeviceMemory::from_slice(values)?, self.ctx.clone());
1360        self.retain_buffer(&buffer);
1361        Ok(buffer)
1362    }
1363
1364    pub fn instantiate(&self) -> Result<ExecutableGraph> {
1365        self.instantiate_with_flags(GraphInstantiateFlags::empty())
1366    }
1367
1368    /// Instantiates graph as an executable graph.
1369    /// The graph is validated for any structural constraints or intra-node constraints which were not previously validated.
1370    /// If instantiation is successful, returns an instantiated executable graph.
1371    ///
1372    /// `flags` controls the behavior of instantiation and subsequent graph launches.
1373    /// Valid flags are:
1374    ///
1375    /// * [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`], which configures a graph containing memory allocation nodes to automatically free any unfreed memory allocations before
1376    ///   the graph is relaunched.
1377    ///
1378    /// * [`GraphInstantiateFlags::DEVICE_LAUNCH`], which configures the graph for launch from the device.
1379    ///   If this flag is passed, the executable graph handle returned can
1380    ///   be used to launch the graph from both the host and device.
1381    ///   This flag can only be used on platforms which support unified addressing.
1382    ///   This flag cannot be used in conjunction with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`].
1383    ///
1384    /// * [`GraphInstantiateFlags::USE_NODE_PRIORITY`], which causes the graph to use the priorities from the per-node attributes rather than the priority of the launch stream
1385    ///   during execution.
1386    ///   Priorities are only available on kernel nodes and are copied from stream priority during stream capture.
1387    ///
1388    /// If the graph contains any allocation or free nodes, there can be at most one executable graph in existence for that graph at a time.
1389    /// An attempt to instantiate a second executable graph before dropping the first results in an error.
1390    /// The same also applies if the graph contains any device-updatable kernel nodes.
1391    ///
1392    /// If the graph contains kernels which call device-side [`ExecutableGraph::launch`] from multiple devices, this results in an error.
1393    ///
1394    /// Graphs instantiated for launch on the device have additional restrictions which do not apply to host graphs:
1395    ///
1396    /// * The graph's nodes must reside on a single device.
1397    /// * The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
1398    /// * The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
1399    ///   Operation-specific restrictions are
1400    ///   outlined below.
1401    /// * Kernel nodes:
1402    ///   + Use of CUDA Dynamic Parallelism is not permitted.
1403    ///   + Cooperative launches are permitted as long as MPS is not in use.
1404    /// * Memcpy nodes:
1405    ///   + Only copies involving device memory and/or pinned device-mapped host memory are permitted.
1406    ///   + Copies involving CUDA arrays are not permitted.
1407    ///   + Both operands must be accessible from the current device, and the current device must match the device of other nodes in the
1408    ///     graph.
1409    ///
1410    /// Graph objects are not threadsafe.
1411    ///
1412    /// # Errors
1413    ///
1414    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1415    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1416    /// not call CUDA functions; see [`Stream::add_callback`].
1417    pub fn instantiate_with_flags(&self, flags: GraphInstantiateFlags) -> Result<ExecutableGraph> {
1418        self.bind_context()?;
1419        let mut handle = ptr::null_mut();
1420        unsafe {
1421            try_ffi!(runtime::cudaGraphInstantiateWithFlags(
1422                &raw mut handle,
1423                self.as_raw(),
1424                flags.bits(),
1425            ))?;
1426        }
1427        unsafe {
1428            ExecutableGraph::from_raw_with_graph(
1429                handle,
1430                self.ctx.clone(),
1431                Some(self.id),
1432                Some(Arc::clone(&self.inner)),
1433                self.retained.clone(),
1434            )
1435        }
1436    }
1437
1438    /// Creates a copy of `original_graph`.
1439    /// All parameters are copied into the cloned graph.
1440    /// The original graph may be modified after this call without affecting the clone.
1441    ///
1442    /// Child graph nodes in the original graph are recursively copied into the clone.
1443    ///
1444    /// Cloning is not supported for graphs that contain memory allocation nodes, memory free nodes, or conditional nodes.
1445    ///
1446    /// Graph objects are not threadsafe.
1447    ///
1448    /// # Errors
1449    ///
1450    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1451    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1452    /// not call CUDA functions; see [`Stream::add_callback`].
1453    pub fn try_clone(&self) -> Result<Self> {
1454        self.bind_context()?;
1455        let mut handle = ptr::null_mut();
1456        unsafe {
1457            try_ffi!(runtime::cudaGraphClone(&raw mut handle, self.as_raw()))?;
1458        }
1459        Ok(Self {
1460            inner: Arc::new(GraphInner {
1461                handle,
1462                owns_handle: true,
1463            }),
1464            id: GraphId::generate(),
1465            ctx: self.ctx.clone(),
1466            retained: self.retained.clone(),
1467        })
1468    }
1469
1470    fn node_from_raw(&self, handle: runtime::cudaGraphNode_t) -> GraphNode {
1471        GraphNode::from_raw_in_graph(handle, self.id, Arc::clone(&self.inner), self.ctx.clone())
1472    }
1473
1474    pub(crate) fn check_node(&self, node: &GraphNode) -> Result<()> {
1475        self.bind_context()?;
1476        if !matches!(node.graph_id, Some(id) if id == self.id) {
1477            return Err(Error::GraphNodeMismatch);
1478        }
1479        Ok(())
1480    }
1481
1482    pub(crate) fn check_nodes(&self, nodes: &[GraphNode]) -> Result<()> {
1483        self.bind_context()?;
1484        for node in nodes {
1485            if !matches!(node.graph_id, Some(id) if id == self.id) {
1486                return Err(Error::GraphNodeMismatch);
1487            }
1488        }
1489        Ok(())
1490    }
1491
1492    fn check_child_graph_context(&self, child_graph: &Graph) -> Result<()> {
1493        if let (Some(parent_ctx), Some(child_ctx)) = (&self.ctx, &child_graph.ctx)
1494            && parent_ctx.as_ref() != child_ctx.as_ref()
1495        {
1496            return Err(Error::GraphContextMismatch);
1497        }
1498        Ok(())
1499    }
1500
1501    fn check_event_record_context(&self, event: &Event) -> Result<()> {
1502        if let Some(ctx) = &self.ctx
1503            && ctx.as_ref() != event.context()
1504        {
1505            return Err(Error::GraphContextMismatch);
1506        }
1507        Ok(())
1508    }
1509
1510    pub fn add_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
1511        self.add_dependencies(&[from], &[to])
1512    }
1513
1514    pub fn add_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
1515        self.add_dependencies_with_data(from, to, &[])
1516    }
1517
1518    /// Elements in `from` and `to` at corresponding indices define each dependency to add.
1519    /// Each node in `from` and `to` must belong to this graph.
1520    ///
1521    /// If `from` and `to` are empty, the call returns without modifying the graph.
1522    /// Specifying an existing dependency returns an error.
1523    ///
1524    /// Graph objects are not threadsafe.
1525    ///
1526    /// # Errors
1527    ///
1528    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1529    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1530    /// not call CUDA functions; see [`Stream::add_callback`].
1531    pub fn add_dependencies_with_data(
1532        &mut self,
1533        from: &[GraphNode],
1534        to: &[GraphNode],
1535        edge_data: &[GraphEdgeData],
1536    ) -> Result<()> {
1537        if from.len() != to.len() {
1538            return Err(Error::GraphDependencyMismatch);
1539        }
1540        if !edge_data.is_empty() && edge_data.len() != from.len() {
1541            return Err(Error::GraphDependencyMismatch);
1542        }
1543        if from.is_empty() {
1544            return Ok(());
1545        }
1546        self.check_nodes(from)?;
1547        self.check_nodes(to)?;
1548
1549        let from_raw: Vec<_> = from.iter().map(GraphNode::as_raw).collect();
1550        let to_raw: Vec<_> = to.iter().map(GraphNode::as_raw).collect();
1551        let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
1552        unsafe {
1553            try_ffi!(runtime::cudaGraphAddDependencies(
1554                self.as_raw(),
1555                from_raw.as_ptr(),
1556                to_raw.as_ptr(),
1557                if edge_data_raw.is_empty() {
1558                    ptr::null()
1559                } else {
1560                    edge_data_raw.as_ptr()
1561                },
1562                from_raw.len() as _,
1563            ))?;
1564        }
1565        Ok(())
1566    }
1567
1568    pub fn remove_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
1569        self.remove_dependencies(&[from], &[to])
1570    }
1571
1572    pub fn remove_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
1573        self.remove_dependencies_with_data(from, to, &[])
1574    }
1575
1576    /// Elements in `from` and `to` at corresponding indices define each dependency to remove.
1577    /// Each node in `from` and `to` must belong to this graph.
1578    ///
1579    /// If `from` and `to` are empty, the call returns without modifying the graph.
1580    /// Specifying an edge that does not exist in the graph, with data matching `edge_data`, results in an error.
1581    /// Passing an empty `edge_data` slice is equivalent to passing default edge data for each edge.
1582    ///
1583    /// Graph objects are not threadsafe.
1584    ///
1585    /// # Errors
1586    ///
1587    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1588    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1589    /// not call CUDA functions; see [`Stream::add_callback`].
1590    pub fn remove_dependencies_with_data(
1591        &mut self,
1592        from: &[GraphNode],
1593        to: &[GraphNode],
1594        edge_data: &[GraphEdgeData],
1595    ) -> Result<()> {
1596        if from.len() != to.len() {
1597            return Err(Error::GraphDependencyMismatch);
1598        }
1599        if !edge_data.is_empty() && edge_data.len() != from.len() {
1600            return Err(Error::GraphDependencyMismatch);
1601        }
1602        if from.is_empty() {
1603            return Ok(());
1604        }
1605        self.check_nodes(from)?;
1606        self.check_nodes(to)?;
1607
1608        let from_raw: Vec<_> = from.iter().map(GraphNode::as_raw).collect();
1609        let to_raw: Vec<_> = to.iter().map(GraphNode::as_raw).collect();
1610        let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
1611        unsafe {
1612            try_ffi!(runtime::cudaGraphRemoveDependencies(
1613                self.as_raw(),
1614                from_raw.as_ptr(),
1615                to_raw.as_ptr(),
1616                if edge_data_raw.is_empty() {
1617                    ptr::null()
1618                } else {
1619                    edge_data_raw.as_ptr()
1620                },
1621                from_raw.len() as _,
1622            ))?;
1623        }
1624        Ok(())
1625    }
1626
1627    pub fn add_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
1628        if edges.is_empty() {
1629            return Ok(());
1630        }
1631
1632        let from: Vec<_> = edges.iter().map(|edge| edge.from.clone()).collect();
1633        let to: Vec<_> = edges.iter().map(|edge| edge.to.clone()).collect();
1634        let data: Vec<_> = edges.iter().map(|edge| edge.data).collect();
1635        self.add_dependencies_with_data(&from, &to, &data)
1636    }
1637
1638    pub fn remove_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
1639        if edges.is_empty() {
1640            return Ok(());
1641        }
1642
1643        let from: Vec<_> = edges.iter().map(|edge| edge.from.clone()).collect();
1644        let to: Vec<_> = edges.iter().map(|edge| edge.to.clone()).collect();
1645        let data: Vec<_> = edges.iter().map(|edge| edge.data).collect();
1646        self.remove_dependencies_with_data(&from, &to, &data)
1647    }
1648
1649    /// Creates a node that performs no operation and adds it to the graph with the given dependencies.
1650    /// The dependency list may be empty, in which case the node is placed at the
1651    /// graph root. It may not contain duplicate entries.
1652    ///
1653    /// An empty node performs no operation during execution, but can be used for transitive ordering.
1654    /// For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2\*n dependency edges, rather than no empty node and n^2 dependency edges.
1655    ///
1656    /// Graph objects are not threadsafe.
1657    ///
1658    /// # Errors
1659    ///
1660    /// Returns an error if CUDA rejects the graph operation or reports runtime initialization
1661    /// diagnostics. Callbacks must not call CUDA functions; see [`Stream::add_callback`].
1662    pub fn add_empty_node(&mut self, dependencies: &[GraphNode]) -> Result<GraphNode> {
1663        self.check_nodes(dependencies)?;
1664        let mut handle = ptr::null_mut();
1665        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1666        unsafe {
1667            try_ffi!(runtime::cudaGraphAddEmptyNode(
1668                &raw mut handle,
1669                self.as_raw(),
1670                dependencies_raw.as_ptr(),
1671                dependencies_raw.len() as _,
1672            ))?;
1673            Ok(self.node_from_raw(handle))
1674        }
1675    }
1676
1677    /// Creates an event record node and adds it to the graph with the given dependencies and event.
1678    /// The dependency list may be empty, in which case the node is placed at the
1679    /// graph root. It may not contain duplicate entries.
1680    ///
1681    /// Each graph launch records `event` to capture execution of the node's dependencies.
1682    ///
1683    /// These nodes may not be used in loops or conditionals.
1684    ///
1685    /// Graph objects are not threadsafe.
1686    ///
1687    /// # Errors
1688    ///
1689    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1690    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1691    /// not call CUDA functions; see [`Stream::add_callback`].
1692    pub fn add_event_record_node(
1693        &mut self,
1694        dependencies: &[GraphNode],
1695        event: &Event,
1696    ) -> Result<GraphNode> {
1697        self.check_nodes(dependencies)?;
1698        self.check_event_record_context(event)?;
1699        let mut handle = ptr::null_mut();
1700        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1701        unsafe {
1702            try_ffi!(runtime::cudaGraphAddEventRecordNode(
1703                &raw mut handle,
1704                self.as_raw(),
1705                dependencies_raw.as_ptr(),
1706                dependencies_raw.len() as _,
1707                event.as_raw(),
1708            ))?;
1709            Ok(self.node_from_raw(handle))
1710        }
1711    }
1712
1713    /// Creates an event wait node and adds it to the graph with the given dependencies and event.
1714    /// The dependency list may be empty, in which case the node is placed at the
1715    /// graph root. It may not contain duplicate entries.
1716    ///
1717    /// The graph node waits for all work captured in `event`.
1718    /// See [`sys::cuEventRecord`](singe_cuda_sys::driver::cuEventRecord) for details on what is captured by an event.
1719    /// Synchronization is performed efficiently on the device when applicable.
1720    /// `event` may come from a different context or device than the launch stream.
1721    ///
1722    /// These nodes may not be used in loops or conditionals.
1723    ///
1724    /// Graph objects are not threadsafe.
1725    ///
1726    /// # Errors
1727    ///
1728    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1729    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1730    /// not call CUDA functions; see [`Stream::add_callback`].
1731    pub fn add_event_wait_node(
1732        &mut self,
1733        dependencies: &[GraphNode],
1734        event: &Event,
1735    ) -> Result<GraphNode> {
1736        self.check_nodes(dependencies)?;
1737        let mut handle = ptr::null_mut();
1738        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1739        unsafe {
1740            try_ffi!(runtime::cudaGraphAddEventWaitNode(
1741                &raw mut handle,
1742                self.as_raw(),
1743                dependencies_raw.as_ptr(),
1744                dependencies_raw.len() as _,
1745                event.as_raw(),
1746            ))?;
1747            Ok(self.node_from_raw(handle))
1748        }
1749    }
1750
1751    /// Creates a CPU execution node and adds it to the graph with the given dependencies and host-node parameters.
1752    /// The dependency list may be empty, in which case the node is placed at the
1753    /// graph root. It may not contain duplicate entries.
1754    ///
1755    /// When the graph is launched, the node invokes the specified CPU function.
1756    /// Host nodes are not supported under MPS with pre-Volta GPUs.
1757    ///
1758    /// Graph objects are not threadsafe.
1759    ///
1760    /// # Safety
1761    ///
1762    /// CUDA stores the raw callback function and user-data pointer in the graph
1763    /// node for later replay. The caller must ensure `params` remains valid
1764    /// according to [`HostNodeParams::new`] for every graph instantiation and
1765    /// launch that can execute this node.
1766    ///
1767    /// # Errors
1768    ///
1769    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1770    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1771    /// not call CUDA functions; see [`Stream::add_callback`].
1772    pub unsafe fn add_host_node(
1773        &mut self,
1774        dependencies: &[GraphNode],
1775        params: &HostNodeParams,
1776    ) -> Result<GraphNode> {
1777        self.check_nodes(dependencies)?;
1778        let mut handle = ptr::null_mut();
1779        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1780        let params = params.into();
1781        unsafe {
1782            try_ffi!(runtime::cudaGraphAddHostNode(
1783                &raw mut handle,
1784                self.as_raw(),
1785                dependencies_raw.as_ptr(),
1786                dependencies_raw.len() as _,
1787                &raw const params,
1788            ))?;
1789            Ok(self.node_from_raw(handle))
1790        }
1791    }
1792
1793    /// Creates a kernel execution node and adds it to the graph with the given dependencies, launch configuration, and kernel parameters.
1794    /// The dependency list may be empty, in which case the node is placed at the
1795    /// graph root. It may not contain duplicate entries.
1796    ///
1797    /// When the graph is launched, the node invokes the kernel on the grid and blocks specified by [`LaunchConfig`].
1798    /// [`LaunchConfig::shared_memory_bytes`](crate::module::LaunchConfig::shared_memory_bytes) sets the amount of dynamic shared memory available to each thread block.
1799    /// Kernel parameters are passed with [`KernelParameters`](crate::module::KernelParameters) or tuples of shared or mutable references.
1800    ///
1801    /// Kernels launched using graphs must not use texture and surface references.
1802    /// Reading or writing through any texture or surface reference is undefined behavior.
1803    /// This restriction does not apply to texture and surface objects.
1804    ///
1805    /// Runtime kernel handles queried via [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) may be used.
1806    /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must be registered with the same CUDA Runtime instance.
1807    /// Passing a symbol that belongs to a different runtime instance results in undefined behavior.
1808    ///
1809    /// Graph objects are not threadsafe.
1810    ///
1811    /// # Safety
1812    ///
1813    /// CUDA copies the kernel argument values during this call and stores those
1814    /// copied values in the graph node for later replay. If an argument value is
1815    /// itself a pointer, only the pointer address is copied. The caller must
1816    /// ensure every copied pointer value remains valid for every graph
1817    /// instantiation, update, and launch that can execute this node. Mutable
1818    /// pointer arguments must also remain exclusive for the work ordered by
1819    /// those launches.
1820    ///
1821    /// # Errors
1822    ///
1823    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1824    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1825    /// not call CUDA functions; see [`Stream::add_callback`].
1826    pub unsafe fn add_kernel_node<'a, P>(
1827        &mut self,
1828        dependencies: &[GraphNode],
1829        function: DeviceFunction,
1830        config: &LaunchConfig,
1831        params: P,
1832    ) -> Result<GraphNode>
1833    where
1834        P: KernelLaunchArgs<'a>,
1835    {
1836        self.check_nodes(dependencies)?;
1837        let mut handle = ptr::null_mut();
1838        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1839        params.with_encoded_arguments(|mut arguments| unsafe {
1840            let params = runtime::cudaKernelNodeParams {
1841                func: function.as_raw().cast(),
1842                gridDim: config.grid_dim().into(),
1843                blockDim: config.block_dim().into(),
1844                sharedMemBytes: config.shared_memory_bytes_u32(),
1845                kernelParams: arguments.as_mut_ptr().cast(),
1846                extra: ptr::null_mut(),
1847            };
1848            try_ffi!(runtime::cudaGraphAddKernelNode(
1849                &raw mut handle,
1850                self.as_raw(),
1851                dependencies_raw.as_ptr(),
1852                dependencies_raw.len() as _,
1853                &raw const params,
1854            ))?;
1855            Ok(self.node_from_raw(handle))
1856        })
1857    }
1858
1859    /// Creates a new 1D memcpy node and adds it to the graph with the given dependencies.
1860    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
1861    ///
1862    /// When the graph is launched, the node copies `count` bytes from `src` to `dst`.
1863    /// The transfer direction is described by [`MemoryCopyKind`].
1864    /// [`MemoryCopyKind::Default`] is recommended when unified virtual addressing is available, in which case the transfer direction is inferred from the pointer values.
1865    /// Launching a memcpy node with `dst` and `src` pointers that do not match the direction of the copy results in undefined behavior.
1866    ///
1867    /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
1868    ///
1869    /// Graph objects are not threadsafe.
1870    ///
1871    /// # Safety
1872    ///
1873    /// CUDA stores the raw source and destination addresses in the graph node
1874    /// for later replay. The caller must ensure `params` remains valid
1875    /// according to [`Memcpy1DNodeParams::new`] for every graph instantiation
1876    /// and launch that can execute this node.
1877    ///
1878    /// # Errors
1879    ///
1880    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1881    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1882    /// not call CUDA functions; see [`Stream::add_callback`].
1883    pub unsafe fn add_memory_copy_node_1d(
1884        &mut self,
1885        dependencies: &[GraphNode],
1886        params: &MemoryCopy1DNodeParams,
1887    ) -> Result<GraphNode> {
1888        self.check_nodes(dependencies)?;
1889        let mut handle = ptr::null_mut();
1890        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1891        unsafe {
1892            try_ffi!(runtime::cudaGraphAddMemcpyNode1D(
1893                &raw mut handle,
1894                self.as_raw(),
1895                dependencies_raw.as_ptr(),
1896                dependencies_raw.len() as _,
1897                params.dst().cast(),
1898                params.src().cast(),
1899                params.count() as _,
1900                params.kind().into(),
1901            ))?;
1902            Ok(self.node_from_raw(handle))
1903        }
1904    }
1905
1906    /// Creates a device-to-device memcpy node from typed byte buffers.
1907    ///
1908    /// The node copies `src.byte_len()` bytes. `dst` must have at least that
1909    /// many bytes.
1910    ///
1911    /// # Safety
1912    ///
1913    /// CUDA stores the raw source and destination addresses in the graph node
1914    /// for later replay. The caller must ensure `dst` and `src` remain valid
1915    /// for every graph instantiation and launch that can execute this node.
1916    /// `dst` must not be accessed through another mutable path while graph
1917    /// launches using this node can write it.
1918    ///
1919    /// # Errors
1920    ///
1921    /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
1922    /// operation, if a previous asynchronous launch reported an error, or if CUDA
1923    /// reports runtime initialization diagnostics.
1924    pub unsafe fn add_memory_copy_node_1d_device_to_device<D, S>(
1925        &mut self,
1926        dependencies: &[GraphNode],
1927        dst: &mut D,
1928        src: &S,
1929    ) -> Result<GraphNode>
1930    where
1931        D: ByteBufferMut + ?Sized,
1932        S: ByteBuffer + ?Sized,
1933    {
1934        let count = src.byte_len();
1935        if dst.byte_len() < count {
1936            return Err(Error::InvalidMemoryAccess);
1937        }
1938        let params = unsafe {
1939            MemoryCopy1DNodeParams::new(
1940                dst.as_byte_mut_ptr().cast(),
1941                src.as_byte_ptr().cast(),
1942                count,
1943                MemoryCopyKind::DeviceToDevice,
1944            )
1945        };
1946        unsafe { self.add_memory_copy_node_1d(dependencies, &params) }
1947    }
1948
1949    /// Creates a device-to-device memcpy node between graph-retained buffers.
1950    ///
1951    /// The node copies `src.byte_len()` bytes. `dst` must have at least that
1952    /// many bytes. The graph retains both allocations so the baked CUDA graph
1953    /// pointers remain live for future instantiation and replay.
1954    ///
1955    /// # Errors
1956    ///
1957    /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
1958    /// operation, if a previous asynchronous launch reported an error, or if CUDA
1959    /// reports runtime initialization diagnostics.
1960    pub fn add_buffer_memory_copy_node_1d_device_to_device<T>(
1961        &mut self,
1962        dependencies: &[GraphNode],
1963        dst: &mut GraphBuffer<T>,
1964        src: &GraphBuffer<T>,
1965    ) -> Result<GraphNode>
1966    where
1967        T: DeviceRepr + Send + Sync,
1968    {
1969        self.check_buffer_contexts(dst, src)?;
1970        let count = src.byte_len();
1971        if dst.byte_len() < count {
1972            return Err(Error::InvalidMemoryAccess);
1973        }
1974        let params = unsafe {
1975            MemoryCopy1DNodeParams::new(
1976                dst.as_mut_ptr().cast(),
1977                src.as_ptr().cast(),
1978                count,
1979                MemoryCopyKind::DeviceToDevice,
1980            )
1981        };
1982        let node = unsafe { self.add_memory_copy_node_1d(dependencies, &params)? };
1983        self.retain_buffer(dst);
1984        self.retain_buffer(src);
1985        Ok(node)
1986    }
1987
1988    /// Creates a memcpy node and adds it to the graph with the given dependencies.
1989    /// The dependency list may be empty, in which case the node is placed at the
1990    /// graph root. It may not contain duplicate entries.
1991    ///
1992    /// When the graph is launched, the node performs the memcpy described by `params`.
1993    /// See [`sys::cudaMemcpy3D`](singe_cuda_sys::runtime::cudaMemcpy3D) for a description of the structure and its restrictions.
1994    ///
1995    /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
1996    ///
1997    /// Graph objects are not threadsafe.
1998    ///
1999    /// # Safety
2000    ///
2001    /// CUDA stores the raw source and destination addresses in the graph node
2002    /// for later replay. The caller must ensure `params` remains valid
2003    /// according to [`Memcpy3DNodeParams`] for every graph instantiation and
2004    /// launch that can execute this node.
2005    ///
2006    /// # Errors
2007    ///
2008    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2009    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2010    /// not call CUDA functions; see [`Stream::add_callback`].
2011    pub unsafe fn add_memory_copy_node(
2012        &mut self,
2013        dependencies: &[GraphNode],
2014        params: &MemoryCopy3DNodeParams,
2015    ) -> Result<GraphNode> {
2016        self.check_nodes(dependencies)?;
2017        let mut handle = ptr::null_mut();
2018        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2019        let params = params.into();
2020        unsafe {
2021            try_ffi!(runtime::cudaGraphAddMemcpyNode(
2022                &raw mut handle,
2023                self.as_raw(),
2024                dependencies_raw.as_ptr(),
2025                dependencies_raw.len() as _,
2026                &raw const params,
2027            ))?;
2028            Ok(self.node_from_raw(handle))
2029        }
2030    }
2031
2032    /// # Safety
2033    ///
2034    /// CUDA stores the raw symbol and source pointer in the graph node for
2035    /// later replay. The caller must ensure `params` remains valid according to
2036    /// [`MemcpyToSymbolNodeParams::new`] for every graph instantiation and
2037    /// launch that can execute this node.
2038    pub unsafe fn add_memory_copy_node_to_symbol(
2039        &mut self,
2040        dependencies: &[GraphNode],
2041        params: &MemoryCopyToSymbolNodeParams,
2042    ) -> Result<GraphNode> {
2043        self.check_nodes(dependencies)?;
2044        let mut handle = ptr::null_mut();
2045        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2046        unsafe {
2047            try_ffi!(runtime::cudaGraphAddMemcpyNodeToSymbol(
2048                &raw mut handle,
2049                self.as_raw(),
2050                dependencies_raw.as_ptr(),
2051                dependencies_raw.len() as _,
2052                params.symbol().cast(),
2053                params.src().cast(),
2054                params.count() as _,
2055                params.offset() as _,
2056                params.kind().into(),
2057            ))?;
2058            Ok(self.node_from_raw(handle))
2059        }
2060    }
2061
2062    /// # Safety
2063    ///
2064    /// CUDA stores the raw destination and symbol pointer in the graph node for
2065    /// later replay. The caller must ensure `params` remains valid according to
2066    /// [`MemoryCopyFromSymbolNodeParams::new`] for every graph instantiation and
2067    /// launch that can execute this node.
2068    pub unsafe fn add_memory_copy_node_from_symbol(
2069        &mut self,
2070        dependencies: &[GraphNode],
2071        params: &MemoryCopyFromSymbolNodeParams,
2072    ) -> Result<GraphNode> {
2073        self.check_nodes(dependencies)?;
2074        let mut handle = ptr::null_mut();
2075        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2076        unsafe {
2077            try_ffi!(runtime::cudaGraphAddMemcpyNodeFromSymbol(
2078                &raw mut handle,
2079                self.as_raw(),
2080                dependencies_raw.as_ptr(),
2081                dependencies_raw.len() as _,
2082                params.dst().cast(),
2083                params.symbol().cast(),
2084                params.count() as _,
2085                params.offset() as _,
2086                params.kind().into(),
2087            ))?;
2088            Ok(self.node_from_raw(handle))
2089        }
2090    }
2091
2092    /// Creates a new memset node and adds it to the graph with the given dependencies.
2093    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2094    ///
2095    /// The element size must be 1, 2, or 4 bytes.
2096    /// When the graph is launched, the node performs the memset described by `params`.
2097    ///
2098    /// Graph objects are not threadsafe.
2099    ///
2100    /// # Safety
2101    ///
2102    /// CUDA stores the destination address in the graph node for later replay.
2103    /// The caller must ensure `params` remains valid according to
2104    /// [`MemorySetNodeParams::new`] for every graph instantiation and launch that
2105    /// can execute this node.
2106    ///
2107    /// # Errors
2108    ///
2109    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2110    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2111    /// not call CUDA functions; see [`Stream::add_callback`].
2112    pub unsafe fn add_memory_set_node(
2113        &mut self,
2114        dependencies: &[GraphNode],
2115        params: &MemorySetNodeParams,
2116    ) -> Result<GraphNode> {
2117        self.check_nodes(dependencies)?;
2118        let mut handle = ptr::null_mut();
2119        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2120        let params = params.into();
2121        unsafe {
2122            try_ffi!(runtime::cudaGraphAddMemsetNode(
2123                &raw mut handle,
2124                self.as_raw(),
2125                dependencies_raw.as_ptr(),
2126                dependencies_raw.len() as _,
2127                &raw const params,
2128            ))?;
2129            Ok(self.node_from_raw(handle))
2130        }
2131    }
2132
2133    /// Creates a new node which executes an embedded graph, and adds it to the graph with the given dependencies.
2134    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2135    ///
2136    /// If `child_graph` contains allocation nodes, free nodes, or conditional nodes, this call returns an error.
2137    ///
2138    /// The node executes an embedded child graph.
2139    /// The child graph is cloned in this call.
2140    ///
2141    /// Graph objects are not threadsafe.
2142    ///
2143    /// # Errors
2144    ///
2145    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2146    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2147    /// not call CUDA functions; see [`Stream::add_callback`].
2148    pub fn add_child_graph_node(
2149        &mut self,
2150        dependencies: &[GraphNode],
2151        child_graph: &Self,
2152    ) -> Result<GraphNode> {
2153        self.check_nodes(dependencies)?;
2154        self.check_child_graph_context(child_graph)?;
2155        let mut handle = ptr::null_mut();
2156        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2157        unsafe {
2158            try_ffi!(runtime::cudaGraphAddChildGraphNode(
2159                &raw mut handle,
2160                self.as_raw(),
2161                dependencies_raw.as_ptr(),
2162                dependencies_raw.len() as _,
2163                child_graph.as_raw(),
2164            ))?;
2165            Ok(self.node_from_raw(handle))
2166        }
2167    }
2168
2169    /// Creates a new memory free node for a graph allocation and adds it to the graph.
2170    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2171    ///
2172    /// [`Graph::add_mem_free_node`] returns [`crate::error::Status::InvalidValue`] if the caller attempts to free:
2173    ///
2174    /// * an allocation twice in the same graph.
2175    /// * an address that was not returned by an allocation node.
2176    /// * an invalid address.
2177    ///
2178    /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
2179    ///
2180    /// * Nodes and edges of the graph cannot be deleted.
2181    /// * The graph can only be used in a child node if the ownership is moved to the parent.
2182    /// * Only one instantiation of the graph may exist at any point in time.
2183    /// * The graph cannot be cloned.
2184    ///
2185    /// Graph objects are not threadsafe.
2186    ///
2187    /// # Errors
2188    ///
2189    /// Returns [`Error::GraphNodeMismatch`] if `allocation` did not come from this
2190    /// graph. Returns an error if CUDA rejects the graph operation or if a
2191    /// previous asynchronous launch reported an error.
2192    pub fn add_memory_free_node(
2193        &mut self,
2194        dependencies: &[GraphNode],
2195        allocation: &MemoryAllocationNodeInfo,
2196    ) -> Result<GraphNode> {
2197        if allocation.graph_id != Some(self.id) {
2198            return Err(Error::GraphNodeMismatch);
2199        }
2200        unsafe { self.add_memory_free_node_raw(dependencies, allocation.ptr) }
2201    }
2202
2203    /// Creates a new memory free node from a raw device address.
2204    ///
2205    /// # Safety
2206    ///
2207    /// CUDA stores the raw address in the graph. The caller must ensure `ptr`
2208    /// is a graph allocation that may be freed by this graph, is ordered after
2209    /// the allocation node, and is not freed more than once or by another graph
2210    /// in a way that violates CUDA graph allocation ownership rules.
2211    pub unsafe fn add_memory_free_node_raw(
2212        &mut self,
2213        dependencies: &[GraphNode],
2214        ptr: DevicePtr,
2215    ) -> Result<GraphNode> {
2216        self.check_nodes(dependencies)?;
2217        let mut handle = ptr::null_mut();
2218        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2219        unsafe {
2220            try_ffi!(runtime::cudaGraphAddMemFreeNode(
2221                &raw mut handle,
2222                self.as_raw(),
2223                dependencies_raw.as_ptr(),
2224                dependencies_raw.len() as _,
2225                ptr.as_ptr() as _,
2226            ))?;
2227            Ok(self.node_from_raw(handle))
2228        }
2229    }
2230
2231    /// Creates a new allocation node and adds it to the graph with the given dependencies and allocation parameters.
2232    /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2233    ///
2234    /// When [`Graph::add_mem_alloc_node`] creates an allocation node, it returns the allocation metadata in [`MemoryAllocationNodeInfo`].
2235    /// The allocation's address remains fixed across instantiations and launches.
2236    ///
2237    /// If the allocation is freed in the same graph, by creating a free node using [`Graph::add_mem_free_node`], the allocation can be accessed by nodes ordered after the allocation node but before the free node.
2238    /// These allocations cannot be freed outside the owning graph, and they can only be freed once in the owning graph.
2239    ///
2240    /// If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the graph which are ordered after the allocation node, but also by stream operations ordered after the graph's execution but before the allocation is freed.
2241    ///
2242    /// Allocations which are not freed in the same graph can be freed by:
2243    ///
2244    /// * passing the allocation to [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) or [`DeviceMemory::free`](crate::memory::DeviceMemory::free);
2245    /// * launching a graph with a free node for that allocation; or
2246    /// * specifying [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`] during instantiation, which makes each launch behave as though it called [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) for every unfreed allocation.
2247    ///
2248    /// It is not possible to free an allocation in both the owning graph and another graph.
2249    /// If the allocation is freed in the same graph, a free node cannot be added to another graph.
2250    /// If the allocation is freed in another graph, a free node can no longer be added to the owning graph.
2251    ///
2252    /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
2253    ///
2254    /// * Nodes and edges of the graph cannot be deleted.
2255    /// * The graph can only be used in a child node if the ownership is moved to the parent.
2256    /// * Only one instantiation of the graph may exist at any point in time.
2257    /// * The graph cannot be cloned.
2258    ///
2259    /// Graph objects are not threadsafe.
2260    ///
2261    /// # Errors
2262    ///
2263    /// Returns an error if CUDA rejects the graph operation or if a previous asynchronous
2264    /// launch reported an error.
2265    pub fn add_memory_allocation_node(
2266        &mut self,
2267        dependencies: &[GraphNode],
2268        params: &MemoryAllocationNodeParams<'_>,
2269    ) -> Result<(GraphNode, MemoryAllocationNodeInfo)> {
2270        self.check_nodes(dependencies)?;
2271        let mut handle = ptr::null_mut();
2272        let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2273        let access_descs: Vec<_> = params
2274            .access_descs
2275            .iter()
2276            .copied()
2277            .map(Into::into)
2278            .collect();
2279        let mut params_raw = runtime::cudaMemAllocNodeParams {
2280            poolProps: params.pool_props.into(),
2281            accessDescs: access_descs.as_ptr(),
2282            accessDescCount: access_descs.len() as _,
2283            bytesize: params.byte_size as _,
2284            dptr: 0,
2285        };
2286        unsafe {
2287            try_ffi!(runtime::cudaGraphAddMemAllocNode(
2288                &raw mut handle,
2289                self.as_raw(),
2290                dependencies_raw.as_ptr(),
2291                dependencies_raw.len() as _,
2292                &raw mut params_raw,
2293            ))?;
2294            // TODO: verify dptr?
2295            let node = self.node_from_raw(handle);
2296            let allocation = MemoryAllocationNodeInfo::from_raw_in_graph(
2297                DevicePtr::new(params_raw.dptr as *mut ()),
2298                params.byte_size,
2299                self.id,
2300                Arc::clone(&self.inner),
2301                self.ctx.clone(),
2302            );
2303            Ok((node, allocation))
2304        }
2305    }
2306
2307    /// Returns this graph's nodes.
2308    ///
2309    /// Graph objects are not threadsafe.
2310    ///
2311    /// # Errors
2312    ///
2313    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2314    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2315    /// not call CUDA functions; see [`Stream::add_callback`].
2316    pub fn nodes(&self) -> Result<Vec<GraphNode>> {
2317        unsafe {
2318            let mut count = 0;
2319            try_ffi!(runtime::cudaGraphGetNodes(
2320                self.as_raw(),
2321                ptr::null_mut(),
2322                &raw mut count,
2323            ))?;
2324
2325            if count == 0 {
2326                return Ok(Vec::new());
2327            }
2328
2329            let mut handles = Vec::with_capacity(count as usize);
2330            try_ffi!(runtime::cudaGraphGetNodes(
2331                self.as_raw(),
2332                handles.as_mut_ptr(),
2333                &raw mut count,
2334            ))?;
2335            handles.set_len(count as usize);
2336
2337            Ok(handles
2338                .into_iter()
2339                .map(|handle| self.node_from_raw(handle))
2340                .collect())
2341        }
2342    }
2343
2344    /// Returns this graph's root nodes.
2345    ///
2346    /// Graph objects are not threadsafe.
2347    ///
2348    /// # Errors
2349    ///
2350    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2351    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2352    /// not call CUDA functions; see [`Stream::add_callback`].
2353    pub fn root_nodes(&self) -> Result<Vec<GraphNode>> {
2354        unsafe {
2355            let mut count = 0;
2356            try_ffi!(runtime::cudaGraphGetRootNodes(
2357                self.as_raw(),
2358                ptr::null_mut(),
2359                &raw mut count,
2360            ))?;
2361
2362            if count == 0 {
2363                return Ok(Vec::new());
2364            }
2365
2366            let mut handles = Vec::with_capacity(count as usize);
2367            try_ffi!(runtime::cudaGraphGetRootNodes(
2368                self.as_raw(),
2369                handles.as_mut_ptr(),
2370                &raw mut count,
2371            ))?;
2372            handles.set_len(count as usize);
2373
2374            Ok(handles
2375                .into_iter()
2376                .map(|handle| self.node_from_raw(handle))
2377                .collect())
2378        }
2379    }
2380
2381    /// Returns this graph's dependency edges.
2382    ///
2383    /// Graph objects are not threadsafe.
2384    ///
2385    /// # Errors
2386    ///
2387    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2388    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2389    /// not call CUDA functions; see [`Stream::add_callback`].
2390    pub fn edges(&self) -> Result<Vec<GraphEdge>> {
2391        unsafe {
2392            let mut count = 0;
2393            try_ffi!(runtime::cudaGraphGetEdges(
2394                self.as_raw(),
2395                ptr::null_mut(),
2396                ptr::null_mut(),
2397                ptr::null_mut(),
2398                &raw mut count,
2399            ))?;
2400
2401            if count == 0 {
2402                return Ok(Vec::new());
2403            }
2404
2405            let len = count as usize;
2406            let mut from = Vec::with_capacity(len);
2407            let mut to = Vec::with_capacity(len);
2408            let mut edge_data = Vec::with_capacity(len);
2409            try_ffi!(runtime::cudaGraphGetEdges(
2410                self.as_raw(),
2411                from.as_mut_ptr(),
2412                to.as_mut_ptr(),
2413                edge_data.as_mut_ptr(),
2414                &raw mut count,
2415            ))?;
2416            let len = count as usize;
2417            from.set_len(len);
2418            to.set_len(len);
2419            edge_data.set_len(len);
2420
2421            Ok(from
2422                .into_iter()
2423                .zip(to)
2424                .zip(edge_data)
2425                .map(|((from, to), data)| GraphEdge {
2426                    from: self.node_from_raw(from),
2427                    to: self.node_from_raw(to),
2428                    data: data.into(),
2429                })
2430                .collect())
2431        }
2432    }
2433
2434    /// Returns a compact summary of this graph's native CUDA topology.
2435    ///
2436    /// The summary is computed from CUDA graph introspection APIs and counts
2437    /// node kinds, root nodes, and dependency edges in this graph. Child graph
2438    /// nodes are counted as child nodes here; callers that need recursive
2439    /// details can query the child graph returned by [`GraphNode::child_graph`].
2440    ///
2441    /// Graph objects are not threadsafe.
2442    ///
2443    /// # Errors
2444    ///
2445    /// Returns an error if CUDA rejects a topology query, if a previous
2446    /// asynchronous launch reported an error, or if CUDA reports runtime
2447    /// initialization diagnostics.
2448    pub fn topology_summary(&self) -> Result<GraphTopologySummary> {
2449        let nodes = self.nodes()?;
2450        let mut summary = GraphTopologySummary {
2451            nodes: nodes.len(),
2452            root_nodes: self.root_nodes()?.len(),
2453            edges: self.edges()?.len(),
2454            ..GraphTopologySummary::default()
2455        };
2456        for node in nodes {
2457            summary.record_node_type(node.node_type()?);
2458        }
2459        Ok(summary)
2460    }
2461
2462    /// Writes a DOT-formatted description of the graph to `path`.
2463    /// By default this includes the graph topology, node types, node ID, kernel names, and memcpy direction.
2464    /// `flags` can request more detailed information about each node type, such as parameter values, kernel attributes, node handles, and function handles.
2465    ///
2466    /// # Errors
2467    ///
2468    /// Returns an error if `path` contains an interior NUL byte or if CUDA
2469    /// Runtime cannot write the DOT file.
2470    pub fn write_dot(&self, path: &str, flags: GraphDebugDotFlags) -> Result<()> {
2471        let path = CString::new(path)?;
2472        unsafe {
2473            try_ffi!(runtime::cudaGraphDebugDotPrint(
2474                self.as_raw(),
2475                path.as_ptr(),
2476                flags.bits(),
2477            ))?;
2478        }
2479        Ok(())
2480    }
2481
    /// Returns the raw CUDA graph handle stored in this graph's inner state.
    ///
    /// The handle is only copied out; this graph keeps holding it.
    pub fn as_raw(&self) -> runtime::cudaGraph_t {
        self.inner.handle
    }
2485
    /// Returns the context recorded for this graph, or `None` when it has none.
    pub fn context(&self) -> Option<&Context> {
        self.ctx.as_deref()
    }
2489
2490    /// Consumes the graph and returns the raw CUDA graph handle without
2491    /// destroying it.
2492    ///
2493    /// The caller becomes responsible for eventually destroying the returned
2494    /// handle with CUDA.
2495    pub fn into_raw(self) -> runtime::cudaGraph_t {
2496        let inner = Arc::try_unwrap(self.inner)
2497            .unwrap_or_else(|_| panic!("cannot take raw graph handle while it is still shared"));
2498        let inner = ManuallyDrop::new(inner);
2499        inner.handle
2500    }
2501}
2502
2503impl Drop for GraphInner {
2504    fn drop(&mut self) {
2505        if !self.owns_handle {
2506            return;
2507        }
2508        unsafe {
2509            if let Err(err) = try_ffi!(runtime::cudaGraphDestroy(self.handle)) {
2510                #[cfg(debug_assertions)]
2511                eprintln!("failed to destroy cuda graph: {err}");
2512            }
2513        }
2514    }
2515}
2516
2517impl<'graph> BorrowedGraph<'graph> {
2518    /// Wraps an existing CUDA graph handle without taking ownership.
2519    ///
2520    /// # Safety
2521    ///
2522    /// `handle` must be a valid CUDA graph handle for the returned lifetime.
2523    /// The returned graph view will not destroy `handle` when dropped.
2524    pub unsafe fn from_raw(handle: runtime::cudaGraph_t) -> Result<Self> {
2525        unsafe { Self::from_raw_in_context(handle, None) }
2526    }
2527
2528    /// Wraps an existing CUDA graph handle without taking ownership and keeps a
2529    /// modeled context association for safe graph operations through the
2530    /// borrowed view.
2531    ///
2532    /// # Safety
2533    ///
2534    /// `handle` must be a valid CUDA graph handle for the returned lifetime,
2535    /// and it must be associated with `ctx` when `ctx` is present. The returned
2536    /// graph view will not destroy `handle` when dropped.
2537    pub unsafe fn from_raw_in_context(
2538        handle: runtime::cudaGraph_t,
2539        ctx: Option<Arc<Context>>,
2540    ) -> Result<Self> {
2541        if handle.is_null() {
2542            return Err(Error::NullHandle);
2543        }
2544
2545        Ok(Self {
2546            graph: unsafe { Graph::from_raw_borrowed_in_context(handle, ctx) },
2547            _node: PhantomData,
2548        })
2549    }
2550
2551    pub const fn as_graph(&self) -> &Graph {
2552        &self.graph
2553    }
2554
2555    pub fn as_raw(&self) -> runtime::cudaGraph_t {
2556        self.graph.as_raw()
2557    }
2558}
2559
2560impl Deref for BorrowedGraph<'_> {
2561    type Target = Graph;
2562
2563    fn deref(&self) -> &Self::Target {
2564        self.as_graph()
2565    }
2566}
2567
/// An instantiated CUDA graph that can be launched or uploaded into a stream and whose node
/// parameters can be updated in place.
#[derive(Debug)]
pub struct ExecutableGraph {
    // Raw CUDA executable graph handle.
    handle: runtime::cudaGraphExec_t,
    // Context bound before CUDA calls made through this value; `None` skips the binding step.
    ctx: Option<Arc<Context>>,
    // Compared against a node's `graph_id` before a per-node update is accepted.
    source_graph_id: Option<GraphId>,
    // Shared inner state of the source graph. Never read in the code visible here; presumably
    // held only to keep the source graph alive (TODO confirm).
    _source_graph: Option<Arc<GraphInner>>,
    // Allocations held alongside this executable graph; appended to by `update` and by the
    // buffer-based memcpy node updates.
    retained: Vec<RetainedAllocation>,
}
2576
/// Borrowed launch operation for an [`ExecutableGraph`], created by
/// [`ExecutableGraph::launch_operation`].
#[derive(Debug, Clone, Copy)]
pub struct ExecutableGraphLaunchOperation<'graph> {
    // Executable graph this operation refers to.
    graph: &'graph ExecutableGraph,
}
2581
/// Owning wrapper around a raw CUDA executable graph handle.
///
/// Dropping the wrapper destroys the handle with `cudaGraphExecDestroy`; use
/// [`RawExecutableGraph::into_raw`] to take the handle back without destroying it.
#[derive(Debug)]
pub struct RawExecutableGraph {
    // Raw CUDA executable graph handle owned by this wrapper.
    handle: runtime::cudaGraphExec_t,
}
2586
2587impl RawExecutableGraph {
2588    /// Wraps an existing CUDA executable graph handle and takes ownership of it.
2589    ///
2590    /// # Safety
2591    ///
2592    /// `handle` must be a valid CUDA executable graph handle.
2593    /// Ownership of `handle` is transferred to the returned [`RawExecutableGraph`], and the handle must not be destroyed elsewhere after calling this function.
2594    pub unsafe fn from_raw(handle: runtime::cudaGraphExec_t) -> Result<Self> {
2595        if handle.is_null() {
2596            return Err(Error::NullHandle);
2597        }
2598
2599        Ok(Self { handle })
2600    }
2601
2602    pub const fn as_raw(&self) -> runtime::cudaGraphExec_t {
2603        self.handle
2604    }
2605
2606    /// Consumes the executable graph and returns the raw CUDA executable graph
2607    /// handle without destroying it.
2608    ///
2609    /// The caller becomes responsible for eventually destroying the returned
2610    /// handle with CUDA.
2611    pub fn into_raw(self) -> runtime::cudaGraphExec_t {
2612        let graph = ManuallyDrop::new(self);
2613        graph.as_raw()
2614    }
2615}
2616
2617impl Drop for RawExecutableGraph {
2618    fn drop(&mut self) {
2619        unsafe {
2620            if let Err(err) = try_ffi!(runtime::cudaGraphExecDestroy(self.handle)) {
2621                #[cfg(debug_assertions)]
2622                eprintln!("failed to destroy cuda graph exec: {err}");
2623            }
2624        }
2625    }
2626}
2627
2628impl ExecutableGraph {
2629    fn bind_context(&self) -> Result<()> {
2630        if let Some(ctx) = &self.ctx {
2631            ctx.bind()?;
2632        }
2633        Ok(())
2634    }
2635
2636    unsafe fn from_raw_with_graph(
2637        handle: runtime::cudaGraphExec_t,
2638        ctx: Option<Arc<Context>>,
2639        source_graph_id: Option<GraphId>,
2640        source_graph: Option<Arc<GraphInner>>,
2641        retained: Vec<RetainedAllocation>,
2642    ) -> Result<Self> {
2643        if handle.is_null() {
2644            return Err(Error::NullHandle);
2645        }
2646
2647        Ok(Self {
2648            handle,
2649            ctx,
2650            source_graph_id,
2651            _source_graph: source_graph,
2652            retained,
2653        })
2654    }
2655
2656    fn check_node(&self, node: &GraphNode) -> Result<()> {
2657        self.bind_context()?;
2658        if !matches!((self.source_graph_id, node.graph_id), (Some(source_id), Some(node_id)) if node_id == source_id)
2659        {
2660            return Err(Error::GraphNodeMismatch);
2661        }
2662        Ok(())
2663    }
2664
2665    fn retain_buffer<T>(&mut self, buffer: &GraphBuffer<T>)
2666    where
2667        T: DeviceRepr + Send + Sync,
2668    {
2669        self.retained.push(buffer.retained());
2670    }
2671
2672    /// Returns the flags that were passed to instantiation for the given executable graph.
2673    /// [`GraphInstantiateFlags::UPLOAD`] is not returned because it does not affect the resulting executable graph.
2674    ///
2675    /// Graph objects are not threadsafe.
2676    ///
2677    /// # Errors
2678    ///
2679    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2680    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2681    /// not call CUDA functions; see [`Stream::add_callback`].
2682    pub fn flags(&self) -> Result<GraphInstantiateFlags> {
2683        self.bind_context()?;
2684        let mut flags = 0;
2685        unsafe {
2686            try_ffi!(runtime::cudaGraphExecGetFlags(
2687                self.as_raw(),
2688                &raw mut flags
2689            ))?;
2690        }
2691        Ok(GraphInstantiateFlags::from_bits_retain(flags))
2692    }
2693
2694    /// Executes this executable graph in `stream`.
2695    /// Only one instance of this executable graph may be executing at a time.
2696    /// Each launch is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
2697    /// To execute a graph concurrently, it must be instantiated multiple times into multiple executable graphs.
2698    ///
2699    /// If any allocations created by this executable graph remain unfreed from a previous launch and the graph was not instantiated with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`], the launch fails with [`crate::error::Status::InvalidValue`].
2700    ///
2701    /// Graph objects are not threadsafe.
2702    ///
2703    /// # Errors
2704    ///
2705    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2706    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2707    /// not call CUDA functions; see [`Stream::add_callback`].
2708    pub fn launch(&self, stream: &Stream) -> Result<()> {
2709        if let Some(ctx) = &self.ctx
2710            && stream.context() != ctx.as_ref()
2711        {
2712            return Err(Error::StreamContextMismatch);
2713        }
2714        self.bind_context()?;
2715        unsafe {
2716            try_ffi!(runtime::cudaGraphLaunch(self.as_raw(), stream.as_raw()))?;
2717        }
2718        Ok(())
2719    }
2720
    /// Returns a reusable operation object that launches this executable graph.
    ///
    /// The returned operation only borrows this executable graph.
    pub const fn launch_operation(&self) -> ExecutableGraphLaunchOperation<'_> {
        ExecutableGraphLaunchOperation { graph: self }
    }
2725
2726    /// Uploads this executable graph to the device in `stream` without executing it.
2727    /// Uploads of the same executable graph are serialized.
2728    /// Each upload is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
2729    /// Uses memory cached by `stream` to back the allocations owned by this executable graph.
2730    ///
2731    /// # Errors
2732    ///
2733    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2734    /// reported an error, or if CUDA reports runtime initialization diagnostics.
2735    pub fn upload(&self, stream: &Stream) -> Result<()> {
2736        if let Some(ctx) = &self.ctx
2737            && stream.context() != ctx.as_ref()
2738        {
2739            return Err(Error::StreamContextMismatch);
2740        }
2741        self.bind_context()?;
2742        unsafe {
2743            try_ffi!(runtime::cudaGraphUpload(self.as_raw(), stream.as_raw()))?;
2744        }
2745        Ok(())
2746    }
2747
2748    /// Updates this executable graph with the node parameters in a topologically identical `graph`.
2749    ///
2750    /// Limitations:
2751    ///
2752    /// * Kernel nodes:
2753    ///   + The owning context of the kernel function cannot change.
2754    ///   + A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP.
2755    ///   + A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
2756    ///     update calls.
2757    ///   + A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
2758    ///   + If the graph was instantiated with [`GraphInstantiateFlags::USE_NODE_PRIORITY`], the priority attribute cannot change.
2759    ///     Equality
2760    ///     is checked on the originally requested priority values, before they are clamped to the device's supported range.
2761    ///   + If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
2762    ///     If no such calls were
2763    ///     present at instantiation, these updates cannot be performed at all.
2764    ///   + Neither the source graph nor this executable graph may contain device-updatable kernel nodes.
2765    /// * Memset and memcpy nodes:
2766    ///   + The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
2767    ///   + The source/destination memory must be allocated from the same contexts as the original source/destination memory.
2768    ///   + For 2D memsets, only address and assigned value may be updated.
2769    ///   + For 1D memsets, updating dimensions is also allowed, but may fail if the resulting operation does not map onto the work resources
2770    ///     already allocated for the node.
2771    /// * Additional memcpy node restrictions:
2772    ///   + Changing either the source or destination memory type, such as [`MemoryType::Device`](crate::types::MemoryType::Device) or [`MemoryType::Array`](crate::types::MemoryType::Array), is not supported.
2773    /// * Conditional nodes:
2774    ///   + Changing node parameters is not supported.
2775    ///   + Changing parameters of nodes within the conditional body graph is subject to the rules above.
2776    ///   + Conditional handle flags and default values are updated as part of the graph update.
2777    ///
2778    /// CUDA may add further restrictions in future releases.
2779    /// [`ExecutableGraph::update`] sets the update result to [`GraphExecUpdateResult::ErrorTopologyChanged`] under the following conditions:
2780    ///
2781    /// * The count of nodes directly in the executable graph and the source graph differ.
2782    /// * The source graph has more exit nodes.
2783    /// * A node in the source graph has a different number of dependencies than the paired node from the executable graph.
2784    /// * A node in the source graph has a dependency that does not match the corresponding dependency of the paired node from the executable graph.
2785    ///   The dependencies are paired based on edge order and
2786    ///   a dependency does not match when the nodes are already paired based on other edges examined in the graph.
2787    ///
2788    /// [`ExecutableGraph::update`] sets the update result to:
2789    ///
2790    /// * [`GraphExecUpdateResult::Error`] if passed an invalid value.
2791    /// * [`GraphExecUpdateResult::ErrorTopologyChanged`] if the graph topology changed.
2792    /// * [`GraphExecUpdateResult::ErrorNodeTypeChanged`] if the type of a node changed.
2793    /// * [`GraphExecUpdateResult::ErrorFunctionChanged`] if the kernel function of a node changed (CUDA driver before 11.2).
2794    /// * [`GraphExecUpdateResult::ErrorUnsupportedFunctionChange`] if the kernel function changed in an unsupported way.
2795    /// * [`GraphExecUpdateResult::ErrorParametersChanged`] if any parameters to a node changed in a way that is not supported.
2796    /// * [`GraphExecUpdateResult::ErrorAttributesChanged`] if any attributes of a node changed in a way that is not supported.
2797    /// * [`GraphExecUpdateResult::ErrorNotSupported`] if something about a node is unsupported, like the node's type or configuration.
2798    ///
2799    /// If the update fails for a reason not listed above, the result is [`GraphExecUpdateResult::Error`].
2800    /// If the update succeeds, the result is [`GraphExecUpdateResult::Success`].
2801    ///
2802    /// [`ExecutableGraph::update`] succeeds when the update was performed successfully.
2803    /// It returns [`crate::error::Status::GraphExecUpdateFailure`] if the graph update was not performed because it included changes which violated constraints specific to instantiated graph update.
2804    ///
2805    /// Graph objects are not threadsafe.
2806    ///
2807    /// # Errors
2808    ///
2809    /// Returns an error if CUDA rejects the graph update, if the update violates instantiated graph
2810    /// update constraints, or if a previous asynchronous launch reported an error. CUDA may also
2811    /// return initialization-related errors such as [`crate::error::Status::NotInitialized`],
2812    /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`] if this call initializes
2813    /// internal runtime state. Callbacks must not call CUDA functions; see
2814    /// [`Stream::add_callback`].
2815    pub fn update(&mut self, graph: &Graph) -> Result<ExecutableGraphUpdate> {
2816        if let (Some(exec_ctx), Some(graph_ctx)) = (&self.ctx, &graph.ctx)
2817            && exec_ctx.as_ref() != graph_ctx.as_ref()
2818        {
2819            return Err(Error::GraphContextMismatch);
2820        }
2821        self.bind_context()?;
2822        let mut result_info = runtime::cudaGraphExecUpdateResultInfo::default();
2823        unsafe {
2824            try_ffi!(runtime::cudaGraphExecUpdate(
2825                self.as_raw(),
2826                graph.as_raw(),
2827                &raw mut result_info,
2828            ))?;
2829        }
2830        self.retained.extend(graph.retained.iter().cloned());
2831        Ok(ExecutableGraphUpdate::from_result_info(result_info, graph))
2832    }
2833
2834    /// Sets the parameters of a kernel node in this executable graph.
2835    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
2836    ///
2837    /// `node` must not have been removed from the original graph.
2838    /// All node parameters may change, but the following restrictions apply to function updates:
2839    ///
2840    /// * The owning device of the kernel function cannot change.
    /// * A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP.
2842    /// * A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
2843    ///   update calls.
2844    /// * If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
2845    ///   If no such calls were
2846    ///   present at instantiation, these updates cannot be performed at all.
2847    ///
2848    /// The modifications only affect future launches of this executable graph.
2849    /// Already enqueued or running launches of this executable graph are not affected by this call.
2850    /// The original `node` is also not modified by this call.
2851    ///
2852    /// If `node` is a device-updatable kernel node, the next upload or launch of this executable graph will overwrite any previous device-side updates.
2853    /// Additionally, applying host updates to a device-updatable kernel node while it is being updated from the device results in undefined behavior.
2854    /// This can also be used with a runtime kernel handle queried through [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) and then passed as a raw pointer.
2855    /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must be registered with the same CUDA Runtime instance.
2856    /// Passing a symbol that belongs to a different runtime instance results in undefined behavior.
2857    /// The only type that can be reliably passed to a different runtime instance is the runtime kernel handle type itself.
2858    ///
2859    /// Graph objects are not threadsafe.
2860    ///
2861    /// # Safety
2862    ///
2863    /// CUDA copies the kernel argument values during this call and stores those
2864    /// copied values in the executable graph for future launches. If an
2865    /// argument value is itself a pointer, only the pointer address is copied.
2866    /// The caller must ensure every copied pointer value remains valid for
2867    /// every future launch that can execute this node. Mutable pointer
2868    /// arguments must also remain exclusive for the work ordered by those
2869    /// launches.
2870    ///
2871    /// # Errors
2872    ///
2873    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2874    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2875    /// not call CUDA functions; see [`Stream::add_callback`].
2876    pub unsafe fn set_kernel_node_params<'a, P>(
2877        &mut self,
2878        node: GraphNode,
2879        function: DeviceFunction,
2880        config: &LaunchConfig,
2881        params: P,
2882    ) -> Result<()>
2883    where
2884        P: KernelLaunchArgs<'a>,
2885    {
2886        self.check_node(&node)?;
2887        params.with_encoded_arguments(|mut arguments| unsafe {
2888            let params = runtime::cudaKernelNodeParams {
2889                func: function.as_raw().cast(),
2890                gridDim: config.grid_dim().into(),
2891                blockDim: config.block_dim().into(),
2892                sharedMemBytes: config.shared_memory_bytes_u32(),
2893                kernelParams: arguments.as_mut_ptr().cast(),
2894                extra: ptr::null_mut(),
2895            };
2896            try_ffi!(runtime::cudaGraphExecKernelNodeSetParams(
2897                self.as_raw(),
2898                node.as_raw(),
2899                &raw const params,
2900            ))?;
2901            Ok(())
2902        })
2903    }
2904
2905    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
2906    /// `node` must remain in the graph which was used to instantiate this executable graph.
2907    /// Changed edges to and from `node` are ignored.
2908    ///
2909    /// The source and destination must be allocated from the same contexts as the original source and destination memory.
2910    /// The instantiation-time memory operands must be 1-dimensional.
2911    /// Zero-length operations are not supported.
2912    ///
2913    /// The modifications only affect future launches of this executable graph.
2914    /// Already enqueued or running launches of this executable graph are not affected by this call.
2915    /// The original `node` is also not modified by this call.
2916    ///
2917    /// Returns [`crate::error::Status::InvalidValue`] if the memory operands' mappings changed or the original memory operands are multidimensional.
2918    ///
2919    /// Graph objects are not threadsafe.
2920    ///
2921    /// # Safety
2922    ///
2923    /// CUDA stores the raw source and destination addresses in the executable
2924    /// graph for future launches. The caller must ensure `params` remains
    /// valid according to [`MemoryCopy1DNodeParams::new`] for every future launch
2926    /// that can execute this node.
2927    ///
2928    /// # Errors
2929    ///
2930    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2931    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2932    /// not call CUDA functions; see [`Stream::add_callback`].
2933    pub unsafe fn set_memory_copy_node_1d_params(
2934        &mut self,
2935        node: GraphNode,
2936        params: &MemoryCopy1DNodeParams,
2937    ) -> Result<()> {
2938        self.check_node(&node)?;
2939        unsafe {
2940            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams1D(
2941                self.as_raw(),
2942                node.as_raw(),
2943                params.dst().cast(),
2944                params.src().cast(),
2945                params.count() as _,
2946                params.kind().into(),
2947            ))?;
2948        }
2949        Ok(())
2950    }
2951
2952    /// Updates a memcpy node to copy between typed device byte buffers.
2953    ///
2954    /// The node copies `src.byte_len()` bytes. `dst` must have at least that
2955    /// many bytes.
2956    ///
2957    /// # Safety
2958    ///
2959    /// CUDA stores the raw source and destination addresses in the executable
2960    /// graph for future launches. The caller must ensure `dst` and `src`
2961    /// remain valid for every future launch that can execute this node. `dst`
2962    /// must not be accessed through another mutable path while graph launches
2963    /// using this node can write it.
2964    ///
2965    /// # Errors
2966    ///
2967    /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
2968    /// operation, if a previous asynchronous launch reported an error, or if CUDA
2969    /// reports runtime initialization diagnostics.
2970    pub unsafe fn set_memory_copy_node_1d_device_to_device<D, S>(
2971        &mut self,
2972        node: GraphNode,
2973        dst: &mut D,
2974        src: &S,
2975    ) -> Result<()>
2976    where
2977        D: ByteBufferMut + ?Sized,
2978        S: ByteBuffer + ?Sized,
2979    {
2980        let count = src.byte_len();
2981        if dst.byte_len() < count {
2982            return Err(Error::InvalidMemoryAccess);
2983        }
2984        let params = unsafe {
2985            MemoryCopy1DNodeParams::new(
2986                dst.as_byte_mut_ptr().cast(),
2987                src.as_byte_ptr().cast(),
2988                count,
2989                MemoryCopyKind::DeviceToDevice,
2990            )
2991        };
2992        unsafe { self.set_memory_copy_node_1d_params(node, &params) }
2993    }
2994
2995    /// Updates a memcpy node to copy between graph-retained buffers.
2996    ///
2997    /// The node copies `src.byte_len()` bytes. `dst` must have at least that
2998    /// many bytes. The executable graph retains both allocations so future
2999    /// launches cannot outlive the baked CUDA pointer values.
3000    ///
3001    /// # Errors
3002    ///
3003    /// Returns an error if `dst` is smaller than `src`, if `node` does not
3004    /// belong to the graph used to instantiate this executable graph, if CUDA
3005    /// rejects the graph update, if a previous asynchronous launch reported an
3006    /// error, or if CUDA reports runtime initialization diagnostics.
3007    pub fn set_buffer_memory_copy_node_1d_device_to_device<T>(
3008        &mut self,
3009        node: GraphNode,
3010        dst: &mut GraphBuffer<T>,
3011        src: &GraphBuffer<T>,
3012    ) -> Result<()>
3013    where
3014        T: DeviceRepr + Send + Sync,
3015    {
3016        if let (Some(exec_ctx), Some(dst_ctx)) = (&self.ctx, dst.context())
3017            && exec_ctx.as_ref() != dst_ctx
3018        {
3019            return Err(Error::GraphContextMismatch);
3020        }
3021        if let (Some(exec_ctx), Some(src_ctx)) = (&self.ctx, src.context())
3022            && exec_ctx.as_ref() != src_ctx
3023        {
3024            return Err(Error::GraphContextMismatch);
3025        }
3026        let count = src.byte_len();
3027        if dst.byte_len() < count {
3028            return Err(Error::InvalidMemoryAccess);
3029        }
3030        let params = unsafe {
3031            MemoryCopy1DNodeParams::new(
3032                dst.as_mut_ptr().cast(),
3033                src.as_ptr().cast(),
3034                count,
3035                MemoryCopyKind::DeviceToDevice,
3036            )
3037        };
3038        unsafe { self.set_memory_copy_node_1d_params(node, &params)? };
3039        self.retain_buffer(dst);
3040        self.retain_buffer(src);
3041        Ok(())
3042    }
3043
3044    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3045    /// `node` must remain in the graph which was used to instantiate this executable graph.
3046    /// Changed edges to and from `node` are ignored.
3047    ///
3048    /// The source and destination memory in `params` must be allocated from the same contexts as the original source and destination memory.
3049    /// Both the instantiation-time memory operands and the memory operands in `params` must be 1-dimensional.
3050    /// Zero-length operations are not supported.
3051    ///
3052    /// The modifications only affect future launches of this executable graph.
3053    /// Already enqueued or running launches of this executable graph are not affected by this call.
3054    /// The original `node` is also not modified by this call.
3055    ///
3056    /// Returns [`crate::error::Status::InvalidValue`] if the memory operands' mappings changed or either the original or new memory operands are multidimensional.
3057    ///
3058    /// Graph objects are not threadsafe.
3059    ///
3060    /// # Safety
3061    ///
3062    /// CUDA stores the raw source and destination addresses in the executable
3063    /// graph for future launches. The caller must ensure `params` remains
3064    /// valid according to [`MemoryCopy3DNodeParams`] for every future launch that
3065    /// can execute this node.
3066    ///
3067    /// # Errors
3068    ///
3069    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3070    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3071    /// not call CUDA functions; see [`Stream::add_callback`].
3072    pub unsafe fn set_memory_copy_node_params(
3073        &mut self,
3074        node: GraphNode,
3075        params: &MemoryCopy3DNodeParams,
3076    ) -> Result<()> {
3077        self.check_node(&node)?;
3078        let params = params.into();
3079        unsafe {
3080            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams(
3081                self.as_raw(),
3082                node.as_raw(),
3083                &raw const params,
3084            ))?;
3085        }
3086        Ok(())
3087    }
3088
3089    /// # Safety
3090    ///
3091    /// CUDA stores the raw symbol and source pointer in the executable graph
3092    /// for future launches. The caller must ensure `params` remains valid
3093    /// according to [`MemoryCopyToSymbolNodeParams::new`] for every future launch
3094    /// that can execute this node.
3095    pub unsafe fn set_memory_copy_node_to_symbol_params(
3096        &mut self,
3097        node: GraphNode,
3098        params: &MemoryCopyToSymbolNodeParams,
3099    ) -> Result<()> {
3100        self.check_node(&node)?;
3101        unsafe {
3102            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsToSymbol(
3103                self.as_raw(),
3104                node.as_raw(),
3105                params.symbol().cast(),
3106                params.src().cast(),
3107                params.count() as _,
3108                params.offset() as _,
3109                params.kind().into(),
3110            ))?;
3111        }
3112        Ok(())
3113    }
3114
3115    /// # Safety
3116    ///
3117    /// CUDA stores the raw destination and symbol pointer in the executable
3118    /// graph for future launches. The caller must ensure `params` remains
3119    /// valid according to [`MemoryCopyFromSymbolNodeParams::new`] for every future
3120    /// launch that can execute this node.
3121    pub unsafe fn set_memory_copy_node_from_symbol_params(
3122        &mut self,
3123        node: GraphNode,
3124        params: &MemoryCopyFromSymbolNodeParams,
3125    ) -> Result<()> {
3126        self.check_node(&node)?;
3127        unsafe {
3128            try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsFromSymbol(
3129                self.as_raw(),
3130                node.as_raw(),
3131                params.dst().cast(),
3132                params.symbol().cast(),
3133                params.count() as _,
3134                params.offset() as _,
3135                params.kind().into(),
3136            ))?;
3137        }
3138        Ok(())
3139    }
3140
3141    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3142    /// `node` must remain in the graph which was used to instantiate this executable graph.
3143    /// Changed edges to and from `node` are ignored.
3144    ///
3145    /// Zero-sized operations are not supported.
3146    ///
3147    /// The new destination pointer in `params` must be to the same kind of allocation as the original destination pointer and have the same context association and device mapping as the original destination pointer.
3148    ///
3149    /// Both the value and pointer address may be updated.
3150    /// Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
3151    /// Specifically, for 2D memsets, all dimension changes are rejected.
3152    /// For 1D memsets, changes in height are explicitly rejected and other changes are opportunistically allowed if the resulting work maps onto the work resources already allocated for the node.
3153    ///
3154    /// The modifications only affect future launches of this executable graph.
3155    /// Already enqueued or running launches of this executable graph are not affected by this call.
3156    /// The original `node` is also not modified by this call.
3157    ///
3158    /// Graph objects are not threadsafe.
3159    ///
3160    /// # Safety
3161    ///
3162    /// CUDA stores the raw destination address in the executable graph for
3163    /// future launches. The caller must ensure `params` remains valid according
3164    /// to [`MemorySetNodeParams::new`] for every future launch that can execute
3165    /// this node.
3166    ///
3167    /// # Errors
3168    ///
3169    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3170    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3171    /// not call CUDA functions; see [`Stream::add_callback`].
3172    pub unsafe fn set_memory_set_node_params(
3173        &mut self,
3174        node: GraphNode,
3175        params: &MemorySetNodeParams,
3176    ) -> Result<()> {
3177        self.check_node(&node)?;
3178        let params = params.into();
3179        unsafe {
3180            try_ffi!(runtime::cudaGraphExecMemsetNodeSetParams(
3181                self.as_raw(),
3182                node.as_raw(),
3183                &raw const params,
3184            ))?;
3185        }
3186        Ok(())
3187    }
3188
3189    /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3190    /// `node` must remain in the graph which was used to instantiate this executable graph.
3191    /// Changed edges to and from `node` are ignored.
3192    ///
3193    /// The modifications only affect future launches of this executable graph.
3194    /// Already enqueued or running launches of this executable graph are not affected by this call.
3195    /// The original `node` is also not modified by this call.
3196    ///
3197    /// Graph objects are not threadsafe.
3198    ///
3199    /// # Safety
3200    ///
3201    /// CUDA stores the raw callback function and user-data pointer in the
3202    /// executable graph for future launches. The caller must ensure `params`
3203    /// remains valid according to [`HostNodeParams::new`] for every future
3204    /// launch that can execute this node.
3205    ///
3206    /// # Errors
3207    ///
3208    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3209    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3210    /// not call CUDA functions; see [`Stream::add_callback`].
3211    pub unsafe fn set_host_node_params(
3212        &mut self,
3213        node: GraphNode,
3214        params: &HostNodeParams,
3215    ) -> Result<()> {
3216        self.check_node(&node)?;
3217        let params = params.into();
3218        unsafe {
3219            try_ffi!(runtime::cudaGraphExecHostNodeSetParams(
3220                self.as_raw(),
3221                node.as_raw(),
3222                &raw const params,
3223            ))?;
3224        }
3225        Ok(())
3226    }
3227
3228    /// Sets the event of an event record node in this executable graph.
3229    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3230    ///
3231    /// The modifications only affect future launches of this executable graph.
3232    /// Already enqueued or running launches of this executable graph are not affected by this call.
3233    /// The original `node` is also not modified by this call.
3234    ///
3235    /// Graph objects are not threadsafe.
3236    ///
3237    /// # Errors
3238    ///
3239    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3240    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3241    /// not call CUDA functions; see [`Stream::add_callback`].
3242    pub fn set_event_record_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
3243        self.check_node(&node)?;
3244        if let Some(ctx) = &self.ctx
3245            && ctx.as_ref() != event.context()
3246        {
3247            return Err(Error::GraphContextMismatch);
3248        }
3249        unsafe {
3250            try_ffi!(runtime::cudaGraphExecEventRecordNodeSetEvent(
3251                self.as_raw(),
3252                node.as_raw(),
3253                event.as_raw(),
3254            ))?;
3255        }
3256        Ok(())
3257    }
3258
3259    /// Updates the work represented by `node` in this executable graph as though the nodes contained in `node`'s graph had the parameters contained in `child_graph`'s nodes at instantiation.
3260    /// `node` must remain in the graph which was used to instantiate this executable graph.
3261    /// Changed edges to and from `node` are ignored.
3262    ///
3263    /// The modifications only affect future launches of this executable graph.
3264    /// Already enqueued or running launches of this executable graph are not affected by this call.
3265    /// The original `node` is also not modified by this call.
3266    ///
3267    /// The topology of `child_graph`, as well as the node insertion order, must match that of the graph contained in `node`.
3268    /// See [`ExecutableGraph::update`] for a list of restrictions on what can be updated in an instantiated graph.
3269    /// The update is recursive, so child graph nodes contained within the top-level child graph are also updated.
3270    ///
3271    /// Graph objects are not threadsafe.
3272    ///
3273    /// # Errors
3274    ///
3275    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3276    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3277    /// not call CUDA functions; see [`Stream::add_callback`].
3278    pub fn set_child_graph_node(&mut self, node: GraphNode, child_graph: &Graph) -> Result<()> {
3279        self.check_node(&node)?;
3280        if let (Some(exec_ctx), Some(child_ctx)) = (&self.ctx, &child_graph.ctx)
3281            && exec_ctx.as_ref() != child_ctx.as_ref()
3282        {
3283            return Err(Error::GraphContextMismatch);
3284        }
3285        unsafe {
3286            try_ffi!(runtime::cudaGraphExecChildGraphNodeSetParams(
3287                self.as_raw(),
3288                node.as_raw(),
3289                child_graph.as_raw(),
3290            ))?;
3291        }
3292        Ok(())
3293    }
3294
3295    /// Sets the event of an event wait node in this executable graph.
3296    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3297    ///
3298    /// The modifications only affect future launches of this executable graph.
3299    /// Already enqueued or running launches of this executable graph are not affected by this call.
3300    /// The original `node` is also not modified by this call.
3301    ///
3302    /// Graph objects are not threadsafe.
3303    ///
3304    /// # Errors
3305    ///
3306    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3307    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3308    /// not call CUDA functions; see [`Stream::add_callback`].
3309    pub fn set_event_wait_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
3310        self.check_node(&node)?;
3311        unsafe {
3312            try_ffi!(runtime::cudaGraphExecEventWaitNodeSetEvent(
3313                self.as_raw(),
3314                node.as_raw(),
3315                event.as_raw(),
3316            ))?;
3317        }
3318        Ok(())
3319    }
3320
3321    /// Sets `node` to be either enabled or disabled.
3322    /// Disabled nodes are functionally equivalent to empty nodes until they are reenabled.
3323    /// Existing node parameters are not affected by disabling/enabling the node.
3324    ///
3325    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3326    ///
3327    /// `node` must not have been removed from the original graph.
3328    ///
3329    /// The modifications only affect future launches of this executable graph.
3330    /// Already enqueued or running launches of this executable graph are not affected by this call.
3331    /// The original `node` is also not modified by this call.
3332    ///
3333    /// Currently only kernel, memset and memcpy nodes are supported.
3334    ///
3335    /// Graph objects are not threadsafe.
3336    ///
3337    /// # Errors
3338    ///
3339    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3340    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3341    /// not call CUDA functions; see [`Stream::add_callback`].
3342    fn set_node_enabled(&mut self, node: GraphNode, enabled: bool) -> Result<()> {
3343        self.check_node(&node)?;
3344        unsafe {
3345            try_ffi!(runtime::cudaGraphNodeSetEnabled(
3346                self.as_raw(),
3347                node.as_raw(),
3348                u32::from(enabled),
3349            ))?;
3350        }
3351        Ok(())
3352    }
3353
3354    pub fn enable_node(&mut self, node: GraphNode) -> Result<()> {
3355        self.set_node_enabled(node, true)
3356    }
3357
3358    pub fn disable_node(&mut self, node: GraphNode) -> Result<()> {
3359        self.set_node_enabled(node, false)
3360    }
3361
3362    /// Returns whether `node` is enabled.
3363    ///
3364    /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3365    ///
3366    /// `node` must not have been removed from the original graph.
3367    ///
3368    /// Currently only kernel, memset and memcpy nodes are supported.
3369    ///
3370    /// Graph objects are not threadsafe.
3371    ///
3372    /// # Errors
3373    ///
3374    /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3375    /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3376    /// not call CUDA functions; see [`Stream::add_callback`].
3377    pub fn is_node_enabled(&self, node: GraphNode) -> Result<bool> {
3378        self.check_node(&node)?;
3379        let mut enabled = 0;
3380        unsafe {
3381            try_ffi!(runtime::cudaGraphNodeGetEnabled(
3382                self.as_raw(),
3383                node.as_raw(),
3384                &raw mut enabled,
3385            ))?;
3386        }
3387        Ok(enabled != 0)
3388    }
3389
3390    pub const fn as_raw(&self) -> runtime::cudaGraphExec_t {
3391        self.handle
3392    }
3393
3394    pub fn context(&self) -> Option<&Context> {
3395        self.ctx.as_deref()
3396    }
3397
3398    /// Consumes the executable graph and returns the raw CUDA executable graph
3399    /// handle without destroying it.
3400    ///
3401    /// The caller becomes responsible for eventually destroying the returned
3402    /// handle with CUDA.
3403    pub fn into_raw(self) -> runtime::cudaGraphExec_t {
3404        let graph = ManuallyDrop::new(self);
3405        graph.as_raw()
3406    }
3407}
3408
3409impl Drop for ExecutableGraph {
3410    fn drop(&mut self) {
3411        unsafe {
3412            if let Err(err) = try_ffi!(runtime::cudaGraphExecDestroy(self.handle)) {
3413                #[cfg(debug_assertions)]
3414                eprintln!("failed to destroy cuda graph exec: {err}");
3415            }
3416        }
3417    }
3418}
3419
3420impl ExecutableGraphLaunchOperation<'_> {
3421    /// Enqueues this graph launch in `stream`.
3422    ///
3423    /// # Errors
3424    ///
3425    /// Returns an error if CUDA rejects the graph operation, if `stream` belongs to a different context, if a previous asynchronous launch reported an error, or if CUDA reports runtime initialization diagnostics.
3426    pub fn enqueue(self, stream: &Stream) -> Result<()> {
3427        self.graph.launch(stream)
3428    }
3429
3430    pub const fn graph(&self) -> &ExecutableGraph {
3431        self.graph
3432    }
3433}
3434
3435#[derive(Debug, Clone, PartialEq, Eq)]
3436pub struct ExecutableGraphUpdate {
3437    pub result: GraphExecUpdateResult,
3438    pub error_node: Option<GraphNode>,
3439    pub error_from_node: Option<GraphNode>,
3440}
3441
3442impl ExecutableGraphUpdate {
3443    fn from_result_info(value: runtime::cudaGraphExecUpdateResultInfo, graph: &Graph) -> Self {
3444        Self {
3445            result: value.result.into(),
3446            error_node: if value.errorNode.is_null() {
3447                None
3448            } else {
3449                Some(graph.node_from_raw(value.errorNode))
3450            },
3451            error_from_node: if value.errorFromNode.is_null() {
3452                None
3453            } else {
3454                Some(graph.node_from_raw(value.errorFromNode))
3455            },
3456        }
3457    }
3458}
3459
3460#[derive(Debug, Clone, Copy)]
3461pub struct MemorySetNodeParams {
3462    dst: DevicePtr,
3463    pitch: usize,
3464    value: u32,
3465    element_size: u32,
3466    width: usize,
3467    height: usize,
3468}
3469
3470impl MemorySetNodeParams {
3471    /// Creates raw memset node parameters.
3472    ///
3473    /// # Safety
3474    ///
3475    /// `dst` must be valid for writes of `element_size * width` bytes when the
3476    /// graph executes. If `height` or `pitch` are changed after construction,
3477    /// the caller must account for those values as required by CUDA.
3478    pub const unsafe fn new(dst: DevicePtr, element_size: u32, width: usize) -> Self {
3479        Self {
3480            dst,
3481            pitch: 0,
3482            value: 0,
3483            element_size,
3484            width,
3485            height: 1,
3486        }
3487    }
3488
3489    pub const fn with_pitch(mut self, pitch: usize) -> Self {
3490        self.pitch = pitch;
3491        self
3492    }
3493
3494    pub const fn with_value(mut self, value: u32) -> Self {
3495        self.value = value;
3496        self
3497    }
3498
3499    pub const fn with_height(mut self, height: usize) -> Self {
3500        self.height = height;
3501        self
3502    }
3503
3504    pub const fn dst(self) -> DevicePtr {
3505        self.dst
3506    }
3507
3508    pub const fn pitch(self) -> usize {
3509        self.pitch
3510    }
3511
3512    pub const fn value(self) -> u32 {
3513        self.value
3514    }
3515
3516    pub const fn element_size(self) -> u32 {
3517        self.element_size
3518    }
3519
3520    pub const fn width(self) -> usize {
3521        self.width
3522    }
3523
3524    pub const fn height(self) -> usize {
3525        self.height
3526    }
3527}
3528
3529impl From<&MemorySetNodeParams> for driver::CUDA_MEMSET_NODE_PARAMS {
3530    fn from(value: &MemorySetNodeParams) -> Self {
3531        Self {
3532            dst: value.dst().as_ptr() as _,
3533            pitch: value.pitch() as _,
3534            value: value.value(),
3535            elementSize: value.element_size(),
3536            width: value.width() as _,
3537            height: value.height() as _,
3538        }
3539    }
3540}