singe_cuda/graph/mod.rs
1pub mod raw;
2
3use std::{
4 any::Any,
5 ffi::CString,
6 fmt::{self, Display, Formatter},
7 hash::{Hash, Hasher},
8 marker::PhantomData,
9 mem::ManuallyDrop,
10 ops::Deref,
11 ptr,
12 sync::{
13 Arc,
14 atomic::{AtomicU64, Ordering},
15 },
16};
17
18use num_enum::{IntoPrimitive, TryFromPrimitive};
19use singe_core::{impl_enum_conversion, impl_enum_display};
20use singe_cuda_sys::{driver, runtime};
21
22use crate::{
23 context::Context,
24 dim::Dim3,
25 error::{Error, Result},
26 event::Event,
27 graph::raw::{HostNodeParams, MemoryCopyFromSymbolNodeParams, MemoryCopyToSymbolNodeParams},
28 memory::{DeviceMemory, MemoryAccessDescriptor, MemoryCopyKind, MemoryPoolProps},
29 module::{KernelLaunchArgs, LaunchConfig},
30 stream::Stream,
31 try_ffi,
32 types::{DeviceFunction, DevicePtr},
33 view::{ByteBuffer, ByteBufferMut, DeviceRepr},
34};
35use raw::{MemoryCopy1DNodeParams, MemoryCopy3DNodeParams};
36
/// Identifiers for [`GraphKernelNodeAttribute`] values used by CUDA graph kernel nodes.
///
/// Each discriminant is taken from the `runtime::cudaLaunchAttributeID`
/// constant of the matching name, and the enum is `#[repr(u32)]` so it can be
/// converted to and from the raw identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphKernelNodeAttributeId {
    /// Identifies [`GraphKernelNodeAttribute::Cooperative`].
    Cooperative = runtime::cudaLaunchAttributeID::cudaLaunchAttributeCooperative as _,
    /// Identifies [`GraphKernelNodeAttribute::ClusterDimension`].
    ClusterDimension = runtime::cudaLaunchAttributeID::cudaLaunchAttributeClusterDimension as _,
    /// Identifies [`GraphKernelNodeAttribute::Priority`].
    Priority = runtime::cudaLaunchAttributeID::cudaLaunchAttributePriority as _,
    /// Identifies [`GraphKernelNodeAttribute::PreferredSharedMemoryCarveout`].
    /// The value is a percentage in the range `0..=100` describing the preferred
    /// shared-memory carveout for the launch. This is a hint, and the driver
    /// may choose a different configuration if required.
    PreferredSharedMemoryCarveout =
        runtime::cudaLaunchAttributeID::cudaLaunchAttributePreferredSharedMemoryCarveout as _,
}
55
// Conversions between `GraphKernelNodeAttributeId`, its `u32` representation,
// and `runtime::cudaLaunchAttributeID`. The exact impls come from
// `singe_core::impl_enum_conversion!`; the methods in this file rely on
// `id.into()` yielding the raw CUDA identifier.
impl_enum_conversion!(
    u32,
    runtime::cudaLaunchAttributeID,
    GraphKernelNodeAttributeId
);
61
/// A kernel-node attribute together with its value.
///
/// Each variant corresponds to the [`GraphKernelNodeAttributeId`] variant of
/// the same name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum GraphKernelNodeAttribute {
    /// Cooperative-launch flag; stored in CUDA as an integer (`0` means `false`).
    Cooperative(bool),
    /// Cluster dimensions as `x`, `y`, `z`.
    ClusterDimension(Dim3),
    /// Launch priority value.
    Priority(i32),
    /// Preferred shared-memory carveout; see
    /// [`GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout`].
    PreferredSharedMemoryCarveout(u32),
}
70
/// Device pointer and size reported by a memory-allocation graph node.
///
/// Equality and hashing consider only `ptr`, `byte_size`, and `graph_id`.
#[derive(Debug, Clone)]
pub struct MemoryAllocationNodeInfo {
    // Device address returned by the allocation node.
    ptr: DevicePtr,
    /// Size of the allocation in bytes.
    pub byte_size: usize,
    // Identity of the graph the node was queried from, when known.
    graph_id: Option<GraphId>,
    // Never read; presumably held only to keep the graph alive while this
    // info exists — TODO confirm against `GraphInner`'s drop behavior.
    _graph: Option<Arc<GraphInner>>,
    // Context associated with the node, when known.
    ctx: Option<Arc<Context>>,
}
79
bitflags::bitflags! {
    /// Graph instantiation flags.
    ///
    /// Each constant takes its bit value from the
    /// `driver::CUgraphInstantiate_flags` member of the matching name.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphInstantiateFlags: u64 {
        const AUTO_FREE_ON_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH as _;
        const UPLOAD = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD as _;
        const DEVICE_LAUNCH = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH as _;
        const USE_NODE_PRIORITY = driver::CUgraphInstantiate_flags::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY as _;
    }
}
89
bitflags::bitflags! {
    /// Flags selecting what is included in a graph's debug DOT output.
    ///
    /// Each constant takes its bit value from the
    /// `driver::CUgraphDebugDot_flags` member of the matching name.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct GraphDebugDotFlags: u32 {
        const VERBOSE = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE as _;
        const RUNTIME_TYPES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES as _;
        const KERNEL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS as _;
        const MEMCPY_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS as _;
        const MEMSET_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS as _;
        const HOST_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS as _;
        const EVENT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_SIGNAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS as _;
        const EXTERNAL_SEMAPHORE_WAIT_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS as _;
        const KERNEL_NODE_ATTRIBUTES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES as _;
        const HANDLES = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES as _;
        const MEMORY_ALLOC_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS as _;
        const MEMORY_FREE_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS as _;
        const BATCH_MEM_OP_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS as _;
        const EXTRA_TOPOLOGY_INFO = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO as _;
        const CONDITIONAL_NODE_PARAMS = driver::CUgraphDebugDot_flags::CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS as _;
    }
}
111
/// Kind of a CUDA graph node, as returned by [`GraphNode::node_type`].
///
/// Discriminants are taken from the `driver::CUgraphNodeType` constants of the
/// matching name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphNodeType {
    Kernel = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL as _,
    Memcpy = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMCPY as _,
    Memset = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEMSET as _,
    Host = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_HOST as _,
    /// A child-graph node (counted as `child_graph_nodes` in
    /// [`GraphTopologySummary`]).
    Graph = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_GRAPH as _,
    Empty = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EMPTY as _,
    WaitEvent = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_WAIT_EVENT as _,
    EventRecord = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EVENT_RECORD as _,
    ExternalSemaphoresSignal = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL as _,
    ExternalSemaphoresWait = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT as _,
    MemoryAlloc = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_ALLOC as _,
    MemoryFree = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_MEM_FREE as _,
    BatchMemOp = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_BATCH_MEM_OP as _,
    Conditional = driver::CUgraphNodeType::CU_GRAPH_NODE_TYPE_CONDITIONAL as _,
}
131
// Conversions between `GraphNodeType`, `u32`, and `runtime::cudaGraphNodeType`;
// `GraphNode::node_type` relies on `.into()` from the raw runtime value.
impl_enum_conversion!(u32, runtime::cudaGraphNodeType, GraphNodeType);
133
/// Type of a dependency edge between two graph nodes.
///
/// Stored as a `u8`, with discriminants taken from
/// `driver::CUgraphDependencyType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u8)]
#[non_exhaustive]
pub enum GraphDependencyType {
    /// The default dependency type; also used by [`GraphEdgeData::default`].
    Default = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT as _,
    /// Mirrors `CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC`.
    Programmatic = driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC as _,
}
141
142impl From<driver::CUgraphDependencyType> for GraphDependencyType {
143 fn from(value: driver::CUgraphDependencyType) -> Self {
144 match value {
145 driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT => Self::Default,
146 driver::CUgraphDependencyType::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC => {
147 Self::Programmatic
148 }
149 }
150 }
151}
152
153impl From<GraphDependencyType> for driver::CUgraphDependencyType {
154 fn from(value: GraphDependencyType) -> Self {
155 match value {
156 GraphDependencyType::Default => Self::CU_GRAPH_DEPENDENCY_TYPE_DEFAULT,
157 GraphDependencyType::Programmatic => Self::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC,
158 }
159 }
160}
161
// Display text for every `GraphNodeType` variant, using runtime-API style
// (`cudaGraphNodeType…`) names.
impl_enum_display!(GraphNodeType, {
    Self::Kernel => "cudaGraphNodeTypeKernel",
    Self::Memcpy => "cudaGraphNodeTypeMemcpy",
    Self::Memset => "cudaGraphNodeTypeMemset",
    Self::Host => "cudaGraphNodeTypeHost",
    Self::Graph => "cudaGraphNodeTypeGraph",
    Self::Empty => "cudaGraphNodeTypeEmpty",
    Self::WaitEvent => "cudaGraphNodeTypeWaitEvent",
    Self::EventRecord => "cudaGraphNodeTypeEventRecord",
    Self::ExternalSemaphoresSignal => "cudaGraphNodeTypeExternalSemaphoresSignal",
    Self::ExternalSemaphoresWait => "cudaGraphNodeTypeExternalSemaphoresWait",
    Self::MemoryAlloc => "cudaGraphNodeTypeMemAlloc",
    Self::MemoryFree => "cudaGraphNodeTypeMemFree",
    Self::BatchMemOp => "cudaGraphNodeTypeBatchMemOp",
    Self::Conditional => "cudaGraphNodeTypeConditional",
});
178
/// Result codes for an executable-graph update.
///
/// Discriminants are taken from the `driver::CUgraphExecUpdateResult`
/// constants of the matching name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
#[non_exhaustive]
pub enum GraphExecUpdateResult {
    Success = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_SUCCESS as _,
    Error = driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR as _,
    ErrorTopologyChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED as _,
    ErrorNodeTypeChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED as _,
    ErrorFunctionChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED as _,
    ErrorParametersChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED as _,
    ErrorNotSupported =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED as _,
    ErrorUnsupportedFunctionChange =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE
            as _,
    ErrorAttributesChanged =
        driver::CUgraphExecUpdateResult::CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED as _,
}
201
// Conversions between `GraphExecUpdateResult` and the driver enum; the exact
// impls are generated by `singe_core::impl_enum_conversion!`.
impl_enum_conversion!(driver::CUgraphExecUpdateResult, GraphExecUpdateResult);
203
// Display text for every `GraphExecUpdateResult` variant: the name of the
// driver constant it mirrors.
impl_enum_display!(GraphExecUpdateResult, {
    Self::Success => "CU_GRAPH_EXEC_UPDATE_SUCCESS",
    Self::Error => "CU_GRAPH_EXEC_UPDATE_ERROR",
    Self::ErrorTopologyChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED",
    Self::ErrorNodeTypeChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED",
    Self::ErrorFunctionChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED",
    Self::ErrorParametersChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED",
    Self::ErrorNotSupported => "CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED",
    Self::ErrorUnsupportedFunctionChange => "CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE",
    Self::ErrorAttributesChanged => "CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED",
});
215
/// Token for a node inside a CUDA graph.
///
/// Besides the raw node handle it optionally records which graph the node
/// belongs to and which context to bind before calling CUDA. Equality and
/// hashing use only `handle` and `graph_id`.
#[derive(Debug, Clone)]
pub struct GraphNode {
    // Raw CUDA node handle.
    handle: runtime::cudaGraphNode_t,
    // Identity of the owning graph; `None` for nodes built with `from_raw`.
    graph_id: Option<GraphId>,
    // Shared handle of the owning graph; `None` for nodes built with `from_raw`.
    graph: Option<Arc<GraphInner>>,
    // Context bound by `bind_context` before each CUDA call, when present.
    ctx: Option<Arc<Context>>,
}
223
224impl PartialEq for GraphNode {
225 fn eq(&self, other: &Self) -> bool {
226 self.handle == other.handle && self.graph_id == other.graph_id
227 }
228}
229impl Eq for GraphNode {}
230
231impl Hash for GraphNode {
232 fn hash<H: Hasher>(&self, state: &mut H) {
233 self.handle.hash(state);
234 self.graph_id.hash(state);
235 }
236}
237
/// Per-edge data attached to a graph dependency.
///
/// Converts to and from `runtime::cudaGraphEdgeData`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphEdgeData {
    /// Value of the raw `from_port` field.
    pub from_port: u8,
    /// Value of the raw `to_port` field.
    pub to_port: u8,
    /// Dependency type of the edge.
    pub dependency_type: GraphDependencyType,
}
244
/// A neighbouring node together with the data of the edge leading to it, as
/// returned by [`GraphNode::dependencies`] and [`GraphNode::dependent_nodes`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphDependency {
    /// The node at the other end of the edge.
    pub node: GraphNode,
    /// Data attached to the edge.
    pub data: GraphEdgeData,
}
250
/// A directed edge between two graph nodes, with its edge data.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphEdge {
    /// Node the edge starts at.
    pub from: GraphNode,
    /// Node the edge ends at.
    pub to: GraphNode,
    /// Data attached to the edge.
    pub data: GraphEdgeData,
}
257
/// Counters describing a graph's topology.
///
/// The per-type counters (`kernel_nodes` … `conditional_nodes`) each
/// correspond to one [`GraphNodeType`] variant; all counters default to zero.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct GraphTopologySummary {
    /// Total number of nodes.
    pub nodes: usize,
    /// Number of root nodes.
    pub root_nodes: usize,
    /// Number of edges.
    pub edges: usize,
    pub kernel_nodes: usize,
    pub memcpy_nodes: usize,
    pub memset_nodes: usize,
    pub host_nodes: usize,
    /// Nodes of type [`GraphNodeType::Graph`].
    pub child_graph_nodes: usize,
    pub empty_nodes: usize,
    pub wait_event_nodes: usize,
    pub event_record_nodes: usize,
    pub external_semaphores_signal_nodes: usize,
    pub external_semaphores_wait_nodes: usize,
    pub memory_alloc_nodes: usize,
    pub memory_free_nodes: usize,
    pub batch_mem_op_nodes: usize,
    pub conditional_nodes: usize,
}
278
/// A three-component position; converts into `runtime::cudaPos`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Position {
    pub x: usize,
    pub y: usize,
    pub z: usize,
}
285
/// A three-component extent; converts into `runtime::cudaExtent`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Extent {
    pub width: usize,
    pub height: usize,
    pub depth: usize,
}
292
/// Parameters describing a memory-allocation graph node.
///
/// NOTE(review): the code consuming these parameters is outside this chunk;
/// field meanings below are taken from the names and types only.
#[derive(Debug, Clone)]
pub struct MemoryAllocationNodeParams<'a> {
    /// Memory pool properties for the allocation.
    pub pool_props: MemoryPoolProps,
    /// Access descriptors, borrowed for `'a`.
    pub access_descs: &'a [MemoryAccessDescriptor],
    /// Requested allocation size in bytes.
    pub byte_size: usize,
}
299
300impl Default for GraphEdgeData {
301 fn default() -> Self {
302 Self {
303 from_port: 0,
304 to_port: 0,
305 dependency_type: GraphDependencyType::Default,
306 }
307 }
308}
309
310impl From<runtime::cudaGraphEdgeData> for GraphEdgeData {
311 fn from(value: runtime::cudaGraphEdgeData) -> Self {
312 Self {
313 from_port: value.from_port,
314 to_port: value.to_port,
315 dependency_type: GraphDependencyType::try_from(value.type_)
316 .unwrap_or(GraphDependencyType::Default),
317 }
318 }
319}
320
321impl From<GraphEdgeData> for runtime::cudaGraphEdgeData {
322 fn from(value: GraphEdgeData) -> Self {
323 Self {
324 from_port: value.from_port,
325 to_port: value.to_port,
326 type_: value.dependency_type.into(),
327 reserved: [0; 5],
328 }
329 }
330}
331
332impl From<Position> for runtime::cudaPos {
333 fn from(value: Position) -> Self {
334 Self {
335 x: value.x as _,
336 y: value.y as _,
337 z: value.z as _,
338 }
339 }
340}
341
342impl From<Extent> for runtime::cudaExtent {
343 fn from(value: Extent) -> Self {
344 Self {
345 width: value.width as _,
346 height: value.height as _,
347 depth: value.depth as _,
348 }
349 }
350}
351
352impl GraphNode {
353 /// Wraps an existing CUDA graph node handle.
354 ///
355 /// The returned node is not associated with any [`Graph`] identity, so
356 /// graph and executable-graph methods cannot validate that it belongs to
357 /// the target graph before calling CUDA.
358 ///
359 /// # Safety
360 ///
361 /// `handle` must be a valid CUDA graph node handle. The caller must ensure
362 /// the node remains valid for every operation using the returned token and
363 /// that it belongs to the graph or executable graph passed to those
364 /// operations.
365 pub const unsafe fn from_raw(handle: runtime::cudaGraphNode_t) -> Self {
366 Self {
367 handle,
368 graph_id: None,
369 graph: None,
370 ctx: None,
371 }
372 }
373
374 fn from_raw_in_graph(
375 handle: runtime::cudaGraphNode_t,
376 graph_id: GraphId,
377 graph: Arc<GraphInner>,
378 ctx: Option<Arc<Context>>,
379 ) -> Self {
380 Self {
381 handle,
382 graph_id: Some(graph_id),
383 graph: Some(graph),
384 ctx,
385 }
386 }
387
388 fn from_raw_like(handle: runtime::cudaGraphNode_t, node: &Self) -> Self {
389 Self {
390 handle,
391 graph_id: node.graph_id,
392 graph: node.graph.clone(),
393 ctx: node.ctx.clone(),
394 }
395 }
396
397 fn bind_context(&self) -> Result<()> {
398 if let Some(ctx) = &self.ctx {
399 ctx.bind()?;
400 }
401 Ok(())
402 }
403
404 /// Returns the node type.
405 ///
406 /// Graph objects are not threadsafe.
407 ///
408 /// # Errors
409 ///
410 /// Returns an error if CUDA cannot query the node type or if a previous asynchronous launch
411 /// reported an error. CUDA may also return initialization-related errors such as
412 /// [`crate::error::Status::NotInitialized`], [`crate::error::Status::CallRequiresNewerDriver`], or
413 /// [`crate::error::Status::NoDevice`] if this call initializes internal runtime state. Callbacks must not
414 /// call CUDA functions; see [`Stream::add_callback`].
415 pub fn node_type(&self) -> Result<GraphNodeType> {
416 self.bind_context()?;
417 let mut kind = runtime::cudaGraphNodeType::CU_GRAPH_NODE_TYPE_KERNEL;
418 unsafe {
419 try_ffi!(runtime::cudaGraphNodeGetType(self.as_raw(), &raw mut kind))?;
420 }
421 Ok(kind.into())
422 }
423
424 /// Returns this node's dependencies.
425 ///
426 /// Graph objects are not threadsafe.
427 ///
428 /// # Errors
429 ///
430 /// Returns an error if CUDA cannot query the dependencies, a previous
431 /// asynchronous launch reports an error, or CUDA reports runtime
432 /// initialization diagnostics.
433 pub fn dependencies(&self) -> Result<Vec<GraphDependency>> {
434 self.bind_context()?;
435 unsafe {
436 let mut count = 0;
437 try_ffi!(runtime::cudaGraphNodeGetDependencies(
438 self.as_raw(),
439 ptr::null_mut(),
440 ptr::null_mut(),
441 &raw mut count,
442 ))?;
443
444 if count == 0 {
445 return Ok(Vec::new());
446 }
447
448 let mut handles = Vec::with_capacity(count as usize);
449 let mut edge_data = Vec::with_capacity(count as usize);
450 try_ffi!(runtime::cudaGraphNodeGetDependencies(
451 self.as_raw(),
452 handles.as_mut_ptr(),
453 edge_data.as_mut_ptr(),
454 &raw mut count,
455 ))?;
456 handles.set_len(count as usize);
457 edge_data.set_len(count as usize);
458
459 Ok(handles
460 .into_iter()
461 .zip(edge_data)
462 .map(|(handle, data)| GraphDependency {
463 node: Self::from_raw_like(handle, self),
464 data: data.into(),
465 })
466 .collect())
467 }
468 }
469
470 /// Returns this node's dependent nodes.
471 ///
472 /// Graph objects are not threadsafe.
473 ///
474 /// # Errors
475 ///
476 /// Returns an error if CUDA cannot query the dependent nodes, a previous
477 /// asynchronous launch reports an error, or CUDA reports runtime
478 /// initialization diagnostics.
479 pub fn dependent_nodes(&self) -> Result<Vec<GraphDependency>> {
480 self.bind_context()?;
481 unsafe {
482 let mut count = 0;
483 try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
484 self.as_raw(),
485 ptr::null_mut(),
486 ptr::null_mut(),
487 &raw mut count,
488 ))?;
489
490 if count == 0 {
491 return Ok(Vec::new());
492 }
493
494 let mut handles = Vec::with_capacity(count as usize);
495 let mut edge_data = Vec::with_capacity(count as usize);
496 try_ffi!(runtime::cudaGraphNodeGetDependentNodes(
497 self.as_raw(),
498 handles.as_mut_ptr(),
499 edge_data.as_mut_ptr(),
500 &raw mut count,
501 ))?;
502 handles.set_len(count as usize);
503 edge_data.set_len(count as usize);
504
505 Ok(handles
506 .into_iter()
507 .zip(edge_data)
508 .map(|(handle, data)| GraphDependency {
509 node: Self::from_raw_like(handle, self),
510 data: data.into(),
511 })
512 .collect())
513 }
514 }
515
516 /// Returns the event of this event record node.
517 ///
518 /// Graph objects are not threadsafe.
519 ///
520 /// # Errors
521 ///
522 /// Returns an error if this is not an event-record node, CUDA cannot query
523 /// the event, CUDA returns a null event handle, a previous asynchronous
524 /// launch reports an error, or CUDA reports runtime initialization
525 /// diagnostics.
526 pub fn event_record_node_event(&self) -> Result<runtime::cudaEvent_t> {
527 self.bind_context()?;
528 let mut event = ptr::null_mut();
529 unsafe {
530 try_ffi!(runtime::cudaGraphEventRecordNodeGetEvent(
531 self.as_raw(),
532 &raw mut event,
533 ))?;
534 }
535 if event.is_null() {
536 return Err(Error::NullHandle);
537 }
538 Ok(event)
539 }
540
541 /// Returns the event of this event wait node.
542 ///
543 /// Graph objects are not threadsafe.
544 ///
545 /// # Errors
546 ///
547 /// Returns an error if this is not an event-wait node, CUDA cannot query the
548 /// event, CUDA returns a null event handle, a previous asynchronous launch
549 /// reports an error, or CUDA reports runtime initialization diagnostics.
550 pub fn event_wait_node_event(&self) -> Result<runtime::cudaEvent_t> {
551 self.bind_context()?;
552 let mut event = ptr::null_mut();
553 unsafe {
554 try_ffi!(runtime::cudaGraphEventWaitNodeGetEvent(
555 self.as_raw(),
556 &raw mut event,
557 ))?;
558 }
559 if event.is_null() {
560 return Err(Error::NullHandle);
561 }
562 Ok(event)
563 }
564
565 /// Returns a borrowed handle to the embedded graph in a child graph node.
566 /// This does not clone the graph.
567 /// Changes to the returned graph are reflected in the node, and the child
568 /// node retains ownership of the embedded graph handle.
569 /// The returned [`BorrowedGraph`] is tied to this node borrow and does not
570 /// destroy the embedded graph when dropped.
571 ///
572 /// Allocation and free nodes cannot be added to the returned graph.
573 /// Attempting to do so returns an error.
574 ///
575 /// Graph objects are not threadsafe.
576 ///
577 /// # Errors
578 ///
579 /// Returns an error if this is not a child-graph node, CUDA cannot query the
580 /// child graph, CUDA returns a null graph handle, a previous asynchronous
581 /// launch reports an error, or CUDA reports runtime initialization
582 /// diagnostics.
583 pub fn child_graph(&self) -> Result<BorrowedGraph<'_>> {
584 self.bind_context()?;
585 let mut graph = ptr::null_mut();
586 unsafe {
587 try_ffi!(runtime::cudaGraphChildGraphNodeGetGraph(
588 self.as_raw(),
589 &raw mut graph,
590 ))?;
591 }
592 if graph.is_null() {
593 return Err(Error::NullHandle);
594 }
595 unsafe { BorrowedGraph::from_raw_in_context(graph, self.ctx.clone()) }
596 }
597
598 /// Returns the parameters of this memcpy node.
599 ///
600 /// Graph objects are not threadsafe.
601 ///
602 /// # Errors
603 ///
604 /// Returns an error if this is not a memcpy node, CUDA cannot query the
605 /// parameters, a previous asynchronous launch reports an error, or CUDA
606 /// reports runtime initialization diagnostics.
607 pub fn memcpy_node_params(&self) -> Result<runtime::cudaMemcpy3DParms> {
608 self.bind_context()?;
609 let mut params = runtime::cudaMemcpy3DParms::default();
610 unsafe {
611 try_ffi!(runtime::cudaGraphMemcpyNodeGetParams(
612 self.as_raw(),
613 &raw mut params,
614 ))?;
615 }
616 Ok(params)
617 }
618
619 /// Returns the parameters of this memset node.
620 ///
621 /// Graph objects are not threadsafe.
622 ///
623 /// # Errors
624 ///
625 /// Returns an error if this is not a memset node, CUDA cannot query the
626 /// parameters, a previous asynchronous launch reports an error, or CUDA
627 /// reports runtime initialization diagnostics.
628 pub fn memset_node_params(&self) -> Result<driver::CUDA_MEMSET_NODE_PARAMS> {
629 self.bind_context()?;
630 let mut params = driver::CUDA_MEMSET_NODE_PARAMS::default();
631 unsafe {
632 try_ffi!(runtime::cudaGraphMemsetNodeGetParams(
633 self.as_raw(),
634 &raw mut params,
635 ))?;
636 }
637 Ok(params)
638 }
639
640 /// Returns the parameters of this host node.
641 ///
642 /// Graph objects are not threadsafe.
643 ///
644 /// # Errors
645 ///
646 /// Returns an error if this is not a host node, CUDA cannot query the
647 /// parameters, a previous asynchronous launch reports an error, or CUDA
648 /// reports runtime initialization diagnostics.
649 pub fn host_node_params(&self) -> Result<driver::CUDA_HOST_NODE_PARAMS> {
650 self.bind_context()?;
651 let mut params = driver::CUDA_HOST_NODE_PARAMS::default();
652 unsafe {
653 try_ffi!(runtime::cudaGraphHostNodeGetParams(
654 self.as_raw(),
655 &raw mut params,
656 ))?;
657 }
658 Ok(params)
659 }
660
661 /// Returns the parameters of a memory allocation node.
662 /// The `poolProps` and `accessDescs` values in the returned parameters are owned by the node.
663 /// This memory remains valid until the node is destroyed.
664 /// The returned parameters must not be modified.
665 ///
666 /// Graph objects are not threadsafe.
667 ///
668 /// # Errors
669 ///
670 /// Returns an error if this is not a memory-allocation node, CUDA cannot
671 /// query the parameters, a previous asynchronous launch reports an error,
672 /// or CUDA reports runtime initialization diagnostics.
673 pub fn mem_alloc_node_info(&self) -> Result<MemoryAllocationNodeInfo> {
674 self.bind_context()?;
675 let mut params = runtime::cudaMemAllocNodeParams::default();
676 unsafe {
677 try_ffi!(runtime::cudaGraphMemAllocNodeGetParams(
678 self.as_raw(),
679 &raw mut params,
680 ))?;
681 }
682 Ok(MemoryAllocationNodeInfo::from_raw(
683 unsafe { DevicePtr::new(params.dptr as _) },
684 params.bytesize as usize,
685 self.graph_id,
686 self.graph.clone(),
687 self.ctx.clone(),
688 ))
689 }
690
691 /// Returns the address of this memory free node.
692 ///
693 /// Graph objects are not threadsafe.
694 ///
695 /// # Errors
696 ///
697 /// Returns an error if this is not a memory-free node, CUDA cannot query the
698 /// pointer, a previous asynchronous launch reports an error, or CUDA reports
699 /// runtime initialization diagnostics.
700 ///
701 /// # Safety
702 ///
703 /// The node must still be a valid memory-free node in a live graph, and the
704 /// returned pointer must not be used after the graph frees it.
705 pub unsafe fn mem_free_node_ptr(&self) -> Result<DevicePtr> {
706 self.bind_context()?;
707 let mut ptr = ptr::null_mut();
708 unsafe {
709 try_ffi!(runtime::cudaGraphMemFreeNodeGetParams(
710 self.as_raw(),
711 &raw mut ptr as *mut _,
712 ))?;
713 }
714 Ok(unsafe { DevicePtr::new(ptr as _) })
715 }
716
717 /// Returns the requested kernel node attribute.
718 ///
719 /// # Errors
720 ///
721 /// Returns an error if this is not a kernel node, CUDA cannot query the
722 /// attribute, or a previous asynchronous launch reports an error.
723 pub fn kernel_node_attribute(
724 self,
725 id: GraphKernelNodeAttributeId,
726 ) -> Result<GraphKernelNodeAttribute> {
727 self.bind_context()?;
728 let mut value = runtime::cudaLaunchAttributeValue::default();
729 unsafe {
730 try_ffi!(runtime::cudaGraphKernelNodeGetAttribute(
731 self.as_raw(),
732 id.into(),
733 &raw mut value,
734 ))?;
735
736 Ok(match id {
737 GraphKernelNodeAttributeId::Cooperative => {
738 GraphKernelNodeAttribute::Cooperative(*value.cooperative.as_ref() != 0)
739 }
740 GraphKernelNodeAttributeId::ClusterDimension => {
741 let dim = value.clusterDim.as_ref();
742 GraphKernelNodeAttribute::ClusterDimension(Dim3::new(dim.x, dim.y, dim.z))
743 }
744 GraphKernelNodeAttributeId::Priority => {
745 GraphKernelNodeAttribute::Priority(*value.priority.as_ref())
746 }
747 GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout => {
748 GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(
749 *value.sharedMemCarveout.as_ref(),
750 )
751 }
752 })
753 }
754 }
755
756 /// Sets a kernel node attribute.
757 ///
758 /// # Errors
759 ///
760 /// Returns an error if this is not a kernel node, CUDA rejects the
761 /// attribute update, or a previous asynchronous launch reports an error.
762 pub fn set_kernel_node_attribute(&mut self, attribute: GraphKernelNodeAttribute) -> Result<()> {
763 self.bind_context()?;
764 let (id, value) = match attribute {
765 GraphKernelNodeAttribute::Cooperative(value) => {
766 let mut attr = runtime::cudaLaunchAttributeValue {
767 cooperative: runtime::__BindgenUnionField::new(),
768 ..runtime::cudaLaunchAttributeValue::default()
769 };
770 unsafe { *attr.cooperative.as_mut() = i32::from(value) };
771 (GraphKernelNodeAttributeId::Cooperative, attr)
772 }
773 GraphKernelNodeAttribute::ClusterDimension(value) => {
774 let mut attr = runtime::cudaLaunchAttributeValue {
775 clusterDim: runtime::__BindgenUnionField::new(),
776 ..runtime::cudaLaunchAttributeValue::default()
777 };
778 unsafe {
779 *attr.clusterDim.as_mut() = runtime::cudaLaunchAttributeValue__bindgen_ty_1 {
780 x: value.x,
781 y: value.y,
782 z: value.z,
783 };
784 }
785 (GraphKernelNodeAttributeId::ClusterDimension, attr)
786 }
787 GraphKernelNodeAttribute::Priority(value) => {
788 let mut attr = runtime::cudaLaunchAttributeValue {
789 priority: runtime::__BindgenUnionField::new(),
790 ..runtime::cudaLaunchAttributeValue::default()
791 };
792 unsafe { *attr.priority.as_mut() = value as _ };
793 (GraphKernelNodeAttributeId::Priority, attr)
794 }
795 GraphKernelNodeAttribute::PreferredSharedMemoryCarveout(value) => {
796 let mut attr = runtime::cudaLaunchAttributeValue {
797 sharedMemCarveout: runtime::__BindgenUnionField::new(),
798 ..runtime::cudaLaunchAttributeValue::default()
799 };
800 unsafe { *attr.sharedMemCarveout.as_mut() = value };
801 (
802 GraphKernelNodeAttributeId::PreferredSharedMemoryCarveout,
803 attr,
804 )
805 }
806 };
807
808 unsafe {
809 try_ffi!(runtime::cudaGraphKernelNodeSetAttribute(
810 self.as_raw(),
811 id.into(),
812 &raw const value,
813 ))?;
814 }
815 Ok(())
816 }
817
818 /// Copies attributes from `src` to this node.
819 /// Both nodes must have the same context.
820 ///
821 /// # Errors
822 ///
823 /// Returns an error if CUDA rejects the attribute copy or if a previous asynchronous launch
824 /// reported an error.
825 pub fn copy_kernel_node_attributes(self, other: Self) -> Result<()> {
826 if let (Some(ctx), Some(other_ctx)) = (self.context(), other.context())
827 && ctx != other_ctx
828 {
829 return Err(Error::GraphContextMismatch);
830 }
831 self.bind_context()?;
832 other.bind_context()?;
833 unsafe {
834 try_ffi!(runtime::cudaGraphKernelNodeCopyAttributes(
835 self.as_raw(),
836 other.handle
837 ))?;
838 }
839 Ok(())
840 }
841
/// Returns the raw CUDA graph node handle wrapped by this token.
pub const fn as_raw(&self) -> runtime::cudaGraphNode_t {
    self.handle
}
845
846 pub(crate) fn graph_raw(&self) -> Option<runtime::cudaGraph_t> {
847 self.graph.as_ref().map(|graph| graph.handle)
848 }
849
850 pub fn context(&self) -> Option<&Context> {
851 self.ctx.as_deref()
852 }
853}
854
855impl MemoryAllocationNodeInfo {
856 pub const fn ptr(&self) -> DevicePtr {
857 self.ptr
858 }
859
860 pub fn context(&self) -> Option<&Context> {
861 self.ctx.as_deref()
862 }
863
864 fn from_raw_in_graph(
865 ptr: DevicePtr,
866 byte_size: usize,
867 graph_id: GraphId,
868 graph: Arc<GraphInner>,
869 ctx: Option<Arc<Context>>,
870 ) -> Self {
871 Self::from_raw(ptr, byte_size, Some(graph_id), Some(graph), ctx)
872 }
873
874 fn from_raw(
875 ptr: DevicePtr,
876 byte_size: usize,
877 graph_id: Option<GraphId>,
878 graph: Option<Arc<GraphInner>>,
879 ctx: Option<Arc<Context>>,
880 ) -> Self {
881 Self {
882 ptr,
883 byte_size,
884 graph_id,
885 _graph: graph,
886 ctx,
887 }
888 }
889}
890
891impl PartialEq for MemoryAllocationNodeInfo {
892 fn eq(&self, other: &Self) -> bool {
893 self.ptr == other.ptr
894 && self.byte_size == other.byte_size
895 && self.graph_id == other.graph_id
896 }
897}
898
899impl Eq for MemoryAllocationNodeInfo {}
900
901impl Hash for MemoryAllocationNodeInfo {
902 fn hash<H: Hasher>(&self, state: &mut H) {
903 self.ptr.hash(state);
904 self.byte_size.hash(state);
905 self.graph_id.hash(state);
906 }
907}
908
909impl GraphTopologySummary {
910 fn record_node_type(&mut self, node_type: GraphNodeType) {
911 match node_type {
912 GraphNodeType::Kernel => self.kernel_nodes += 1,
913 GraphNodeType::Memcpy => self.memcpy_nodes += 1,
914 GraphNodeType::Memset => self.memset_nodes += 1,
915 GraphNodeType::Host => self.host_nodes += 1,
916 GraphNodeType::Graph => self.child_graph_nodes += 1,
917 GraphNodeType::Empty => self.empty_nodes += 1,
918 GraphNodeType::WaitEvent => self.wait_event_nodes += 1,
919 GraphNodeType::EventRecord => self.event_record_nodes += 1,
920 GraphNodeType::ExternalSemaphoresSignal => {
921 self.external_semaphores_signal_nodes += 1;
922 }
923 GraphNodeType::ExternalSemaphoresWait => {
924 self.external_semaphores_wait_nodes += 1;
925 }
926 GraphNodeType::MemoryAlloc => self.memory_alloc_nodes += 1,
927 GraphNodeType::MemoryFree => self.memory_free_nodes += 1,
928 GraphNodeType::BatchMemOp => self.batch_mem_op_nodes += 1,
929 GraphNodeType::Conditional => self.conditional_nodes += 1,
930 }
931 }
932}
933
/// A CUDA graph.
///
/// NOTE(review): the methods of this type are outside this chunk; the field
/// notes below are based on the field types only.
#[derive(Debug)]
pub struct Graph {
    // Shared raw graph handle; nodes created from the graph hold clones of it.
    inner: Arc<GraphInner>,
    // Identity recorded on nodes so they can be matched to this graph.
    id: GraphId,
    // Context associated with the graph, if any.
    ctx: Option<Arc<Context>>,
    // Allocations kept alive on behalf of the graph (see `GraphBuffer`).
    retained: Vec<RetainedAllocation>,
}
941
/// Opaque identifier for a [`Graph`], compared when checking whether two
/// [`GraphNode`] values refer to the same node of the same graph.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GraphId(u64);
944
/// Wrapper around a shared raw graph handle.
///
/// NOTE(review): how this type is constructed and used is not visible in this
/// chunk.
#[derive(Debug)]
pub struct RawGraph {
    // Shared raw graph handle.
    inner: Arc<GraphInner>,
}
949
/// Raw CUDA graph handle shared between a graph and the tokens derived from it.
#[derive(Debug)]
struct GraphInner {
    // Raw CUDA graph handle.
    handle: runtime::cudaGraph_t,
    // Presumably controls whether the handle is destroyed on drop — TODO
    // confirm against the `Drop` impl, which is outside this chunk.
    owns_handle: bool,
}
955
// SAFETY: CUDA graph handles can be retained and destroyed from any host
// thread after binding the associated context. Mutating graph APIs require
// `&mut Graph`, so sharing `GraphInner` across threads does not by itself
// allow concurrent mutation through the safe API.
unsafe impl Send for GraphInner {}
unsafe impl Sync for GraphInner {}
960
/// Type-erased, reference-counted handle that keeps an allocation alive.
#[derive(Clone)]
struct RetainedAllocation(Arc<dyn Any + Send + Sync>);

impl fmt::Debug for RetainedAllocation {
    /// Prints only the current strong reference count, since the erased
    /// payload has no `Debug` bound.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let Self(allocation) = self;
        let strong_count = Arc::strong_count(allocation);
        let mut builder = f.debug_struct("RetainedAllocation");
        builder.field("strong_count", &strong_count);
        builder.finish()
    }
}
971
972#[derive(Debug)]
973pub struct BorrowedGraph<'node> {
974 graph: Graph,
975 _node: PhantomData<&'node GraphNode>,
976}
977
978/// Device memory whose allocation is retained by CUDA graph objects.
979///
980/// `GraphBuffer` values are created through [`Graph::create_buffer`],
981/// [`Graph::zeroes_buffer`], or [`Graph::buffer_from_slice`]. Graph and
982/// executable-graph APIs that accept `GraphBuffer` retain the underlying
983/// allocation so graph replay cannot outlive the device pointers baked into
984/// CUDA graph nodes.
985#[derive(Debug)]
986pub struct GraphBuffer<T: DeviceRepr> {
987 memory: Arc<DeviceMemory<T>>,
988 ctx: Option<Arc<Context>>,
989}
990
991impl<T> GraphBuffer<T>
992where
993 T: DeviceRepr + Send + Sync,
994{
995 fn from_memory(memory: DeviceMemory<T>, ctx: Option<Arc<Context>>) -> Self {
996 Self {
997 memory: Arc::new(memory),
998 ctx,
999 }
1000 }
1001
1002 fn retained(&self) -> RetainedAllocation {
1003 let memory: Arc<DeviceMemory<T>> = Arc::clone(&self.memory);
1004 RetainedAllocation(memory)
1005 }
1006
1007 pub fn len(&self) -> usize {
1008 self.memory.len()
1009 }
1010
1011 pub fn is_empty(&self) -> bool {
1012 self.memory.is_empty()
1013 }
1014
1015 pub fn byte_len(&self) -> usize {
1016 self.memory.byte_len()
1017 }
1018
1019 pub fn context(&self) -> Option<&Context> {
1020 self.ctx.as_deref()
1021 }
1022
1023 pub fn as_ptr(&self) -> *const T {
1024 self.memory.as_ptr()
1025 }
1026
1027 pub fn as_mut_ptr(&mut self) -> *mut T {
1028 self.memory.as_mut_ptr()
1029 }
1030
1031 /// Copies a host slice into this graph-retained device buffer.
1032 ///
1033 /// This updates the stable allocation used by graph-buffer node APIs. The
1034 /// caller is still responsible for ordering this copy against graph launches
1035 /// that read or write the same allocation.
1036 ///
1037 /// # Errors
1038 ///
1039 /// Returns an error if `host_slice` does not have the same length as this
1040 /// buffer or if CUDA rejects the copy.
1041 pub fn copy_from_host(&mut self, host_slice: &[T]) -> Result<()> {
1042 if let Some(ctx) = &self.ctx {
1043 ctx.bind()?;
1044 }
1045 if host_slice.len() != self.len() {
1046 return Err(Error::InvalidMemoryAccess);
1047 }
1048 if self.is_empty() {
1049 return Ok(());
1050 }
1051 unsafe {
1052 DeviceMemory::<T>::copy(
1053 self.as_mut_ptr(),
1054 host_slice.as_ptr(),
1055 self.len(),
1056 MemoryCopyKind::HostToDevice,
1057 )
1058 }
1059 }
1060
1061 /// Copies this graph-retained device buffer into a host slice.
1062 ///
1063 /// # Errors
1064 ///
1065 /// Returns an error if `host_slice` does not have the same length as this
1066 /// buffer or if CUDA rejects the copy.
1067 pub fn copy_to_host(&self, host_slice: &mut [T]) -> Result<()> {
1068 if let Some(ctx) = &self.ctx {
1069 ctx.bind()?;
1070 }
1071 if host_slice.len() != self.len() {
1072 return Err(Error::InvalidMemoryAccess);
1073 }
1074 if self.is_empty() {
1075 return Ok(());
1076 }
1077 unsafe {
1078 DeviceMemory::<T>::copy(
1079 host_slice.as_mut_ptr(),
1080 self.as_ptr(),
1081 self.len(),
1082 MemoryCopyKind::DeviceToHost,
1083 )
1084 }
1085 }
1086
1087 /// Copies another graph-retained buffer into this buffer.
1088 ///
1089 /// # Errors
1090 ///
1091 /// Returns an error if the buffers have different lengths or if CUDA
1092 /// rejects the copy.
1093 pub fn copy_from_buffer(&mut self, src: &Self) -> Result<()> {
1094 if let (Some(dst_ctx), Some(src_ctx)) = (&self.ctx, &src.ctx)
1095 && dst_ctx.as_ref() != src_ctx.as_ref()
1096 {
1097 return Err(Error::GraphContextMismatch);
1098 }
1099 if let Some(ctx) = &self.ctx {
1100 ctx.bind()?;
1101 }
1102 if src.len() != self.len() {
1103 return Err(Error::InvalidMemoryAccess);
1104 }
1105 if self.is_empty() {
1106 return Ok(());
1107 }
1108 unsafe {
1109 DeviceMemory::<T>::copy(
1110 self.as_mut_ptr(),
1111 src.as_ptr(),
1112 self.len(),
1113 MemoryCopyKind::DeviceToDevice,
1114 )
1115 }
1116 }
1117
1118 pub fn copy_to_host_vec(&self) -> Result<Vec<T>> {
1119 if let Some(ctx) = &self.ctx {
1120 ctx.bind()?;
1121 }
1122 if self.is_empty() {
1123 return Ok(Vec::new());
1124 }
1125
1126 let mut host = Vec::<T>::with_capacity(self.len());
1127 unsafe {
1128 DeviceMemory::<T>::copy(
1129 host.as_mut_ptr(),
1130 self.as_ptr(),
1131 self.len(),
1132 MemoryCopyKind::DeviceToHost,
1133 )?;
1134 host.set_len(self.len());
1135 }
1136 Ok(host)
1137 }
1138}
1139
1140impl RawGraph {
1141 /// Wraps an existing CUDA graph handle and takes ownership of it.
1142 ///
1143 /// # Safety
1144 ///
1145 /// `handle` must be a valid CUDA graph handle. Ownership of `handle` is
1146 /// transferred to the returned [`RawGraph`], and the handle must not be
1147 /// destroyed elsewhere after calling this function.
1148 pub unsafe fn from_raw(handle: runtime::cudaGraph_t) -> Result<Self> {
1149 if handle.is_null() {
1150 return Err(Error::NullHandle);
1151 }
1152
1153 Ok(Self {
1154 inner: Arc::new(GraphInner {
1155 handle,
1156 owns_handle: true,
1157 }),
1158 })
1159 }
1160
1161 /// Creates an empty raw graph without a Singe context association.
1162 ///
1163 /// Prefer [`Context::create_graph`] for ordinary Singe code. Raw graphs do
1164 /// not model context association, so the caller must keep CUDA context,
1165 /// node, executable update, upload, and launch relationships coherent.
1166 ///
1167 /// # Safety
1168 ///
1169 /// The returned graph has no modeled CUDA context association. The caller
1170 /// must ensure every node, kernel, memory operand, child graph, executable
1171 /// update, upload, and launch is used with the correct CUDA context.
1172 pub unsafe fn create() -> Result<Self> {
1173 let mut handle = ptr::null_mut();
1174 unsafe {
1175 try_ffi!(runtime::cudaGraphCreate(&raw mut handle, 0))?;
1176 }
1177 unsafe { Self::from_raw(handle) }
1178 }
1179
1180 pub fn as_raw(&self) -> runtime::cudaGraph_t {
1181 self.inner.handle
1182 }
1183
1184 /// Consumes the graph and returns the raw CUDA graph handle without
1185 /// destroying it.
1186 ///
1187 /// The caller becomes responsible for eventually destroying the returned
1188 /// handle with CUDA.
1189 pub fn into_raw(self) -> runtime::cudaGraph_t {
1190 let inner = Arc::try_unwrap(self.inner)
1191 .unwrap_or_else(|_| panic!("cannot take raw graph handle while it is still shared"));
1192 let inner = ManuallyDrop::new(inner);
1193 inner.handle
1194 }
1195}
1196
/// Source of fresh [`GraphId`] values; the first identifier handed out is `1`.
static NEXT_GRAPH_ID: AtomicU64 = AtomicU64::new(1);
1198
1199impl GraphId {
1200 pub fn generate() -> Self {
1201 Self(NEXT_GRAPH_ID.fetch_add(1, Ordering::Relaxed))
1202 }
1203
1204 pub fn as_u64(self) -> u64 {
1205 self.0
1206 }
1207}
1208
1209impl Display for GraphId {
1210 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1211 self.0.fmt(f)
1212 }
1213}
1214
1215impl Graph {
1216 fn bind_context(&self) -> Result<()> {
1217 if let Some(ctx) = &self.ctx {
1218 ctx.bind()?;
1219 }
1220 Ok(())
1221 }
1222
1223 /// Wraps an existing CUDA graph handle associated with `ctx` and takes
1224 /// ownership of it.
1225 ///
1226 /// # Safety
1227 ///
1228 /// `handle` must be a valid CUDA graph handle associated with `ctx`.
1229 /// Ownership of `handle` is transferred to the returned [`Graph`], and the
1230 /// handle must not be destroyed elsewhere after calling this function.
1231 pub unsafe fn from_raw_in_context(
1232 handle: runtime::cudaGraph_t,
1233 ctx: Arc<Context>,
1234 ) -> Result<Self> {
1235 if handle.is_null() {
1236 return Err(Error::NullHandle);
1237 }
1238
1239 Ok(Self {
1240 inner: Arc::new(GraphInner {
1241 handle,
1242 owns_handle: true,
1243 }),
1244 id: GraphId::generate(),
1245 ctx: Some(ctx),
1246 retained: Vec::new(),
1247 })
1248 }
1249
1250 unsafe fn from_raw_borrowed_in_context(
1251 handle: runtime::cudaGraph_t,
1252 ctx: Option<Arc<Context>>,
1253 ) -> Self {
1254 Self {
1255 inner: Arc::new(GraphInner {
1256 handle,
1257 owns_handle: false,
1258 }),
1259 id: GraphId::generate(),
1260 ctx,
1261 retained: Vec::new(),
1262 }
1263 }
1264
1265 pub(crate) fn create_in_context(ctx: Arc<Context>) -> Result<Self> {
1266 ctx.bind()?;
1267 let mut handle = ptr::null_mut();
1268 unsafe {
1269 try_ffi!(runtime::cudaGraphCreate(&raw mut handle, 0))?;
1270 }
1271 Ok(Self {
1272 inner: Arc::new(GraphInner {
1273 handle,
1274 owns_handle: true,
1275 }),
1276 id: GraphId::generate(),
1277 ctx: Some(ctx),
1278 retained: Vec::new(),
1279 })
1280 }
1281
1282 fn retain_buffer<T>(&mut self, buffer: &GraphBuffer<T>)
1283 where
1284 T: DeviceRepr + Send + Sync,
1285 {
1286 self.retained.push(buffer.retained());
1287 }
1288
1289 fn check_buffer_context<T>(&self, buffer: &GraphBuffer<T>) -> Result<()>
1290 where
1291 T: DeviceRepr + Send + Sync,
1292 {
1293 if let (Some(graph_ctx), Some(buffer_ctx)) = (&self.ctx, buffer.context())
1294 && graph_ctx.as_ref() != buffer_ctx
1295 {
1296 return Err(Error::GraphContextMismatch);
1297 }
1298 Ok(())
1299 }
1300
1301 fn check_buffer_contexts<T>(&self, dst: &GraphBuffer<T>, src: &GraphBuffer<T>) -> Result<()>
1302 where
1303 T: DeviceRepr + Send + Sync,
1304 {
1305 self.check_buffer_context(dst)?;
1306 self.check_buffer_context(src)?;
1307 Ok(())
1308 }
1309
1310 /// Allocates graph-retained device memory.
1311 ///
1312 /// The returned buffer can be used with graph-buffer node APIs. Any graph or
1313 /// executable graph that records the buffer retains the underlying device
1314 /// allocation for replay.
1315 ///
1316 /// # Errors
1317 ///
1318 /// Returns an error if CUDA cannot allocate device memory, the requested
1319 /// byte count overflows, or CUDA reports runtime initialization diagnostics.
1320 pub fn create_buffer<T>(&mut self, length: usize) -> Result<GraphBuffer<T>>
1321 where
1322 T: DeviceRepr + Send + Sync,
1323 {
1324 self.bind_context()?;
1325 let buffer = GraphBuffer::from_memory(DeviceMemory::create(length)?, self.ctx.clone());
1326 self.retain_buffer(&buffer);
1327 Ok(buffer)
1328 }
1329
1330 /// Allocates graph-retained device memory initialized to zero bytes.
1331 ///
1332 /// # Errors
1333 ///
1334 /// Returns an error if CUDA cannot allocate or initialize device memory, the
1335 /// requested byte count overflows, or CUDA reports runtime initialization
1336 /// diagnostics.
1337 pub fn zeroes_buffer<T>(&mut self, length: usize) -> Result<GraphBuffer<T>>
1338 where
1339 T: DeviceRepr + Send + Sync,
1340 {
1341 self.bind_context()?;
1342 let buffer = GraphBuffer::from_memory(DeviceMemory::zeroes(length)?, self.ctx.clone());
1343 self.retain_buffer(&buffer);
1344 Ok(buffer)
1345 }
1346
1347 /// Allocates graph-retained device memory initialized from a host slice.
1348 ///
1349 /// # Errors
1350 ///
1351 /// Returns an error if CUDA cannot allocate or copy device memory, the
1352 /// requested byte count overflows, or CUDA reports runtime initialization
1353 /// diagnostics.
1354 pub fn buffer_from_slice<T>(&mut self, values: &[T]) -> Result<GraphBuffer<T>>
1355 where
1356 T: DeviceRepr + Send + Sync,
1357 {
1358 self.bind_context()?;
1359 let buffer = GraphBuffer::from_memory(DeviceMemory::from_slice(values)?, self.ctx.clone());
1360 self.retain_buffer(&buffer);
1361 Ok(buffer)
1362 }
1363
1364 pub fn instantiate(&self) -> Result<ExecutableGraph> {
1365 self.instantiate_with_flags(GraphInstantiateFlags::empty())
1366 }
1367
1368 /// Instantiates graph as an executable graph.
1369 /// The graph is validated for any structural constraints or intra-node constraints which were not previously validated.
1370 /// If instantiation is successful, returns an instantiated executable graph.
1371 ///
1372 /// `flags` controls the behavior of instantiation and subsequent graph launches.
1373 /// Valid flags are:
1374 ///
1375 /// * [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`], which configures a graph containing memory allocation nodes to automatically free any unfreed memory allocations before
1376 /// the graph is relaunched.
1377 ///
1378 /// * [`GraphInstantiateFlags::DEVICE_LAUNCH`], which configures the graph for launch from the device.
1379 /// If this flag is passed, the executable graph handle returned can
1380 /// be used to launch the graph from both the host and device.
1381 /// This flag can only be used on platforms which support unified addressing.
1382 /// This flag cannot be used in conjunction with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`].
1383 ///
1384 /// * [`GraphInstantiateFlags::USE_NODE_PRIORITY`], which causes the graph to use the priorities from the per-node attributes rather than the priority of the launch stream
1385 /// during execution.
1386 /// Priorities are only available on kernel nodes and are copied from stream priority during stream capture.
1387 ///
1388 /// If the graph contains any allocation or free nodes, there can be at most one executable graph in existence for that graph at a time.
1389 /// An attempt to instantiate a second executable graph before dropping the first results in an error.
1390 /// The same also applies if the graph contains any device-updatable kernel nodes.
1391 ///
1392 /// If the graph contains kernels which call device-side [`ExecutableGraph::launch`] from multiple devices, this results in an error.
1393 ///
1394 /// Graphs instantiated for launch on the device have additional restrictions which do not apply to host graphs:
1395 ///
1396 /// * The graph's nodes must reside on a single device.
1397 /// * The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
1398 /// * The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
1399 /// Operation-specific restrictions are
1400 /// outlined below.
1401 /// * Kernel nodes:
1402 /// + Use of CUDA Dynamic Parallelism is not permitted.
1403 /// + Cooperative launches are permitted as long as MPS is not in use.
1404 /// * Memcpy nodes:
1405 /// + Only copies involving device memory and/or pinned device-mapped host memory are permitted.
1406 /// + Copies involving CUDA arrays are not permitted.
1407 /// + Both operands must be accessible from the current device, and the current device must match the device of other nodes in the
1408 /// graph.
1409 ///
1410 /// Graph objects are not threadsafe.
1411 ///
1412 /// # Errors
1413 ///
1414 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1415 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1416 /// not call CUDA functions; see [`Stream::add_callback`].
1417 pub fn instantiate_with_flags(&self, flags: GraphInstantiateFlags) -> Result<ExecutableGraph> {
1418 self.bind_context()?;
1419 let mut handle = ptr::null_mut();
1420 unsafe {
1421 try_ffi!(runtime::cudaGraphInstantiateWithFlags(
1422 &raw mut handle,
1423 self.as_raw(),
1424 flags.bits(),
1425 ))?;
1426 }
1427 unsafe {
1428 ExecutableGraph::from_raw_with_graph(
1429 handle,
1430 self.ctx.clone(),
1431 Some(self.id),
1432 Some(Arc::clone(&self.inner)),
1433 self.retained.clone(),
1434 )
1435 }
1436 }
1437
1438 /// Creates a copy of `original_graph`.
1439 /// All parameters are copied into the cloned graph.
1440 /// The original graph may be modified after this call without affecting the clone.
1441 ///
1442 /// Child graph nodes in the original graph are recursively copied into the clone.
1443 ///
1444 /// Cloning is not supported for graphs that contain memory allocation nodes, memory free nodes, or conditional nodes.
1445 ///
1446 /// Graph objects are not threadsafe.
1447 ///
1448 /// # Errors
1449 ///
1450 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1451 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1452 /// not call CUDA functions; see [`Stream::add_callback`].
1453 pub fn try_clone(&self) -> Result<Self> {
1454 self.bind_context()?;
1455 let mut handle = ptr::null_mut();
1456 unsafe {
1457 try_ffi!(runtime::cudaGraphClone(&raw mut handle, self.as_raw()))?;
1458 }
1459 Ok(Self {
1460 inner: Arc::new(GraphInner {
1461 handle,
1462 owns_handle: true,
1463 }),
1464 id: GraphId::generate(),
1465 ctx: self.ctx.clone(),
1466 retained: self.retained.clone(),
1467 })
1468 }
1469
1470 fn node_from_raw(&self, handle: runtime::cudaGraphNode_t) -> GraphNode {
1471 GraphNode::from_raw_in_graph(handle, self.id, Arc::clone(&self.inner), self.ctx.clone())
1472 }
1473
1474 pub(crate) fn check_node(&self, node: &GraphNode) -> Result<()> {
1475 self.bind_context()?;
1476 if !matches!(node.graph_id, Some(id) if id == self.id) {
1477 return Err(Error::GraphNodeMismatch);
1478 }
1479 Ok(())
1480 }
1481
1482 pub(crate) fn check_nodes(&self, nodes: &[GraphNode]) -> Result<()> {
1483 self.bind_context()?;
1484 for node in nodes {
1485 if !matches!(node.graph_id, Some(id) if id == self.id) {
1486 return Err(Error::GraphNodeMismatch);
1487 }
1488 }
1489 Ok(())
1490 }
1491
1492 fn check_child_graph_context(&self, child_graph: &Graph) -> Result<()> {
1493 if let (Some(parent_ctx), Some(child_ctx)) = (&self.ctx, &child_graph.ctx)
1494 && parent_ctx.as_ref() != child_ctx.as_ref()
1495 {
1496 return Err(Error::GraphContextMismatch);
1497 }
1498 Ok(())
1499 }
1500
1501 fn check_event_record_context(&self, event: &Event) -> Result<()> {
1502 if let Some(ctx) = &self.ctx
1503 && ctx.as_ref() != event.context()
1504 {
1505 return Err(Error::GraphContextMismatch);
1506 }
1507 Ok(())
1508 }
1509
1510 pub fn add_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
1511 self.add_dependencies(&[from], &[to])
1512 }
1513
1514 pub fn add_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
1515 self.add_dependencies_with_data(from, to, &[])
1516 }
1517
1518 /// Elements in `from` and `to` at corresponding indices define each dependency to add.
1519 /// Each node in `from` and `to` must belong to this graph.
1520 ///
1521 /// If `from` and `to` are empty, the call returns without modifying the graph.
1522 /// Specifying an existing dependency returns an error.
1523 ///
1524 /// Graph objects are not threadsafe.
1525 ///
1526 /// # Errors
1527 ///
1528 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1529 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1530 /// not call CUDA functions; see [`Stream::add_callback`].
1531 pub fn add_dependencies_with_data(
1532 &mut self,
1533 from: &[GraphNode],
1534 to: &[GraphNode],
1535 edge_data: &[GraphEdgeData],
1536 ) -> Result<()> {
1537 if from.len() != to.len() {
1538 return Err(Error::GraphDependencyMismatch);
1539 }
1540 if !edge_data.is_empty() && edge_data.len() != from.len() {
1541 return Err(Error::GraphDependencyMismatch);
1542 }
1543 if from.is_empty() {
1544 return Ok(());
1545 }
1546 self.check_nodes(from)?;
1547 self.check_nodes(to)?;
1548
1549 let from_raw: Vec<_> = from.iter().map(GraphNode::as_raw).collect();
1550 let to_raw: Vec<_> = to.iter().map(GraphNode::as_raw).collect();
1551 let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
1552 unsafe {
1553 try_ffi!(runtime::cudaGraphAddDependencies(
1554 self.as_raw(),
1555 from_raw.as_ptr(),
1556 to_raw.as_ptr(),
1557 if edge_data_raw.is_empty() {
1558 ptr::null()
1559 } else {
1560 edge_data_raw.as_ptr()
1561 },
1562 from_raw.len() as _,
1563 ))?;
1564 }
1565 Ok(())
1566 }
1567
1568 pub fn remove_dependency(&mut self, from: GraphNode, to: GraphNode) -> Result<()> {
1569 self.remove_dependencies(&[from], &[to])
1570 }
1571
1572 pub fn remove_dependencies(&mut self, from: &[GraphNode], to: &[GraphNode]) -> Result<()> {
1573 self.remove_dependencies_with_data(from, to, &[])
1574 }
1575
1576 /// Elements in `from` and `to` at corresponding indices define each dependency to remove.
1577 /// Each node in `from` and `to` must belong to this graph.
1578 ///
1579 /// If `from` and `to` are empty, the call returns without modifying the graph.
1580 /// Specifying an edge that does not exist in the graph, with data matching `edge_data`, results in an error.
1581 /// Passing an empty `edge_data` slice is equivalent to passing default edge data for each edge.
1582 ///
1583 /// Graph objects are not threadsafe.
1584 ///
1585 /// # Errors
1586 ///
1587 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1588 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1589 /// not call CUDA functions; see [`Stream::add_callback`].
1590 pub fn remove_dependencies_with_data(
1591 &mut self,
1592 from: &[GraphNode],
1593 to: &[GraphNode],
1594 edge_data: &[GraphEdgeData],
1595 ) -> Result<()> {
1596 if from.len() != to.len() {
1597 return Err(Error::GraphDependencyMismatch);
1598 }
1599 if !edge_data.is_empty() && edge_data.len() != from.len() {
1600 return Err(Error::GraphDependencyMismatch);
1601 }
1602 if from.is_empty() {
1603 return Ok(());
1604 }
1605 self.check_nodes(from)?;
1606 self.check_nodes(to)?;
1607
1608 let from_raw: Vec<_> = from.iter().map(GraphNode::as_raw).collect();
1609 let to_raw: Vec<_> = to.iter().map(GraphNode::as_raw).collect();
1610 let edge_data_raw: Vec<_> = edge_data.iter().copied().map(Into::into).collect();
1611 unsafe {
1612 try_ffi!(runtime::cudaGraphRemoveDependencies(
1613 self.as_raw(),
1614 from_raw.as_ptr(),
1615 to_raw.as_ptr(),
1616 if edge_data_raw.is_empty() {
1617 ptr::null()
1618 } else {
1619 edge_data_raw.as_ptr()
1620 },
1621 from_raw.len() as _,
1622 ))?;
1623 }
1624 Ok(())
1625 }
1626
1627 pub fn add_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
1628 if edges.is_empty() {
1629 return Ok(());
1630 }
1631
1632 let from: Vec<_> = edges.iter().map(|edge| edge.from.clone()).collect();
1633 let to: Vec<_> = edges.iter().map(|edge| edge.to.clone()).collect();
1634 let data: Vec<_> = edges.iter().map(|edge| edge.data).collect();
1635 self.add_dependencies_with_data(&from, &to, &data)
1636 }
1637
1638 pub fn remove_edges(&mut self, edges: &[GraphEdge]) -> Result<()> {
1639 if edges.is_empty() {
1640 return Ok(());
1641 }
1642
1643 let from: Vec<_> = edges.iter().map(|edge| edge.from.clone()).collect();
1644 let to: Vec<_> = edges.iter().map(|edge| edge.to.clone()).collect();
1645 let data: Vec<_> = edges.iter().map(|edge| edge.data).collect();
1646 self.remove_dependencies_with_data(&from, &to, &data)
1647 }
1648
1649 /// Creates a node that performs no operation and adds it to the graph with the given dependencies.
1650 /// The dependency list may be empty, in which case the node is placed at the
1651 /// graph root. It may not contain duplicate entries.
1652 ///
1653 /// An empty node performs no operation during execution, but can be used for transitive ordering.
1654 /// For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2\*n dependency edges, rather than no empty node and n^2 dependency edges.
1655 ///
1656 /// Graph objects are not threadsafe.
1657 ///
1658 /// # Errors
1659 ///
1660 /// Returns an error if CUDA rejects the graph operation or reports runtime initialization
1661 /// diagnostics. Callbacks must not call CUDA functions; see [`Stream::add_callback`].
1662 pub fn add_empty_node(&mut self, dependencies: &[GraphNode]) -> Result<GraphNode> {
1663 self.check_nodes(dependencies)?;
1664 let mut handle = ptr::null_mut();
1665 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1666 unsafe {
1667 try_ffi!(runtime::cudaGraphAddEmptyNode(
1668 &raw mut handle,
1669 self.as_raw(),
1670 dependencies_raw.as_ptr(),
1671 dependencies_raw.len() as _,
1672 ))?;
1673 Ok(self.node_from_raw(handle))
1674 }
1675 }
1676
1677 /// Creates an event record node and adds it to the graph with the given dependencies and event.
1678 /// The dependency list may be empty, in which case the node is placed at the
1679 /// graph root. It may not contain duplicate entries.
1680 ///
1681 /// Each graph launch records `event` to capture execution of the node's dependencies.
1682 ///
1683 /// These nodes may not be used in loops or conditionals.
1684 ///
1685 /// Graph objects are not threadsafe.
1686 ///
1687 /// # Errors
1688 ///
1689 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1690 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1691 /// not call CUDA functions; see [`Stream::add_callback`].
1692 pub fn add_event_record_node(
1693 &mut self,
1694 dependencies: &[GraphNode],
1695 event: &Event,
1696 ) -> Result<GraphNode> {
1697 self.check_nodes(dependencies)?;
1698 self.check_event_record_context(event)?;
1699 let mut handle = ptr::null_mut();
1700 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1701 unsafe {
1702 try_ffi!(runtime::cudaGraphAddEventRecordNode(
1703 &raw mut handle,
1704 self.as_raw(),
1705 dependencies_raw.as_ptr(),
1706 dependencies_raw.len() as _,
1707 event.as_raw(),
1708 ))?;
1709 Ok(self.node_from_raw(handle))
1710 }
1711 }
1712
1713 /// Creates an event wait node and adds it to the graph with the given dependencies and event.
1714 /// The dependency list may be empty, in which case the node is placed at the
1715 /// graph root. It may not contain duplicate entries.
1716 ///
1717 /// The graph node waits for all work captured in `event`.
1718 /// See [`sys::cuEventRecord`](singe_cuda_sys::driver::cuEventRecord) for details on what is captured by an event.
1719 /// Synchronization is performed efficiently on the device when applicable.
1720 /// `event` may come from a different context or device than the launch stream.
1721 ///
1722 /// These nodes may not be used in loops or conditionals.
1723 ///
1724 /// Graph objects are not threadsafe.
1725 ///
1726 /// # Errors
1727 ///
1728 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1729 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1730 /// not call CUDA functions; see [`Stream::add_callback`].
1731 pub fn add_event_wait_node(
1732 &mut self,
1733 dependencies: &[GraphNode],
1734 event: &Event,
1735 ) -> Result<GraphNode> {
1736 self.check_nodes(dependencies)?;
1737 let mut handle = ptr::null_mut();
1738 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1739 unsafe {
1740 try_ffi!(runtime::cudaGraphAddEventWaitNode(
1741 &raw mut handle,
1742 self.as_raw(),
1743 dependencies_raw.as_ptr(),
1744 dependencies_raw.len() as _,
1745 event.as_raw(),
1746 ))?;
1747 Ok(self.node_from_raw(handle))
1748 }
1749 }
1750
1751 /// Creates a CPU execution node and adds it to the graph with the given dependencies and host-node parameters.
1752 /// The dependency list may be empty, in which case the node is placed at the
1753 /// graph root. It may not contain duplicate entries.
1754 ///
1755 /// When the graph is launched, the node invokes the specified CPU function.
1756 /// Host nodes are not supported under MPS with pre-Volta GPUs.
1757 ///
1758 /// Graph objects are not threadsafe.
1759 ///
1760 /// # Safety
1761 ///
1762 /// CUDA stores the raw callback function and user-data pointer in the graph
1763 /// node for later replay. The caller must ensure `params` remains valid
1764 /// according to [`HostNodeParams::new`] for every graph instantiation and
1765 /// launch that can execute this node.
1766 ///
1767 /// # Errors
1768 ///
1769 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1770 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1771 /// not call CUDA functions; see [`Stream::add_callback`].
1772 pub unsafe fn add_host_node(
1773 &mut self,
1774 dependencies: &[GraphNode],
1775 params: &HostNodeParams,
1776 ) -> Result<GraphNode> {
1777 self.check_nodes(dependencies)?;
1778 let mut handle = ptr::null_mut();
1779 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1780 let params = params.into();
1781 unsafe {
1782 try_ffi!(runtime::cudaGraphAddHostNode(
1783 &raw mut handle,
1784 self.as_raw(),
1785 dependencies_raw.as_ptr(),
1786 dependencies_raw.len() as _,
1787 &raw const params,
1788 ))?;
1789 Ok(self.node_from_raw(handle))
1790 }
1791 }
1792
1793 /// Creates a kernel execution node and adds it to the graph with the given dependencies, launch configuration, and kernel parameters.
1794 /// The dependency list may be empty, in which case the node is placed at the
1795 /// graph root. It may not contain duplicate entries.
1796 ///
1797 /// When the graph is launched, the node invokes the kernel on the grid and blocks specified by [`LaunchConfig`].
1798 /// [`LaunchConfig::shared_memory_bytes`](crate::module::LaunchConfig::shared_memory_bytes) sets the amount of dynamic shared memory available to each thread block.
1799 /// Kernel parameters are passed with [`KernelParameters`](crate::module::KernelParameters) or tuples of shared or mutable references.
1800 ///
1801 /// Kernels launched using graphs must not use texture and surface references.
1802 /// Reading or writing through any texture or surface reference is undefined behavior.
1803 /// This restriction does not apply to texture and surface objects.
1804 ///
1805 /// Runtime kernel handles queried via [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) may be used.
1806 /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must be registered with the same CUDA Runtime instance.
1807 /// Passing a symbol that belongs to a different runtime instance results in undefined behavior.
1808 ///
1809 /// Graph objects are not threadsafe.
1810 ///
1811 /// # Safety
1812 ///
1813 /// CUDA copies the kernel argument values during this call and stores those
1814 /// copied values in the graph node for later replay. If an argument value is
1815 /// itself a pointer, only the pointer address is copied. The caller must
1816 /// ensure every copied pointer value remains valid for every graph
1817 /// instantiation, update, and launch that can execute this node. Mutable
1818 /// pointer arguments must also remain exclusive for the work ordered by
1819 /// those launches.
1820 ///
1821 /// # Errors
1822 ///
1823 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1824 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1825 /// not call CUDA functions; see [`Stream::add_callback`].
1826 pub unsafe fn add_kernel_node<'a, P>(
1827 &mut self,
1828 dependencies: &[GraphNode],
1829 function: DeviceFunction,
1830 config: &LaunchConfig,
1831 params: P,
1832 ) -> Result<GraphNode>
1833 where
1834 P: KernelLaunchArgs<'a>,
1835 {
1836 self.check_nodes(dependencies)?;
1837 let mut handle = ptr::null_mut();
1838 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1839 params.with_encoded_arguments(|mut arguments| unsafe {
1840 let params = runtime::cudaKernelNodeParams {
1841 func: function.as_raw().cast(),
1842 gridDim: config.grid_dim().into(),
1843 blockDim: config.block_dim().into(),
1844 sharedMemBytes: config.shared_memory_bytes_u32(),
1845 kernelParams: arguments.as_mut_ptr().cast(),
1846 extra: ptr::null_mut(),
1847 };
1848 try_ffi!(runtime::cudaGraphAddKernelNode(
1849 &raw mut handle,
1850 self.as_raw(),
1851 dependencies_raw.as_ptr(),
1852 dependencies_raw.len() as _,
1853 &raw const params,
1854 ))?;
1855 Ok(self.node_from_raw(handle))
1856 })
1857 }
1858
1859 /// Creates a new 1D memcpy node and adds it to the graph with the given dependencies.
1860 /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
1861 ///
1862 /// When the graph is launched, the node copies `count` bytes from `src` to `dst`.
1863 /// The transfer direction is described by [`MemoryCopyKind`].
1864 /// [`MemoryCopyKind::Default`] is recommended when unified virtual addressing is available, in which case the transfer direction is inferred from the pointer values.
1865 /// Launching a memcpy node with `dst` and `src` pointers that do not match the direction of the copy results in undefined behavior.
1866 ///
1867 /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
1868 ///
1869 /// Graph objects are not threadsafe.
1870 ///
1871 /// # Safety
1872 ///
1873 /// CUDA stores the raw source and destination addresses in the graph node
1874 /// for later replay. The caller must ensure `params` remains valid
1875 /// according to [`Memcpy1DNodeParams::new`] for every graph instantiation
1876 /// and launch that can execute this node.
1877 ///
1878 /// # Errors
1879 ///
1880 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
1881 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
1882 /// not call CUDA functions; see [`Stream::add_callback`].
1883 pub unsafe fn add_memory_copy_node_1d(
1884 &mut self,
1885 dependencies: &[GraphNode],
1886 params: &MemoryCopy1DNodeParams,
1887 ) -> Result<GraphNode> {
1888 self.check_nodes(dependencies)?;
1889 let mut handle = ptr::null_mut();
1890 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
1891 unsafe {
1892 try_ffi!(runtime::cudaGraphAddMemcpyNode1D(
1893 &raw mut handle,
1894 self.as_raw(),
1895 dependencies_raw.as_ptr(),
1896 dependencies_raw.len() as _,
1897 params.dst().cast(),
1898 params.src().cast(),
1899 params.count() as _,
1900 params.kind().into(),
1901 ))?;
1902 Ok(self.node_from_raw(handle))
1903 }
1904 }
1905
1906 /// Creates a device-to-device memcpy node from typed byte buffers.
1907 ///
1908 /// The node copies `src.byte_len()` bytes. `dst` must have at least that
1909 /// many bytes.
1910 ///
1911 /// # Safety
1912 ///
1913 /// CUDA stores the raw source and destination addresses in the graph node
1914 /// for later replay. The caller must ensure `dst` and `src` remain valid
1915 /// for every graph instantiation and launch that can execute this node.
1916 /// `dst` must not be accessed through another mutable path while graph
1917 /// launches using this node can write it.
1918 ///
1919 /// # Errors
1920 ///
1921 /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
1922 /// operation, if a previous asynchronous launch reported an error, or if CUDA
1923 /// reports runtime initialization diagnostics.
1924 pub unsafe fn add_memory_copy_node_1d_device_to_device<D, S>(
1925 &mut self,
1926 dependencies: &[GraphNode],
1927 dst: &mut D,
1928 src: &S,
1929 ) -> Result<GraphNode>
1930 where
1931 D: ByteBufferMut + ?Sized,
1932 S: ByteBuffer + ?Sized,
1933 {
1934 let count = src.byte_len();
1935 if dst.byte_len() < count {
1936 return Err(Error::InvalidMemoryAccess);
1937 }
1938 let params = unsafe {
1939 MemoryCopy1DNodeParams::new(
1940 dst.as_byte_mut_ptr().cast(),
1941 src.as_byte_ptr().cast(),
1942 count,
1943 MemoryCopyKind::DeviceToDevice,
1944 )
1945 };
1946 unsafe { self.add_memory_copy_node_1d(dependencies, ¶ms) }
1947 }
1948
1949 /// Creates a device-to-device memcpy node between graph-retained buffers.
1950 ///
1951 /// The node copies `src.byte_len()` bytes. `dst` must have at least that
1952 /// many bytes. The graph retains both allocations so the baked CUDA graph
1953 /// pointers remain live for future instantiation and replay.
1954 ///
1955 /// # Errors
1956 ///
1957 /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
1958 /// operation, if a previous asynchronous launch reported an error, or if CUDA
1959 /// reports runtime initialization diagnostics.
1960 pub fn add_buffer_memory_copy_node_1d_device_to_device<T>(
1961 &mut self,
1962 dependencies: &[GraphNode],
1963 dst: &mut GraphBuffer<T>,
1964 src: &GraphBuffer<T>,
1965 ) -> Result<GraphNode>
1966 where
1967 T: DeviceRepr + Send + Sync,
1968 {
1969 self.check_buffer_contexts(dst, src)?;
1970 let count = src.byte_len();
1971 if dst.byte_len() < count {
1972 return Err(Error::InvalidMemoryAccess);
1973 }
1974 let params = unsafe {
1975 MemoryCopy1DNodeParams::new(
1976 dst.as_mut_ptr().cast(),
1977 src.as_ptr().cast(),
1978 count,
1979 MemoryCopyKind::DeviceToDevice,
1980 )
1981 };
1982 let node = unsafe { self.add_memory_copy_node_1d(dependencies, ¶ms)? };
1983 self.retain_buffer(dst);
1984 self.retain_buffer(src);
1985 Ok(node)
1986 }
1987
1988 /// Creates a memcpy node and adds it to the graph with the given dependencies.
1989 /// The dependency list may be empty, in which case the node is placed at the
1990 /// graph root. It may not contain duplicate entries.
1991 ///
1992 /// When the graph is launched, the node performs the memcpy described by `params`.
1993 /// See [`sys::cudaMemcpy3D`](singe_cuda_sys::runtime::cudaMemcpy3D) for a description of the structure and its restrictions.
1994 ///
1995 /// Memcpy nodes have additional restrictions for managed memory if any device in the system does not support concurrent managed access.
1996 ///
1997 /// Graph objects are not threadsafe.
1998 ///
1999 /// # Safety
2000 ///
2001 /// CUDA stores the raw source and destination addresses in the graph node
2002 /// for later replay. The caller must ensure `params` remains valid
2003 /// according to [`Memcpy3DNodeParams`] for every graph instantiation and
2004 /// launch that can execute this node.
2005 ///
2006 /// # Errors
2007 ///
2008 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2009 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2010 /// not call CUDA functions; see [`Stream::add_callback`].
2011 pub unsafe fn add_memory_copy_node(
2012 &mut self,
2013 dependencies: &[GraphNode],
2014 params: &MemoryCopy3DNodeParams,
2015 ) -> Result<GraphNode> {
2016 self.check_nodes(dependencies)?;
2017 let mut handle = ptr::null_mut();
2018 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2019 let params = params.into();
2020 unsafe {
2021 try_ffi!(runtime::cudaGraphAddMemcpyNode(
2022 &raw mut handle,
2023 self.as_raw(),
2024 dependencies_raw.as_ptr(),
2025 dependencies_raw.len() as _,
2026 &raw const params,
2027 ))?;
2028 Ok(self.node_from_raw(handle))
2029 }
2030 }
2031
2032 /// # Safety
2033 ///
2034 /// CUDA stores the raw symbol and source pointer in the graph node for
2035 /// later replay. The caller must ensure `params` remains valid according to
2036 /// [`MemcpyToSymbolNodeParams::new`] for every graph instantiation and
2037 /// launch that can execute this node.
2038 pub unsafe fn add_memory_copy_node_to_symbol(
2039 &mut self,
2040 dependencies: &[GraphNode],
2041 params: &MemoryCopyToSymbolNodeParams,
2042 ) -> Result<GraphNode> {
2043 self.check_nodes(dependencies)?;
2044 let mut handle = ptr::null_mut();
2045 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2046 unsafe {
2047 try_ffi!(runtime::cudaGraphAddMemcpyNodeToSymbol(
2048 &raw mut handle,
2049 self.as_raw(),
2050 dependencies_raw.as_ptr(),
2051 dependencies_raw.len() as _,
2052 params.symbol().cast(),
2053 params.src().cast(),
2054 params.count() as _,
2055 params.offset() as _,
2056 params.kind().into(),
2057 ))?;
2058 Ok(self.node_from_raw(handle))
2059 }
2060 }
2061
2062 /// # Safety
2063 ///
2064 /// CUDA stores the raw destination and symbol pointer in the graph node for
2065 /// later replay. The caller must ensure `params` remains valid according to
2066 /// [`MemoryCopyFromSymbolNodeParams::new`] for every graph instantiation and
2067 /// launch that can execute this node.
2068 pub unsafe fn add_memory_copy_node_from_symbol(
2069 &mut self,
2070 dependencies: &[GraphNode],
2071 params: &MemoryCopyFromSymbolNodeParams,
2072 ) -> Result<GraphNode> {
2073 self.check_nodes(dependencies)?;
2074 let mut handle = ptr::null_mut();
2075 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2076 unsafe {
2077 try_ffi!(runtime::cudaGraphAddMemcpyNodeFromSymbol(
2078 &raw mut handle,
2079 self.as_raw(),
2080 dependencies_raw.as_ptr(),
2081 dependencies_raw.len() as _,
2082 params.dst().cast(),
2083 params.symbol().cast(),
2084 params.count() as _,
2085 params.offset() as _,
2086 params.kind().into(),
2087 ))?;
2088 Ok(self.node_from_raw(handle))
2089 }
2090 }
2091
2092 /// Creates a new memset node and adds it to the graph with the given dependencies.
2093 /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2094 ///
2095 /// The element size must be 1, 2, or 4 bytes.
2096 /// When the graph is launched, the node performs the memset described by `params`.
2097 ///
2098 /// Graph objects are not threadsafe.
2099 ///
2100 /// # Safety
2101 ///
2102 /// CUDA stores the destination address in the graph node for later replay.
2103 /// The caller must ensure `params` remains valid according to
2104 /// [`MemorySetNodeParams::new`] for every graph instantiation and launch that
2105 /// can execute this node.
2106 ///
2107 /// # Errors
2108 ///
2109 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2110 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2111 /// not call CUDA functions; see [`Stream::add_callback`].
2112 pub unsafe fn add_memory_set_node(
2113 &mut self,
2114 dependencies: &[GraphNode],
2115 params: &MemorySetNodeParams,
2116 ) -> Result<GraphNode> {
2117 self.check_nodes(dependencies)?;
2118 let mut handle = ptr::null_mut();
2119 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2120 let params = params.into();
2121 unsafe {
2122 try_ffi!(runtime::cudaGraphAddMemsetNode(
2123 &raw mut handle,
2124 self.as_raw(),
2125 dependencies_raw.as_ptr(),
2126 dependencies_raw.len() as _,
2127 &raw const params,
2128 ))?;
2129 Ok(self.node_from_raw(handle))
2130 }
2131 }
2132
2133 /// Creates a new node which executes an embedded graph, and adds it to the graph with the given dependencies.
2134 /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2135 ///
2136 /// If `child_graph` contains allocation nodes, free nodes, or conditional nodes, this call returns an error.
2137 ///
2138 /// The node executes an embedded child graph.
2139 /// The child graph is cloned in this call.
2140 ///
2141 /// Graph objects are not threadsafe.
2142 ///
2143 /// # Errors
2144 ///
2145 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2146 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2147 /// not call CUDA functions; see [`Stream::add_callback`].
2148 pub fn add_child_graph_node(
2149 &mut self,
2150 dependencies: &[GraphNode],
2151 child_graph: &Self,
2152 ) -> Result<GraphNode> {
2153 self.check_nodes(dependencies)?;
2154 self.check_child_graph_context(child_graph)?;
2155 let mut handle = ptr::null_mut();
2156 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2157 unsafe {
2158 try_ffi!(runtime::cudaGraphAddChildGraphNode(
2159 &raw mut handle,
2160 self.as_raw(),
2161 dependencies_raw.as_ptr(),
2162 dependencies_raw.len() as _,
2163 child_graph.as_raw(),
2164 ))?;
2165 Ok(self.node_from_raw(handle))
2166 }
2167 }
2168
2169 /// Creates a new memory free node for a graph allocation and adds it to the graph.
2170 /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2171 ///
2172 /// [`Graph::add_mem_free_node`] returns [`crate::error::Status::InvalidValue`] if the caller attempts to free:
2173 ///
2174 /// * an allocation twice in the same graph.
2175 /// * an address that was not returned by an allocation node.
2176 /// * an invalid address.
2177 ///
2178 /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
2179 ///
2180 /// * Nodes and edges of the graph cannot be deleted.
2181 /// * The graph can only be used in a child node if the ownership is moved to the parent.
2182 /// * Only one instantiation of the graph may exist at any point in time.
2183 /// * The graph cannot be cloned.
2184 ///
2185 /// Graph objects are not threadsafe.
2186 ///
2187 /// # Errors
2188 ///
2189 /// Returns [`Error::GraphNodeMismatch`] if `allocation` did not come from this
2190 /// graph. Returns an error if CUDA rejects the graph operation or if a
2191 /// previous asynchronous launch reported an error.
2192 pub fn add_memory_free_node(
2193 &mut self,
2194 dependencies: &[GraphNode],
2195 allocation: &MemoryAllocationNodeInfo,
2196 ) -> Result<GraphNode> {
2197 if allocation.graph_id != Some(self.id) {
2198 return Err(Error::GraphNodeMismatch);
2199 }
2200 unsafe { self.add_memory_free_node_raw(dependencies, allocation.ptr) }
2201 }
2202
2203 /// Creates a new memory free node from a raw device address.
2204 ///
2205 /// # Safety
2206 ///
2207 /// CUDA stores the raw address in the graph. The caller must ensure `ptr`
2208 /// is a graph allocation that may be freed by this graph, is ordered after
2209 /// the allocation node, and is not freed more than once or by another graph
2210 /// in a way that violates CUDA graph allocation ownership rules.
2211 pub unsafe fn add_memory_free_node_raw(
2212 &mut self,
2213 dependencies: &[GraphNode],
2214 ptr: DevicePtr,
2215 ) -> Result<GraphNode> {
2216 self.check_nodes(dependencies)?;
2217 let mut handle = ptr::null_mut();
2218 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2219 unsafe {
2220 try_ffi!(runtime::cudaGraphAddMemFreeNode(
2221 &raw mut handle,
2222 self.as_raw(),
2223 dependencies_raw.as_ptr(),
2224 dependencies_raw.len() as _,
2225 ptr.as_ptr() as _,
2226 ))?;
2227 Ok(self.node_from_raw(handle))
2228 }
2229 }
2230
2231 /// Creates a new allocation node and adds it to the graph with the given dependencies and allocation parameters.
2232 /// The dependency list may be empty, in which case the node is placed at the root of the graph, and it may not contain duplicate entries.
2233 ///
2234 /// When [`Graph::add_mem_alloc_node`] creates an allocation node, it returns the allocation metadata in [`MemoryAllocationNodeInfo`].
2235 /// The allocation's address remains fixed across instantiations and launches.
2236 ///
2237 /// If the allocation is freed in the same graph, by creating a free node using [`Graph::add_mem_free_node`], the allocation can be accessed by nodes ordered after the allocation node but before the free node.
2238 /// These allocations cannot be freed outside the owning graph, and they can only be freed once in the owning graph.
2239 ///
2240 /// If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the graph which are ordered after the allocation node, but also by stream operations ordered after the graph's execution but before the allocation is freed.
2241 ///
2242 /// Allocations which are not freed in the same graph can be freed by:
2243 ///
2244 /// * passing the allocation to [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) or [`DeviceMemory::free`](crate::memory::DeviceMemory::free);
2245 /// * launching a graph with a free node for that allocation; or
2246 /// * specifying [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`] during instantiation, which makes each launch behave as though it called [`DeviceMemory::free_async`](crate::memory::DeviceMemory::free_async) for every unfreed allocation.
2247 ///
2248 /// It is not possible to free an allocation in both the owning graph and another graph.
2249 /// If the allocation is freed in the same graph, a free node cannot be added to another graph.
2250 /// If the allocation is freed in another graph, a free node can no longer be added to the owning graph.
2251 ///
2252 /// The following restrictions apply to graphs which contain allocation and/or memory free nodes:
2253 ///
2254 /// * Nodes and edges of the graph cannot be deleted.
2255 /// * The graph can only be used in a child node if the ownership is moved to the parent.
2256 /// * Only one instantiation of the graph may exist at any point in time.
2257 /// * The graph cannot be cloned.
2258 ///
2259 /// Graph objects are not threadsafe.
2260 ///
2261 /// # Errors
2262 ///
2263 /// Returns an error if CUDA rejects the graph operation or if a previous asynchronous
2264 /// launch reported an error.
2265 pub fn add_memory_allocation_node(
2266 &mut self,
2267 dependencies: &[GraphNode],
2268 params: &MemoryAllocationNodeParams<'_>,
2269 ) -> Result<(GraphNode, MemoryAllocationNodeInfo)> {
2270 self.check_nodes(dependencies)?;
2271 let mut handle = ptr::null_mut();
2272 let dependencies_raw: Vec<_> = dependencies.iter().map(GraphNode::as_raw).collect();
2273 let access_descs: Vec<_> = params
2274 .access_descs
2275 .iter()
2276 .copied()
2277 .map(Into::into)
2278 .collect();
2279 let mut params_raw = runtime::cudaMemAllocNodeParams {
2280 poolProps: params.pool_props.into(),
2281 accessDescs: access_descs.as_ptr(),
2282 accessDescCount: access_descs.len() as _,
2283 bytesize: params.byte_size as _,
2284 dptr: 0,
2285 };
2286 unsafe {
2287 try_ffi!(runtime::cudaGraphAddMemAllocNode(
2288 &raw mut handle,
2289 self.as_raw(),
2290 dependencies_raw.as_ptr(),
2291 dependencies_raw.len() as _,
2292 &raw mut params_raw,
2293 ))?;
2294 // TODO: verify dptr?
2295 let node = self.node_from_raw(handle);
2296 let allocation = MemoryAllocationNodeInfo::from_raw_in_graph(
2297 DevicePtr::new(params_raw.dptr as *mut ()),
2298 params.byte_size,
2299 self.id,
2300 Arc::clone(&self.inner),
2301 self.ctx.clone(),
2302 );
2303 Ok((node, allocation))
2304 }
2305 }
2306
2307 /// Returns this graph's nodes.
2308 ///
2309 /// Graph objects are not threadsafe.
2310 ///
2311 /// # Errors
2312 ///
2313 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2314 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2315 /// not call CUDA functions; see [`Stream::add_callback`].
2316 pub fn nodes(&self) -> Result<Vec<GraphNode>> {
2317 unsafe {
2318 let mut count = 0;
2319 try_ffi!(runtime::cudaGraphGetNodes(
2320 self.as_raw(),
2321 ptr::null_mut(),
2322 &raw mut count,
2323 ))?;
2324
2325 if count == 0 {
2326 return Ok(Vec::new());
2327 }
2328
2329 let mut handles = Vec::with_capacity(count as usize);
2330 try_ffi!(runtime::cudaGraphGetNodes(
2331 self.as_raw(),
2332 handles.as_mut_ptr(),
2333 &raw mut count,
2334 ))?;
2335 handles.set_len(count as usize);
2336
2337 Ok(handles
2338 .into_iter()
2339 .map(|handle| self.node_from_raw(handle))
2340 .collect())
2341 }
2342 }
2343
2344 /// Returns this graph's root nodes.
2345 ///
2346 /// Graph objects are not threadsafe.
2347 ///
2348 /// # Errors
2349 ///
2350 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2351 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2352 /// not call CUDA functions; see [`Stream::add_callback`].
2353 pub fn root_nodes(&self) -> Result<Vec<GraphNode>> {
2354 unsafe {
2355 let mut count = 0;
2356 try_ffi!(runtime::cudaGraphGetRootNodes(
2357 self.as_raw(),
2358 ptr::null_mut(),
2359 &raw mut count,
2360 ))?;
2361
2362 if count == 0 {
2363 return Ok(Vec::new());
2364 }
2365
2366 let mut handles = Vec::with_capacity(count as usize);
2367 try_ffi!(runtime::cudaGraphGetRootNodes(
2368 self.as_raw(),
2369 handles.as_mut_ptr(),
2370 &raw mut count,
2371 ))?;
2372 handles.set_len(count as usize);
2373
2374 Ok(handles
2375 .into_iter()
2376 .map(|handle| self.node_from_raw(handle))
2377 .collect())
2378 }
2379 }
2380
2381 /// Returns this graph's dependency edges.
2382 ///
2383 /// Graph objects are not threadsafe.
2384 ///
2385 /// # Errors
2386 ///
2387 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2388 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2389 /// not call CUDA functions; see [`Stream::add_callback`].
2390 pub fn edges(&self) -> Result<Vec<GraphEdge>> {
2391 unsafe {
2392 let mut count = 0;
2393 try_ffi!(runtime::cudaGraphGetEdges(
2394 self.as_raw(),
2395 ptr::null_mut(),
2396 ptr::null_mut(),
2397 ptr::null_mut(),
2398 &raw mut count,
2399 ))?;
2400
2401 if count == 0 {
2402 return Ok(Vec::new());
2403 }
2404
2405 let len = count as usize;
2406 let mut from = Vec::with_capacity(len);
2407 let mut to = Vec::with_capacity(len);
2408 let mut edge_data = Vec::with_capacity(len);
2409 try_ffi!(runtime::cudaGraphGetEdges(
2410 self.as_raw(),
2411 from.as_mut_ptr(),
2412 to.as_mut_ptr(),
2413 edge_data.as_mut_ptr(),
2414 &raw mut count,
2415 ))?;
2416 let len = count as usize;
2417 from.set_len(len);
2418 to.set_len(len);
2419 edge_data.set_len(len);
2420
2421 Ok(from
2422 .into_iter()
2423 .zip(to)
2424 .zip(edge_data)
2425 .map(|((from, to), data)| GraphEdge {
2426 from: self.node_from_raw(from),
2427 to: self.node_from_raw(to),
2428 data: data.into(),
2429 })
2430 .collect())
2431 }
2432 }
2433
2434 /// Returns a compact summary of this graph's native CUDA topology.
2435 ///
2436 /// The summary is computed from CUDA graph introspection APIs and counts
2437 /// node kinds, root nodes, and dependency edges in this graph. Child graph
2438 /// nodes are counted as child nodes here; callers that need recursive
2439 /// details can query the child graph returned by [`GraphNode::child_graph`].
2440 ///
2441 /// Graph objects are not threadsafe.
2442 ///
2443 /// # Errors
2444 ///
2445 /// Returns an error if CUDA rejects a topology query, if a previous
2446 /// asynchronous launch reported an error, or if CUDA reports runtime
2447 /// initialization diagnostics.
2448 pub fn topology_summary(&self) -> Result<GraphTopologySummary> {
2449 let nodes = self.nodes()?;
2450 let mut summary = GraphTopologySummary {
2451 nodes: nodes.len(),
2452 root_nodes: self.root_nodes()?.len(),
2453 edges: self.edges()?.len(),
2454 ..GraphTopologySummary::default()
2455 };
2456 for node in nodes {
2457 summary.record_node_type(node.node_type()?);
2458 }
2459 Ok(summary)
2460 }
2461
2462 /// Writes a DOT-formatted description of the graph to `path`.
2463 /// By default this includes the graph topology, node types, node ID, kernel names, and memcpy direction.
2464 /// `flags` can request more detailed information about each node type, such as parameter values, kernel attributes, node handles, and function handles.
2465 ///
2466 /// # Errors
2467 ///
2468 /// Returns an error if `path` contains an interior NUL byte or if CUDA
2469 /// Runtime cannot write the DOT file.
2470 pub fn write_dot(&self, path: &str, flags: GraphDebugDotFlags) -> Result<()> {
2471 let path = CString::new(path)?;
2472 unsafe {
2473 try_ffi!(runtime::cudaGraphDebugDotPrint(
2474 self.as_raw(),
2475 path.as_ptr(),
2476 flags.bits(),
2477 ))?;
2478 }
2479 Ok(())
2480 }
2481
2482 pub fn as_raw(&self) -> runtime::cudaGraph_t {
2483 self.inner.handle
2484 }
2485
2486 pub fn context(&self) -> Option<&Context> {
2487 self.ctx.as_deref()
2488 }
2489
2490 /// Consumes the graph and returns the raw CUDA graph handle without
2491 /// destroying it.
2492 ///
2493 /// The caller becomes responsible for eventually destroying the returned
2494 /// handle with CUDA.
2495 pub fn into_raw(self) -> runtime::cudaGraph_t {
2496 let inner = Arc::try_unwrap(self.inner)
2497 .unwrap_or_else(|_| panic!("cannot take raw graph handle while it is still shared"));
2498 let inner = ManuallyDrop::new(inner);
2499 inner.handle
2500 }
2501}
2502
2503impl Drop for GraphInner {
2504 fn drop(&mut self) {
2505 if !self.owns_handle {
2506 return;
2507 }
2508 unsafe {
2509 if let Err(err) = try_ffi!(runtime::cudaGraphDestroy(self.handle)) {
2510 #[cfg(debug_assertions)]
2511 eprintln!("failed to destroy cuda graph: {err}");
2512 }
2513 }
2514 }
2515}
2516
2517impl<'graph> BorrowedGraph<'graph> {
2518 /// Wraps an existing CUDA graph handle without taking ownership.
2519 ///
2520 /// # Safety
2521 ///
2522 /// `handle` must be a valid CUDA graph handle for the returned lifetime.
2523 /// The returned graph view will not destroy `handle` when dropped.
2524 pub unsafe fn from_raw(handle: runtime::cudaGraph_t) -> Result<Self> {
2525 unsafe { Self::from_raw_in_context(handle, None) }
2526 }
2527
2528 /// Wraps an existing CUDA graph handle without taking ownership and keeps a
2529 /// modeled context association for safe graph operations through the
2530 /// borrowed view.
2531 ///
2532 /// # Safety
2533 ///
2534 /// `handle` must be a valid CUDA graph handle for the returned lifetime,
2535 /// and it must be associated with `ctx` when `ctx` is present. The returned
2536 /// graph view will not destroy `handle` when dropped.
2537 pub unsafe fn from_raw_in_context(
2538 handle: runtime::cudaGraph_t,
2539 ctx: Option<Arc<Context>>,
2540 ) -> Result<Self> {
2541 if handle.is_null() {
2542 return Err(Error::NullHandle);
2543 }
2544
2545 Ok(Self {
2546 graph: unsafe { Graph::from_raw_borrowed_in_context(handle, ctx) },
2547 _node: PhantomData,
2548 })
2549 }
2550
2551 pub const fn as_graph(&self) -> &Graph {
2552 &self.graph
2553 }
2554
2555 pub fn as_raw(&self) -> runtime::cudaGraph_t {
2556 self.graph.as_raw()
2557 }
2558}
2559
2560impl Deref for BorrowedGraph<'_> {
2561 type Target = Graph;
2562
2563 fn deref(&self) -> &Self::Target {
2564 self.as_graph()
2565 }
2566}
2567
/// An executable CUDA graph (`cudaGraphExec_t`) together with the state that
/// this wrapper tracks alongside the raw handle.
#[derive(Debug)]
pub struct ExecutableGraph {
    /// Raw CUDA executable graph handle.
    handle: runtime::cudaGraphExec_t,
    /// Context bound before CUDA calls and compared against the stream's
    /// context on launch and upload; `None` skips both steps.
    ctx: Option<Arc<Context>>,
    /// Id of the source graph; nodes must carry the same id to pass
    /// `check_node`.
    source_graph_id: Option<GraphId>,
    /// Shared reference to the source graph's inner state, held but never
    /// read. NOTE(review): presumably this keeps the source graph alive while
    /// this executable graph exists; confirm against the instantiation code.
    _source_graph: Option<Arc<GraphInner>>,
    /// Retained allocations, extended by `retain_buffer`.
    retained: Vec<RetainedAllocation>,
}
2576
/// A copyable operation object that borrows an [`ExecutableGraph`]; it is
/// returned by [`ExecutableGraph::launch_operation`].
#[derive(Debug, Clone, Copy)]
pub struct ExecutableGraphLaunchOperation<'graph> {
    /// The executable graph this operation refers to.
    graph: &'graph ExecutableGraph,
}
2581
/// An owned raw CUDA executable graph handle with no context or source graph
/// tracking. The handle is destroyed when this value is dropped.
#[derive(Debug)]
pub struct RawExecutableGraph {
    /// Raw CUDA executable graph handle owned by this value.
    handle: runtime::cudaGraphExec_t,
}
2586
2587impl RawExecutableGraph {
2588 /// Wraps an existing CUDA executable graph handle and takes ownership of it.
2589 ///
2590 /// # Safety
2591 ///
2592 /// `handle` must be a valid CUDA executable graph handle.
2593 /// Ownership of `handle` is transferred to the returned [`RawExecutableGraph`], and the handle must not be destroyed elsewhere after calling this function.
2594 pub unsafe fn from_raw(handle: runtime::cudaGraphExec_t) -> Result<Self> {
2595 if handle.is_null() {
2596 return Err(Error::NullHandle);
2597 }
2598
2599 Ok(Self { handle })
2600 }
2601
2602 pub const fn as_raw(&self) -> runtime::cudaGraphExec_t {
2603 self.handle
2604 }
2605
2606 /// Consumes the executable graph and returns the raw CUDA executable graph
2607 /// handle without destroying it.
2608 ///
2609 /// The caller becomes responsible for eventually destroying the returned
2610 /// handle with CUDA.
2611 pub fn into_raw(self) -> runtime::cudaGraphExec_t {
2612 let graph = ManuallyDrop::new(self);
2613 graph.as_raw()
2614 }
2615}
2616
2617impl Drop for RawExecutableGraph {
2618 fn drop(&mut self) {
2619 unsafe {
2620 if let Err(err) = try_ffi!(runtime::cudaGraphExecDestroy(self.handle)) {
2621 #[cfg(debug_assertions)]
2622 eprintln!("failed to destroy cuda graph exec: {err}");
2623 }
2624 }
2625 }
2626}
2627
2628impl ExecutableGraph {
2629 fn bind_context(&self) -> Result<()> {
2630 if let Some(ctx) = &self.ctx {
2631 ctx.bind()?;
2632 }
2633 Ok(())
2634 }
2635
2636 unsafe fn from_raw_with_graph(
2637 handle: runtime::cudaGraphExec_t,
2638 ctx: Option<Arc<Context>>,
2639 source_graph_id: Option<GraphId>,
2640 source_graph: Option<Arc<GraphInner>>,
2641 retained: Vec<RetainedAllocation>,
2642 ) -> Result<Self> {
2643 if handle.is_null() {
2644 return Err(Error::NullHandle);
2645 }
2646
2647 Ok(Self {
2648 handle,
2649 ctx,
2650 source_graph_id,
2651 _source_graph: source_graph,
2652 retained,
2653 })
2654 }
2655
2656 fn check_node(&self, node: &GraphNode) -> Result<()> {
2657 self.bind_context()?;
2658 if !matches!((self.source_graph_id, node.graph_id), (Some(source_id), Some(node_id)) if node_id == source_id)
2659 {
2660 return Err(Error::GraphNodeMismatch);
2661 }
2662 Ok(())
2663 }
2664
2665 fn retain_buffer<T>(&mut self, buffer: &GraphBuffer<T>)
2666 where
2667 T: DeviceRepr + Send + Sync,
2668 {
2669 self.retained.push(buffer.retained());
2670 }
2671
2672 /// Returns the flags that were passed to instantiation for the given executable graph.
2673 /// [`GraphInstantiateFlags::UPLOAD`] is not returned because it does not affect the resulting executable graph.
2674 ///
2675 /// Graph objects are not threadsafe.
2676 ///
2677 /// # Errors
2678 ///
2679 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2680 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2681 /// not call CUDA functions; see [`Stream::add_callback`].
2682 pub fn flags(&self) -> Result<GraphInstantiateFlags> {
2683 self.bind_context()?;
2684 let mut flags = 0;
2685 unsafe {
2686 try_ffi!(runtime::cudaGraphExecGetFlags(
2687 self.as_raw(),
2688 &raw mut flags
2689 ))?;
2690 }
2691 Ok(GraphInstantiateFlags::from_bits_retain(flags))
2692 }
2693
2694 /// Executes this executable graph in `stream`.
2695 /// Only one instance of this executable graph may be executing at a time.
2696 /// Each launch is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
2697 /// To execute a graph concurrently, it must be instantiated multiple times into multiple executable graphs.
2698 ///
2699 /// If any allocations created by this executable graph remain unfreed from a previous launch and the graph was not instantiated with [`GraphInstantiateFlags::AUTO_FREE_ON_LAUNCH`], the launch fails with [`crate::error::Status::InvalidValue`].
2700 ///
2701 /// Graph objects are not threadsafe.
2702 ///
2703 /// # Errors
2704 ///
2705 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2706 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2707 /// not call CUDA functions; see [`Stream::add_callback`].
2708 pub fn launch(&self, stream: &Stream) -> Result<()> {
2709 if let Some(ctx) = &self.ctx
2710 && stream.context() != ctx.as_ref()
2711 {
2712 return Err(Error::StreamContextMismatch);
2713 }
2714 self.bind_context()?;
2715 unsafe {
2716 try_ffi!(runtime::cudaGraphLaunch(self.as_raw(), stream.as_raw()))?;
2717 }
2718 Ok(())
2719 }
2720
2721 /// Returns a reusable operation object that launches this executable graph.
2722 pub const fn launch_operation(&self) -> ExecutableGraphLaunchOperation<'_> {
2723 ExecutableGraphLaunchOperation { graph: self }
2724 }
2725
2726 /// Uploads this executable graph to the device in `stream` without executing it.
2727 /// Uploads of the same executable graph are serialized.
2728 /// Each upload is ordered behind both any previous work in `stream` and any previous launches of this executable graph.
2729 /// Uses memory cached by `stream` to back the allocations owned by this executable graph.
2730 ///
2731 /// # Errors
2732 ///
2733 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2734 /// reported an error, or if CUDA reports runtime initialization diagnostics.
2735 pub fn upload(&self, stream: &Stream) -> Result<()> {
2736 if let Some(ctx) = &self.ctx
2737 && stream.context() != ctx.as_ref()
2738 {
2739 return Err(Error::StreamContextMismatch);
2740 }
2741 self.bind_context()?;
2742 unsafe {
2743 try_ffi!(runtime::cudaGraphUpload(self.as_raw(), stream.as_raw()))?;
2744 }
2745 Ok(())
2746 }
2747
2748 /// Updates this executable graph with the node parameters in a topologically identical `graph`.
2749 ///
2750 /// Limitations:
2751 ///
2752 /// * Kernel nodes:
2753 /// + The owning context of the kernel function cannot change.
2754 /// + A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP.
2755 /// + A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
2756 /// update calls.
2757 /// + A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
2758 /// + If the graph was instantiated with [`GraphInstantiateFlags::USE_NODE_PRIORITY`], the priority attribute cannot change.
2759 /// Equality
2760 /// is checked on the originally requested priority values, before they are clamped to the device's supported range.
2761 /// + If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
2762 /// If no such calls were
2763 /// present at instantiation, these updates cannot be performed at all.
2764 /// + Neither the source graph nor this executable graph may contain device-updatable kernel nodes.
2765 /// * Memset and memcpy nodes:
2766 /// + The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
2767 /// + The source/destination memory must be allocated from the same contexts as the original source/destination memory.
2768 /// + For 2D memsets, only address and assigned value may be updated.
2769 /// + For 1D memsets, updating dimensions is also allowed, but may fail if the resulting operation does not map onto the work resources
2770 /// already allocated for the node.
2771 /// * Additional memcpy node restrictions:
2772 /// + Changing either the source or destination memory type, such as [`MemoryType::Device`](crate::types::MemoryType::Device) or [`MemoryType::Array`](crate::types::MemoryType::Array), is not supported.
2773 /// * Conditional nodes:
2774 /// + Changing node parameters is not supported.
2775 /// + Changing parameters of nodes within the conditional body graph is subject to the rules above.
2776 /// + Conditional handle flags and default values are updated as part of the graph update.
2777 ///
2778 /// CUDA may add further restrictions in future releases.
2779 /// [`ExecutableGraph::update`] sets the update result to [`GraphExecUpdateResult::ErrorTopologyChanged`] under the following conditions:
2780 ///
2781 /// * The count of nodes directly in the executable graph and the source graph differ.
2782 /// * The source graph has more exit nodes.
2783 /// * A node in the source graph has a different number of dependencies than the paired node from the executable graph.
2784 /// * A node in the source graph has a dependency that does not match the corresponding dependency of the paired node from the executable graph.
2785 /// The dependencies are paired based on edge order and
2786 /// a dependency does not match when the nodes are already paired based on other edges examined in the graph.
2787 ///
2788 /// [`ExecutableGraph::update`] sets the update result to:
2789 ///
2790 /// * [`GraphExecUpdateResult::Error`] if passed an invalid value.
2791 /// * [`GraphExecUpdateResult::ErrorTopologyChanged`] if the graph topology changed.
2792 /// * [`GraphExecUpdateResult::ErrorNodeTypeChanged`] if the type of a node changed.
2793 /// * [`GraphExecUpdateResult::ErrorFunctionChanged`] if the kernel function of a node changed (CUDA driver before 11.2).
2794 /// * [`GraphExecUpdateResult::ErrorUnsupportedFunctionChange`] if the kernel function changed in an unsupported way.
2795 /// * [`GraphExecUpdateResult::ErrorParametersChanged`] if any parameters to a node changed in a way that is not supported.
2796 /// * [`GraphExecUpdateResult::ErrorAttributesChanged`] if any attributes of a node changed in a way that is not supported.
2797 /// * [`GraphExecUpdateResult::ErrorNotSupported`] if something about a node is unsupported, like the node's type or configuration.
2798 ///
2799 /// If the update fails for a reason not listed above, the result is [`GraphExecUpdateResult::Error`].
2800 /// If the update succeeds, the result is [`GraphExecUpdateResult::Success`].
2801 ///
2802 /// [`ExecutableGraph::update`] succeeds when the update was performed successfully.
2803 /// It returns [`crate::error::Status::GraphExecUpdateFailure`] if the graph update was not performed because it included changes which violated constraints specific to instantiated graph update.
2804 ///
2805 /// Graph objects are not threadsafe.
2806 ///
2807 /// # Errors
2808 ///
2809 /// Returns an error if CUDA rejects the graph update, if the update violates instantiated graph
2810 /// update constraints, or if a previous asynchronous launch reported an error. CUDA may also
2811 /// return initialization-related errors such as [`crate::error::Status::NotInitialized`],
2812 /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`] if this call initializes
2813 /// internal runtime state. Callbacks must not call CUDA functions; see
2814 /// [`Stream::add_callback`].
2815 pub fn update(&mut self, graph: &Graph) -> Result<ExecutableGraphUpdate> {
2816 if let (Some(exec_ctx), Some(graph_ctx)) = (&self.ctx, &graph.ctx)
2817 && exec_ctx.as_ref() != graph_ctx.as_ref()
2818 {
2819 return Err(Error::GraphContextMismatch);
2820 }
2821 self.bind_context()?;
2822 let mut result_info = runtime::cudaGraphExecUpdateResultInfo::default();
2823 unsafe {
2824 try_ffi!(runtime::cudaGraphExecUpdate(
2825 self.as_raw(),
2826 graph.as_raw(),
2827 &raw mut result_info,
2828 ))?;
2829 }
2830 self.retained.extend(graph.retained.iter().cloned());
2831 Ok(ExecutableGraphUpdate::from_result_info(result_info, graph))
2832 }
2833
2834 /// Sets the parameters of a kernel node in this executable graph.
2835 /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
2836 ///
2837 /// `node` must not have been removed from the original graph.
2838 /// All node parameters may change, but the following restrictions apply to function updates:
2839 ///
2840 /// * The owning device of the kernel function cannot change.
/// * A node whose kernel function originally did not use CUDA dynamic parallelism cannot be updated to a kernel function that uses CDP.
2842 /// * A node whose kernel function originally did not make device-side update calls cannot be updated to a kernel function that makes device-side
2843 /// update calls.
2844 /// * If this executable graph was not instantiated for device launch, a node whose kernel function originally did not use device-side [`ExecutableGraph::launch`] cannot be updated to a kernel function that uses device-side [`ExecutableGraph::launch`] unless the node resides on the same device as nodes which contained such calls at instantiate-time.
2845 /// If no such calls were
2846 /// present at instantiation, these updates cannot be performed at all.
2847 ///
2848 /// The modifications only affect future launches of this executable graph.
2849 /// Already enqueued or running launches of this executable graph are not affected by this call.
2850 /// The original `node` is also not modified by this call.
2851 ///
2852 /// If `node` is a device-updatable kernel node, the next upload or launch of this executable graph will overwrite any previous device-side updates.
2853 /// Additionally, applying host updates to a device-updatable kernel node while it is being updated from the device results in undefined behavior.
2854 /// This can also be used with a runtime kernel handle queried through [`sys::cudaLibraryGetKernel`](singe_cuda_sys::runtime::cudaLibraryGetKernel) or [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) and then passed as a raw pointer.
2855 /// The symbol passed to [`sys::cudaGetKernel`](singe_cuda_sys::runtime::cudaGetKernel) must be registered with the same CUDA Runtime instance.
2856 /// Passing a symbol that belongs to a different runtime instance results in undefined behavior.
2857 /// The only type that can be reliably passed to a different runtime instance is the runtime kernel handle type itself.
2858 ///
2859 /// Graph objects are not threadsafe.
2860 ///
2861 /// # Safety
2862 ///
2863 /// CUDA copies the kernel argument values during this call and stores those
2864 /// copied values in the executable graph for future launches. If an
2865 /// argument value is itself a pointer, only the pointer address is copied.
2866 /// The caller must ensure every copied pointer value remains valid for
2867 /// every future launch that can execute this node. Mutable pointer
2868 /// arguments must also remain exclusive for the work ordered by those
2869 /// launches.
2870 ///
2871 /// # Errors
2872 ///
2873 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2874 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2875 /// not call CUDA functions; see [`Stream::add_callback`].
2876 pub unsafe fn set_kernel_node_params<'a, P>(
2877 &mut self,
2878 node: GraphNode,
2879 function: DeviceFunction,
2880 config: &LaunchConfig,
2881 params: P,
2882 ) -> Result<()>
2883 where
2884 P: KernelLaunchArgs<'a>,
2885 {
2886 self.check_node(&node)?;
2887 params.with_encoded_arguments(|mut arguments| unsafe {
2888 let params = runtime::cudaKernelNodeParams {
2889 func: function.as_raw().cast(),
2890 gridDim: config.grid_dim().into(),
2891 blockDim: config.block_dim().into(),
2892 sharedMemBytes: config.shared_memory_bytes_u32(),
2893 kernelParams: arguments.as_mut_ptr().cast(),
2894 extra: ptr::null_mut(),
2895 };
2896 try_ffi!(runtime::cudaGraphExecKernelNodeSetParams(
2897 self.as_raw(),
2898 node.as_raw(),
2899 &raw const params,
2900 ))?;
2901 Ok(())
2902 })
2903 }
2904
2905 /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
2906 /// `node` must remain in the graph which was used to instantiate this executable graph.
2907 /// Changed edges to and from `node` are ignored.
2908 ///
2909 /// The source and destination must be allocated from the same contexts as the original source and destination memory.
2910 /// The instantiation-time memory operands must be 1-dimensional.
2911 /// Zero-length operations are not supported.
2912 ///
2913 /// The modifications only affect future launches of this executable graph.
2914 /// Already enqueued or running launches of this executable graph are not affected by this call.
2915 /// The original `node` is also not modified by this call.
2916 ///
2917 /// Returns [`crate::error::Status::InvalidValue`] if the memory operands' mappings changed or the original memory operands are multidimensional.
2918 ///
2919 /// Graph objects are not threadsafe.
2920 ///
2921 /// # Safety
2922 ///
2923 /// CUDA stores the raw source and destination addresses in the executable
2924 /// graph for future launches. The caller must ensure `params` remains
/// valid according to [`MemoryCopy1DNodeParams::new`] for every future launch
2926 /// that can execute this node.
2927 ///
2928 /// # Errors
2929 ///
2930 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
2931 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
2932 /// not call CUDA functions; see [`Stream::add_callback`].
2933 pub unsafe fn set_memory_copy_node_1d_params(
2934 &mut self,
2935 node: GraphNode,
2936 params: &MemoryCopy1DNodeParams,
2937 ) -> Result<()> {
2938 self.check_node(&node)?;
2939 unsafe {
2940 try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams1D(
2941 self.as_raw(),
2942 node.as_raw(),
2943 params.dst().cast(),
2944 params.src().cast(),
2945 params.count() as _,
2946 params.kind().into(),
2947 ))?;
2948 }
2949 Ok(())
2950 }
2951
2952 /// Updates a memcpy node to copy between typed device byte buffers.
2953 ///
2954 /// The node copies `src.byte_len()` bytes. `dst` must have at least that
2955 /// many bytes.
2956 ///
2957 /// # Safety
2958 ///
2959 /// CUDA stores the raw source and destination addresses in the executable
2960 /// graph for future launches. The caller must ensure `dst` and `src`
2961 /// remain valid for every future launch that can execute this node. `dst`
2962 /// must not be accessed through another mutable path while graph launches
2963 /// using this node can write it.
2964 ///
2965 /// # Errors
2966 ///
2967 /// Returns an error if `dst` is smaller than `src`, if CUDA rejects the graph
2968 /// operation, if a previous asynchronous launch reported an error, or if CUDA
2969 /// reports runtime initialization diagnostics.
2970 pub unsafe fn set_memory_copy_node_1d_device_to_device<D, S>(
2971 &mut self,
2972 node: GraphNode,
2973 dst: &mut D,
2974 src: &S,
2975 ) -> Result<()>
2976 where
2977 D: ByteBufferMut + ?Sized,
2978 S: ByteBuffer + ?Sized,
2979 {
2980 let count = src.byte_len();
2981 if dst.byte_len() < count {
2982 return Err(Error::InvalidMemoryAccess);
2983 }
2984 let params = unsafe {
2985 MemoryCopy1DNodeParams::new(
2986 dst.as_byte_mut_ptr().cast(),
2987 src.as_byte_ptr().cast(),
2988 count,
2989 MemoryCopyKind::DeviceToDevice,
2990 )
2991 };
2992 unsafe { self.set_memory_copy_node_1d_params(node, ¶ms) }
2993 }
2994
2995 /// Updates a memcpy node to copy between graph-retained buffers.
2996 ///
2997 /// The node copies `src.byte_len()` bytes. `dst` must have at least that
2998 /// many bytes. The executable graph retains both allocations so future
2999 /// launches cannot outlive the baked CUDA pointer values.
3000 ///
3001 /// # Errors
3002 ///
3003 /// Returns an error if `dst` is smaller than `src`, if `node` does not
3004 /// belong to the graph used to instantiate this executable graph, if CUDA
3005 /// rejects the graph update, if a previous asynchronous launch reported an
3006 /// error, or if CUDA reports runtime initialization diagnostics.
3007 pub fn set_buffer_memory_copy_node_1d_device_to_device<T>(
3008 &mut self,
3009 node: GraphNode,
3010 dst: &mut GraphBuffer<T>,
3011 src: &GraphBuffer<T>,
3012 ) -> Result<()>
3013 where
3014 T: DeviceRepr + Send + Sync,
3015 {
3016 if let (Some(exec_ctx), Some(dst_ctx)) = (&self.ctx, dst.context())
3017 && exec_ctx.as_ref() != dst_ctx
3018 {
3019 return Err(Error::GraphContextMismatch);
3020 }
3021 if let (Some(exec_ctx), Some(src_ctx)) = (&self.ctx, src.context())
3022 && exec_ctx.as_ref() != src_ctx
3023 {
3024 return Err(Error::GraphContextMismatch);
3025 }
3026 let count = src.byte_len();
3027 if dst.byte_len() < count {
3028 return Err(Error::InvalidMemoryAccess);
3029 }
3030 let params = unsafe {
3031 MemoryCopy1DNodeParams::new(
3032 dst.as_mut_ptr().cast(),
3033 src.as_ptr().cast(),
3034 count,
3035 MemoryCopyKind::DeviceToDevice,
3036 )
3037 };
3038 unsafe { self.set_memory_copy_node_1d_params(node, ¶ms)? };
3039 self.retain_buffer(dst);
3040 self.retain_buffer(src);
3041 Ok(())
3042 }
3043
3044 /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3045 /// `node` must remain in the graph which was used to instantiate this executable graph.
3046 /// Changed edges to and from `node` are ignored.
3047 ///
3048 /// The source and destination memory in `params` must be allocated from the same contexts as the original source and destination memory.
3049 /// Both the instantiation-time memory operands and the memory operands in `params` must be 1-dimensional.
3050 /// Zero-length operations are not supported.
3051 ///
3052 /// The modifications only affect future launches of this executable graph.
3053 /// Already enqueued or running launches of this executable graph are not affected by this call.
3054 /// The original `node` is also not modified by this call.
3055 ///
3056 /// Returns [`crate::error::Status::InvalidValue`] if the memory operands' mappings changed or either the original or new memory operands are multidimensional.
3057 ///
3058 /// Graph objects are not threadsafe.
3059 ///
3060 /// # Safety
3061 ///
3062 /// CUDA stores the raw source and destination addresses in the executable
3063 /// graph for future launches. The caller must ensure `params` remains
3064 /// valid according to [`MemoryCopy3DNodeParams`] for every future launch that
3065 /// can execute this node.
3066 ///
3067 /// # Errors
3068 ///
3069 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3070 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3071 /// not call CUDA functions; see [`Stream::add_callback`].
3072 pub unsafe fn set_memory_copy_node_params(
3073 &mut self,
3074 node: GraphNode,
3075 params: &MemoryCopy3DNodeParams,
3076 ) -> Result<()> {
3077 self.check_node(&node)?;
3078 let params = params.into();
3079 unsafe {
3080 try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParams(
3081 self.as_raw(),
3082 node.as_raw(),
3083 &raw const params,
3084 ))?;
3085 }
3086 Ok(())
3087 }
3088
3089 /// # Safety
3090 ///
3091 /// CUDA stores the raw symbol and source pointer in the executable graph
3092 /// for future launches. The caller must ensure `params` remains valid
3093 /// according to [`MemoryCopyToSymbolNodeParams::new`] for every future launch
3094 /// that can execute this node.
3095 pub unsafe fn set_memory_copy_node_to_symbol_params(
3096 &mut self,
3097 node: GraphNode,
3098 params: &MemoryCopyToSymbolNodeParams,
3099 ) -> Result<()> {
3100 self.check_node(&node)?;
3101 unsafe {
3102 try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsToSymbol(
3103 self.as_raw(),
3104 node.as_raw(),
3105 params.symbol().cast(),
3106 params.src().cast(),
3107 params.count() as _,
3108 params.offset() as _,
3109 params.kind().into(),
3110 ))?;
3111 }
3112 Ok(())
3113 }
3114
3115 /// # Safety
3116 ///
3117 /// CUDA stores the raw destination and symbol pointer in the executable
3118 /// graph for future launches. The caller must ensure `params` remains
3119 /// valid according to [`MemoryCopyFromSymbolNodeParams::new`] for every future
3120 /// launch that can execute this node.
3121 pub unsafe fn set_memory_copy_node_from_symbol_params(
3122 &mut self,
3123 node: GraphNode,
3124 params: &MemoryCopyFromSymbolNodeParams,
3125 ) -> Result<()> {
3126 self.check_node(&node)?;
3127 unsafe {
3128 try_ffi!(runtime::cudaGraphExecMemcpyNodeSetParamsFromSymbol(
3129 self.as_raw(),
3130 node.as_raw(),
3131 params.dst().cast(),
3132 params.symbol().cast(),
3133 params.count() as _,
3134 params.offset() as _,
3135 params.kind().into(),
3136 ))?;
3137 }
3138 Ok(())
3139 }
3140
3141 /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3142 /// `node` must remain in the graph which was used to instantiate this executable graph.
3143 /// Changed edges to and from `node` are ignored.
3144 ///
3145 /// Zero-sized operations are not supported.
3146 ///
3147 /// The new destination pointer in `params` must be to the same kind of allocation as the original destination pointer and have the same context association and device mapping as the original destination pointer.
3148 ///
3149 /// Both the value and pointer address may be updated.
3150 /// Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
3151 /// Specifically, for 2D memsets, all dimension changes are rejected.
3152 /// For 1D memsets, changes in height are explicitly rejected and other changes are opportunistically allowed if the resulting work maps onto the work resources already allocated for the node.
3153 ///
3154 /// The modifications only affect future launches of this executable graph.
3155 /// Already enqueued or running launches of this executable graph are not affected by this call.
3156 /// The original `node` is also not modified by this call.
3157 ///
3158 /// Graph objects are not threadsafe.
3159 ///
3160 /// # Safety
3161 ///
3162 /// CUDA stores the raw destination address in the executable graph for
3163 /// future launches. The caller must ensure `params` remains valid according
3164 /// to [`MemorySetNodeParams::new`] for every future launch that can execute
3165 /// this node.
3166 ///
3167 /// # Errors
3168 ///
3169 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3170 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3171 /// not call CUDA functions; see [`Stream::add_callback`].
3172 pub unsafe fn set_memory_set_node_params(
3173 &mut self,
3174 node: GraphNode,
3175 params: &MemorySetNodeParams,
3176 ) -> Result<()> {
3177 self.check_node(&node)?;
3178 let params = params.into();
3179 unsafe {
3180 try_ffi!(runtime::cudaGraphExecMemsetNodeSetParams(
3181 self.as_raw(),
3182 node.as_raw(),
3183 &raw const params,
3184 ))?;
3185 }
3186 Ok(())
3187 }
3188
3189 /// Updates the work represented by `node` in this executable graph as though `node` had contained the given `params` at instantiation.
3190 /// `node` must remain in the graph which was used to instantiate this executable graph.
3191 /// Changed edges to and from `node` are ignored.
3192 ///
3193 /// The modifications only affect future launches of this executable graph.
3194 /// Already enqueued or running launches of this executable graph are not affected by this call.
3195 /// The original `node` is also not modified by this call.
3196 ///
3197 /// Graph objects are not threadsafe.
3198 ///
3199 /// # Safety
3200 ///
3201 /// CUDA stores the raw callback function and user-data pointer in the
3202 /// executable graph for future launches. The caller must ensure `params`
3203 /// remains valid according to [`HostNodeParams::new`] for every future
3204 /// launch that can execute this node.
3205 ///
3206 /// # Errors
3207 ///
3208 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3209 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3210 /// not call CUDA functions; see [`Stream::add_callback`].
3211 pub unsafe fn set_host_node_params(
3212 &mut self,
3213 node: GraphNode,
3214 params: &HostNodeParams,
3215 ) -> Result<()> {
3216 self.check_node(&node)?;
3217 let params = params.into();
3218 unsafe {
3219 try_ffi!(runtime::cudaGraphExecHostNodeSetParams(
3220 self.as_raw(),
3221 node.as_raw(),
3222 &raw const params,
3223 ))?;
3224 }
3225 Ok(())
3226 }
3227
3228 /// Sets the event of an event record node in this executable graph.
3229 /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3230 ///
3231 /// The modifications only affect future launches of this executable graph.
3232 /// Already enqueued or running launches of this executable graph are not affected by this call.
3233 /// The original `node` is also not modified by this call.
3234 ///
3235 /// Graph objects are not threadsafe.
3236 ///
3237 /// # Errors
3238 ///
3239 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3240 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3241 /// not call CUDA functions; see [`Stream::add_callback`].
3242 pub fn set_event_record_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
3243 self.check_node(&node)?;
3244 if let Some(ctx) = &self.ctx
3245 && ctx.as_ref() != event.context()
3246 {
3247 return Err(Error::GraphContextMismatch);
3248 }
3249 unsafe {
3250 try_ffi!(runtime::cudaGraphExecEventRecordNodeSetEvent(
3251 self.as_raw(),
3252 node.as_raw(),
3253 event.as_raw(),
3254 ))?;
3255 }
3256 Ok(())
3257 }
3258
3259 /// Updates the work represented by `node` in this executable graph as though the nodes contained in `node`'s graph had the parameters contained in `child_graph`'s nodes at instantiation.
3260 /// `node` must remain in the graph which was used to instantiate this executable graph.
3261 /// Changed edges to and from `node` are ignored.
3262 ///
3263 /// The modifications only affect future launches of this executable graph.
3264 /// Already enqueued or running launches of this executable graph are not affected by this call.
3265 /// The original `node` is also not modified by this call.
3266 ///
3267 /// The topology of `child_graph`, as well as the node insertion order, must match that of the graph contained in `node`.
3268 /// See [`ExecutableGraph::update`] for a list of restrictions on what can be updated in an instantiated graph.
3269 /// The update is recursive, so child graph nodes contained within the top-level child graph are also updated.
3270 ///
3271 /// Graph objects are not threadsafe.
3272 ///
3273 /// # Errors
3274 ///
3275 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3276 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3277 /// not call CUDA functions; see [`Stream::add_callback`].
3278 pub fn set_child_graph_node(&mut self, node: GraphNode, child_graph: &Graph) -> Result<()> {
3279 self.check_node(&node)?;
3280 if let (Some(exec_ctx), Some(child_ctx)) = (&self.ctx, &child_graph.ctx)
3281 && exec_ctx.as_ref() != child_ctx.as_ref()
3282 {
3283 return Err(Error::GraphContextMismatch);
3284 }
3285 unsafe {
3286 try_ffi!(runtime::cudaGraphExecChildGraphNodeSetParams(
3287 self.as_raw(),
3288 node.as_raw(),
3289 child_graph.as_raw(),
3290 ))?;
3291 }
3292 Ok(())
3293 }
3294
3295 /// Sets the event of an event wait node in this executable graph.
3296 /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3297 ///
3298 /// The modifications only affect future launches of this executable graph.
3299 /// Already enqueued or running launches of this executable graph are not affected by this call.
3300 /// The original `node` is also not modified by this call.
3301 ///
3302 /// Graph objects are not threadsafe.
3303 ///
3304 /// # Errors
3305 ///
3306 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3307 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3308 /// not call CUDA functions; see [`Stream::add_callback`].
3309 pub fn set_event_wait_node_event(&mut self, node: GraphNode, event: &Event) -> Result<()> {
3310 self.check_node(&node)?;
3311 unsafe {
3312 try_ffi!(runtime::cudaGraphExecEventWaitNodeSetEvent(
3313 self.as_raw(),
3314 node.as_raw(),
3315 event.as_raw(),
3316 ))?;
3317 }
3318 Ok(())
3319 }
3320
3321 /// Sets `node` to be either enabled or disabled.
3322 /// Disabled nodes are functionally equivalent to empty nodes until they are reenabled.
3323 /// Existing node parameters are not affected by disabling/enabling the node.
3324 ///
3325 /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3326 ///
3327 /// `node` must not have been removed from the original graph.
3328 ///
3329 /// The modifications only affect future launches of this executable graph.
3330 /// Already enqueued or running launches of this executable graph are not affected by this call.
3331 /// The original `node` is also not modified by this call.
3332 ///
3333 /// Currently only kernel, memset and memcpy nodes are supported.
3334 ///
3335 /// Graph objects are not threadsafe.
3336 ///
3337 /// # Errors
3338 ///
3339 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3340 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3341 /// not call CUDA functions; see [`Stream::add_callback`].
3342 fn set_node_enabled(&mut self, node: GraphNode, enabled: bool) -> Result<()> {
3343 self.check_node(&node)?;
3344 unsafe {
3345 try_ffi!(runtime::cudaGraphNodeSetEnabled(
3346 self.as_raw(),
3347 node.as_raw(),
3348 u32::from(enabled),
3349 ))?;
3350 }
3351 Ok(())
3352 }
3353
3354 pub fn enable_node(&mut self, node: GraphNode) -> Result<()> {
3355 self.set_node_enabled(node, true)
3356 }
3357
3358 pub fn disable_node(&mut self, node: GraphNode) -> Result<()> {
3359 self.set_node_enabled(node, false)
3360 }
3361
3362 /// Returns whether `node` is enabled.
3363 ///
3364 /// The node is identified by the corresponding `node` in the non-executable graph from which this executable graph was instantiated.
3365 ///
3366 /// `node` must not have been removed from the original graph.
3367 ///
3368 /// Currently only kernel, memset and memcpy nodes are supported.
3369 ///
3370 /// Graph objects are not threadsafe.
3371 ///
3372 /// # Errors
3373 ///
3374 /// Returns an error if CUDA rejects the graph operation, if a previous asynchronous launch
3375 /// reported an error, or if CUDA reports runtime initialization diagnostics. Callbacks must
3376 /// not call CUDA functions; see [`Stream::add_callback`].
3377 pub fn is_node_enabled(&self, node: GraphNode) -> Result<bool> {
3378 self.check_node(&node)?;
3379 let mut enabled = 0;
3380 unsafe {
3381 try_ffi!(runtime::cudaGraphNodeGetEnabled(
3382 self.as_raw(),
3383 node.as_raw(),
3384 &raw mut enabled,
3385 ))?;
3386 }
3387 Ok(enabled != 0)
3388 }
3389
3390 pub const fn as_raw(&self) -> runtime::cudaGraphExec_t {
3391 self.handle
3392 }
3393
3394 pub fn context(&self) -> Option<&Context> {
3395 self.ctx.as_deref()
3396 }
3397
3398 /// Consumes the executable graph and returns the raw CUDA executable graph
3399 /// handle without destroying it.
3400 ///
3401 /// The caller becomes responsible for eventually destroying the returned
3402 /// handle with CUDA.
3403 pub fn into_raw(self) -> runtime::cudaGraphExec_t {
3404 let graph = ManuallyDrop::new(self);
3405 graph.as_raw()
3406 }
3407}
3408
3409impl Drop for ExecutableGraph {
3410 fn drop(&mut self) {
3411 unsafe {
3412 if let Err(err) = try_ffi!(runtime::cudaGraphExecDestroy(self.handle)) {
3413 #[cfg(debug_assertions)]
3414 eprintln!("failed to destroy cuda graph exec: {err}");
3415 }
3416 }
3417 }
3418}
3419
3420impl ExecutableGraphLaunchOperation<'_> {
3421 /// Enqueues this graph launch in `stream`.
3422 ///
3423 /// # Errors
3424 ///
3425 /// Returns an error if CUDA rejects the graph operation, if `stream` belongs to a different context, if a previous asynchronous launch reported an error, or if CUDA reports runtime initialization diagnostics.
3426 pub fn enqueue(self, stream: &Stream) -> Result<()> {
3427 self.graph.launch(stream)
3428 }
3429
3430 pub const fn graph(&self) -> &ExecutableGraph {
3431 self.graph
3432 }
3433}
3434
3435#[derive(Debug, Clone, PartialEq, Eq)]
3436pub struct ExecutableGraphUpdate {
3437 pub result: GraphExecUpdateResult,
3438 pub error_node: Option<GraphNode>,
3439 pub error_from_node: Option<GraphNode>,
3440}
3441
3442impl ExecutableGraphUpdate {
3443 fn from_result_info(value: runtime::cudaGraphExecUpdateResultInfo, graph: &Graph) -> Self {
3444 Self {
3445 result: value.result.into(),
3446 error_node: if value.errorNode.is_null() {
3447 None
3448 } else {
3449 Some(graph.node_from_raw(value.errorNode))
3450 },
3451 error_from_node: if value.errorFromNode.is_null() {
3452 None
3453 } else {
3454 Some(graph.node_from_raw(value.errorFromNode))
3455 },
3456 }
3457 }
3458}
3459
3460#[derive(Debug, Clone, Copy)]
3461pub struct MemorySetNodeParams {
3462 dst: DevicePtr,
3463 pitch: usize,
3464 value: u32,
3465 element_size: u32,
3466 width: usize,
3467 height: usize,
3468}
3469
3470impl MemorySetNodeParams {
3471 /// Creates raw memset node parameters.
3472 ///
3473 /// # Safety
3474 ///
3475 /// `dst` must be valid for writes of `element_size * width` bytes when the
3476 /// graph executes. If `height` or `pitch` are changed after construction,
3477 /// the caller must account for those values as required by CUDA.
3478 pub const unsafe fn new(dst: DevicePtr, element_size: u32, width: usize) -> Self {
3479 Self {
3480 dst,
3481 pitch: 0,
3482 value: 0,
3483 element_size,
3484 width,
3485 height: 1,
3486 }
3487 }
3488
3489 pub const fn with_pitch(mut self, pitch: usize) -> Self {
3490 self.pitch = pitch;
3491 self
3492 }
3493
3494 pub const fn with_value(mut self, value: u32) -> Self {
3495 self.value = value;
3496 self
3497 }
3498
3499 pub const fn with_height(mut self, height: usize) -> Self {
3500 self.height = height;
3501 self
3502 }
3503
3504 pub const fn dst(self) -> DevicePtr {
3505 self.dst
3506 }
3507
3508 pub const fn pitch(self) -> usize {
3509 self.pitch
3510 }
3511
3512 pub const fn value(self) -> u32 {
3513 self.value
3514 }
3515
3516 pub const fn element_size(self) -> u32 {
3517 self.element_size
3518 }
3519
3520 pub const fn width(self) -> usize {
3521 self.width
3522 }
3523
3524 pub const fn height(self) -> usize {
3525 self.height
3526 }
3527}
3528
3529impl From<&MemorySetNodeParams> for driver::CUDA_MEMSET_NODE_PARAMS {
3530 fn from(value: &MemorySetNodeParams) -> Self {
3531 Self {
3532 dst: value.dst().as_ptr() as _,
3533 pitch: value.pitch() as _,
3534 value: value.value(),
3535 elementSize: value.element_size(),
3536 width: value.width() as _,
3537 height: value.height() as _,
3538 }
3539 }
3540}