Skip to main content

baracuda_cuda_sys/
types.rs

1//! Core handle types used by the CUDA Driver API.
2//!
3//! These mirror the C typedefs in `cuda.h`. Pointer-typed handles
4//! (`CUcontext`, `CUstream`, ...) are opaque to the Rust side and are
5//! simply raw pointers; integer-typed handles (`CUdevice`, `CUdeviceptr`)
6//! are `#[repr(transparent)]` newtypes so they cannot be accidentally
7//! confused with other integer parameters.
8
9use core::ffi::c_void;
10
/// Device ordinal handle, as produced by `cuDeviceGet(&dev, ordinal)`.
///
/// A transparent wrapper around `i32`: same layout as the integer it
/// carries, but a distinct type from other integer parameters.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
pub struct CUdevice(pub i32);
15
/// Virtual address in device memory. 64-bit on every platform baracuda
/// supports (CUDA 4.0+).
///
/// A transparent wrapper around `u64`: same layout as the integer it
/// carries, but a distinct type from other integer parameters.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
pub struct CUdeviceptr(pub u64);
21
// SAFETY: `CUdevice` and `CUdeviceptr` are `#[repr(transparent)]` wrappers
// over a plain integer (`i32` and `u64` respectively), so each has exactly
// the ABI layout of a kernel argument of the same width.
// `CUdeviceptr` in particular is what every device-pointer kernel parameter
// expects (`void*` / `T*` on the CUDA side is the same 64 bits).
unsafe impl baracuda_types::DeviceRepr for CUdevice {}
unsafe impl baracuda_types::DeviceRepr for CUdeviceptr {}
28
// Every handle in this group is an alias of the same `*mut c_void`, so the
// compiler does NOT distinguish one handle kind from another; passing a
// `CUstream` where a `CUevent` is expected type-checks.

/// Opaque context handle.
pub type CUcontext = *mut c_void;

/// Opaque module handle (holds compiled kernels).
pub type CUmodule = *mut c_void;

/// Opaque function handle (a kernel entry point within a module).
pub type CUfunction = *mut c_void;

/// Opaque library handle (CUDA 12.0+ context-independent module).
pub type CUlibrary = *mut c_void;

/// Opaque kernel handle (CUDA 12.0+ library-based equivalent of `CUfunction`).
pub type CUkernel = *mut c_void;

/// Opaque stream handle.
pub type CUstream = *mut c_void;

/// Opaque event handle.
pub type CUevent = *mut c_void;

/// Opaque graph handle.
pub type CUgraph = *mut c_void;

/// Opaque graph-node handle.
pub type CUgraphNode = *mut c_void;

/// Opaque executable-graph handle.
pub type CUgraphExec = *mut c_void;

/// Opaque memory-pool handle (CUDA 11.2+).
pub type CUmemoryPool = *mut c_void;

/// Opaque CUDA array handle (backing storage for textures / surfaces).
pub type CUarray = *mut c_void;
64
// The three handles below are plain `u64` aliases (not newtypes), so they
// are interchangeable with each other and with any other `u64` value.

/// 64-bit texture-object handle (created via `cuTexObjectCreate`).
pub type CUtexObject = u64;

/// 64-bit surface-object handle (created via `cuSurfObjectCreate`).
pub type CUsurfObject = u64;

/// Generic allocation handle for the CUDA VMM (Virtual Memory Management)
/// API — opaque 64-bit cookie returned by `cuMemCreate`.
pub type CUmemGenericAllocationHandle = u64;
74
/// Opaque external-memory handle (untyped `*mut c_void` alias).
pub type CUexternalMemory = *mut c_void;

/// Opaque external-semaphore handle (untyped `*mut c_void` alias).
pub type CUexternalSemaphore = *mut c_void;
80
81/// The special "null stream" (legacy default stream).
82pub const CU_STREAM_LEGACY: CUstream = 0x1 as CUstream;
83
84/// The special "per-thread default stream" (CUDA 7.0+).
85pub const CU_STREAM_PER_THREAD: CUstream = 0x2 as CUstream;
86
/// Flag bits for `cuEventCreate`; combine them with bitwise OR.
#[allow(non_snake_case)]
pub mod CUevent_flags {
    /// No flag bits set.
    pub const DEFAULT: u32 = 0;
    pub const BLOCKING_SYNC: u32 = 1 << 0;
    pub const DISABLE_TIMING: u32 = 1 << 1;
    pub const INTERPROCESS: u32 = 1 << 2;
}
95
/// Flag bits for `cuStreamCreate`; combine them with bitwise OR.
#[allow(non_snake_case)]
pub mod CUstream_flags {
    /// No flag bits set.
    pub const DEFAULT: u32 = 0;
    pub const NON_BLOCKING: u32 = 1 << 0;
}
102
/// Flag bits for `cuCtxCreate`; combine them with bitwise OR.
#[allow(non_snake_case)]
pub mod CUcontext_flags {
    /// No flag bits set.
    pub const SCHED_AUTO: u32 = 0;
    pub const SCHED_SPIN: u32 = 1 << 0;
    pub const SCHED_YIELD: u32 = 1 << 1;
    pub const SCHED_BLOCKING_SYNC: u32 = 1 << 2;
    pub const MAP_HOST: u32 = 1 << 3;
    pub const LMEM_RESIZE_TO_MAX: u32 = 1 << 4;
}
113
/// `CUlimit` — selects which limit `cuCtxGetLimit` / `cuCtxSetLimit`
/// reads or writes. Values are consecutive, starting at zero.
#[allow(non_snake_case)]
pub mod CUlimit {
    pub const STACK_SIZE: u32 = 0;
    pub const PRINTF_FIFO_SIZE: u32 = 1;
    pub const MALLOC_HEAP_SIZE: u32 = 2;
    pub const DEV_RUNTIME_SYNC_DEPTH: u32 = 3;
    pub const DEV_RUNTIME_PENDING_LAUNCH_COUNT: u32 = 4;
    pub const MAX_L2_FETCH_GRANULARITY: u32 = 5;
    pub const PERSISTING_L2_CACHE_SIZE: u32 = 6;
}
125
/// `CUfunc_cache` — preference for the L1-vs-shared-memory carveout.
#[allow(non_snake_case)]
pub mod CUfunc_cache {
    pub const PREFER_NONE: u32 = 0;
    pub const PREFER_SHARED: u32 = 1;
    pub const PREFER_L1: u32 = 2;
    pub const PREFER_EQUAL: u32 = 3;
}
134
/// Attach flags accepted by `cuMemAllocManaged`.
#[allow(non_snake_case)]
pub mod CUmemAttach_flags {
    /// Accessible from any stream on any device (default).
    pub const GLOBAL: u32 = 1 << 0;
    /// Accessible only from the host.
    pub const HOST: u32 = 1 << 1;
    /// Accessible only from the stream it was attached to.
    pub const SINGLE: u32 = 1 << 2;
}
145
/// `CUmem_advise` — hint selectors for `cuMemAdvise`. Each `SET_*` value is
/// immediately followed by its `UNSET_*` counterpart.
#[allow(non_snake_case)]
pub mod CUmem_advise {
    pub const SET_READ_MOSTLY: i32 = 0x1;
    pub const UNSET_READ_MOSTLY: i32 = 0x2;
    pub const SET_PREFERRED_LOCATION: i32 = 0x3;
    pub const UNSET_PREFERRED_LOCATION: i32 = 0x4;
    pub const SET_ACCESSED_BY: i32 = 0x5;
    pub const UNSET_ACCESSED_BY: i32 = 0x6;
}
156
/// `CUmemRangeHandleType` — handle kind requested from
/// `cuMemGetHandleForAddressRange`. Only one kind is defined here.
#[allow(non_snake_case)]
pub mod CUmemRangeHandleType {
    pub const DMA_BUF_FD: i32 = 0x1;
}
162
/// `CUarraySparseSubresourceType` — discriminant for the `subresource`
/// union inside [`CUarrayMapInfo`].
#[allow(non_snake_case)]
pub mod CUarraySparseSubresourceType {
    /// Tile-indexed sparse level update.
    pub const SPARSE_LEVEL: i32 = 0x0;
    /// Mipmap-tail update.
    pub const MIPTAIL: i32 = 0x1;
}
172
/// `CUmemOperationType` — says whether a [`CUarrayMapInfo`] entry is a
/// map or an unmap operation.
#[allow(non_snake_case)]
pub mod CUmemOperationType {
    pub const MAP: i32 = 0x1;
    pub const UNMAP: i32 = 0x2;
}
180
/// `CUmemHandleType` — discriminant for the `memHandle` union inside
/// [`CUarrayMapInfo`]. Only one kind is defined here.
#[allow(non_snake_case)]
pub mod CUmemHandleType {
    pub const GENERIC: i32 = 0x0;
}
187
/// `CUarrayMapInfo` — 96-byte descriptor for `cuMemMapArrayAsync`.
///
/// Three tagged unions in one struct (resource / subresource / memHandle).
/// Each union is stored as raw integer words next to its tag field. The
/// typed builder methods on this type fill tag and payload together; raw
/// field access is available for advanced users.
///
/// The two `_pad*` fields are private, so code outside this module cannot
/// build the struct with a literal and must start from `Default`.
///
/// Layout (CUDA 13.x):
/// ```text
/// struct CUarrayMapInfo {
///     CUresourcetype resourceType;       // offset 0, 4 bytes
///     // 4 bytes pad
///     union { CUmipmappedArray; CUarray; } resource; // offset 8, 8 bytes
///     CUarraySparseSubresourceType subresourceType;  // offset 16, 4 bytes
///     // 4 bytes pad
///     union {                            // offset 24, 32 bytes
///         struct { u32 level, layer, ox, oy, oz, ew, eh, ed; } sparseLevel;
///         struct { u32 layer; u64 offset; u64 size; } miptail;
///     } subresource;
///     CUmemOperationType memOperationType; // offset 56, 4 bytes
///     CUmemHandleType memHandleType;       // offset 60, 4 bytes
///     union { CUmemGenericAllocationHandle; } memHandle; // offset 64, 8 bytes
///     u64 offset;                          // offset 72, 8 bytes
///     u32 deviceBitMask;                   // offset 80, 4 bytes
///     u32 flags;                           // offset 84, 4 bytes
///     u32 reserved[2];                     // offset 88, 8 bytes
/// };
/// ```
#[repr(C)]
#[derive(Copy, Clone)]
#[allow(non_camel_case_types)]
pub struct CUarrayMapInfo {
    /// Tag for `resource_raw` (a `CUresourcetype` value).
    pub resource_type: core::ffi::c_int,
    // Explicit stand-in for the 4 bytes of padding before the 8-byte union.
    _pad0: u32,
    /// Union payload: `CUarray` or `CUmipmappedArray` (both pointer-sized).
    pub resource_raw: u64,
    /// Tag for `subresource_raw` (see `CUarraySparseSubresourceType`).
    pub subresource_type: core::ffi::c_int,
    // Explicit stand-in for the 4 bytes of padding before the 32-byte union.
    _pad1: u32,
    /// Union payload for the subresource (32 bytes, enough for
    /// `sparseLevel`'s 8 u32s or `miptail`'s 3-field struct).
    pub subresource_raw: [u64; 4],
    /// Map vs. unmap selector (see `CUmemOperationType`).
    pub mem_operation_type: core::ffi::c_int,
    /// Tag for `mem_handle_raw` (see `CUmemHandleType`).
    pub mem_handle_type: core::ffi::c_int,
    /// Union payload: a `CUmemGenericAllocationHandle`.
    pub mem_handle_raw: u64,
    /// Corresponds to `offset` (byte 72) in the layout above.
    pub offset: u64,
    /// Corresponds to `deviceBitMask` in the layout above.
    pub device_bit_mask: core::ffi::c_uint,
    /// Corresponds to `flags` in the layout above.
    pub flags: core::ffi::c_uint,
    /// Corresponds to `reserved[2]` in the layout above.
    pub reserved: [core::ffi::c_uint; 2],
}
236
237impl Default for CUarrayMapInfo {
238    fn default() -> Self {
239        Self {
240            resource_type: 0,
241            _pad0: 0,
242            resource_raw: 0,
243            subresource_type: 0,
244            _pad1: 0,
245            subresource_raw: [0; 4],
246            mem_operation_type: 0,
247            mem_handle_type: CUmemHandleType::GENERIC,
248            mem_handle_raw: 0,
249            offset: 0,
250            device_bit_mask: 0,
251            flags: 0,
252            reserved: [0; 2],
253        }
254    }
255}
256
257impl core::fmt::Debug for CUarrayMapInfo {
258    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
259        f.debug_struct("CUarrayMapInfo")
260            .field("resource_type", &self.resource_type)
261            .field("subresource_type", &self.subresource_type)
262            .field("mem_operation_type", &self.mem_operation_type)
263            .field("offset", &self.offset)
264            .finish_non_exhaustive()
265    }
266}
267
268impl CUarrayMapInfo {
269    /// Point the resource union at a [`CUarray`] handle.
270    pub fn with_array(mut self, array: CUarray) -> Self {
271        self.resource_type = CUresourcetype::ARRAY as core::ffi::c_int;
272        self.resource_raw = array as usize as u64;
273        self
274    }
275
276    /// Point the resource union at a [`CUmipmappedArray`] handle.
277    pub fn with_mipmapped_array(mut self, mipmap: CUmipmappedArray) -> Self {
278        self.resource_type = CUresourcetype::MIPMAPPED_ARRAY as core::ffi::c_int;
279        self.resource_raw = mipmap as usize as u64;
280        self
281    }
282
283    /// Set the subresource to a sparse-level tile update. `offset_*`
284    /// and `extent_*` are in tiles (not bytes).
285    #[allow(clippy::too_many_arguments)]
286    pub fn with_sparse_level(
287        mut self,
288        level: u32,
289        layer: u32,
290        offset_x: u32,
291        offset_y: u32,
292        offset_z: u32,
293        extent_width: u32,
294        extent_height: u32,
295        extent_depth: u32,
296    ) -> Self {
297        self.subresource_type = CUarraySparseSubresourceType::SPARSE_LEVEL;
298        // Layout: 8 u32s packed into 32 bytes = 4 u64s.
299        // SAFETY: subresource_raw is [u64; 4] = 32 bytes, 8-aligned; we
300        // write an 8-u32 little-/native-endian struct through a pointer.
301        let sl = [
302            level,
303            layer,
304            offset_x,
305            offset_y,
306            offset_z,
307            extent_width,
308            extent_height,
309            extent_depth,
310        ];
311        unsafe {
312            let p = self.subresource_raw.as_mut_ptr() as *mut [u32; 8];
313            p.write(sl);
314        }
315        self
316    }
317
318    /// Set the subresource to a mipmap-tail update.
319    pub fn with_miptail(mut self, layer: u32, tail_offset: u64, tail_size: u64) -> Self {
320        self.subresource_type = CUarraySparseSubresourceType::MIPTAIL;
321        // Layout: { u32 layer; u64 offset; u64 size; } with 4-byte pad
322        // after `layer` to align the u64s. Total 24 bytes.
323        #[repr(C)]
324        struct Miptail {
325            layer: u32,
326            _pad: u32,
327            offset: u64,
328            size: u64,
329        }
330        let m = Miptail {
331            layer,
332            _pad: 0,
333            offset: tail_offset,
334            size: tail_size,
335        };
336        unsafe {
337            let p = self.subresource_raw.as_mut_ptr() as *mut Miptail;
338            p.write(m);
339        }
340        self
341    }
342
343    /// Set the mem-handle union to a VMM generic allocation handle.
344    pub fn with_mem_handle(mut self, handle: CUmemGenericAllocationHandle) -> Self {
345        self.mem_handle_type = CUmemHandleType::GENERIC;
346        self.mem_handle_raw = handle;
347        self
348    }
349
350    /// Mark this entry as a map operation.
351    pub fn as_map(mut self) -> Self {
352        self.mem_operation_type = CUmemOperationType::MAP;
353        self
354    }
355
356    /// Mark this entry as an unmap operation.
357    pub fn as_unmap(mut self) -> Self {
358        self.mem_operation_type = CUmemOperationType::UNMAP;
359        self
360    }
361
362    /// Byte offset into the backing allocation handle.
363    pub fn with_offset(mut self, offset: u64) -> Self {
364        self.offset = offset;
365        self
366    }
367
368    /// Bitmask of devices the mapping applies to (one bit per peer).
369    pub fn with_device_bit_mask(mut self, mask: u32) -> Self {
370        self.device_bit_mask = mask;
371        self
372    }
373}
374
375// ---- Wave 28: medium-value consolidated ---------------------------------
376
/// `CUexecAffinityType` — kind of per-context execution affinity. Only one
/// kind is defined here.
#[allow(non_snake_case)]
pub mod CUexecAffinityType {
    pub const SM_COUNT: i32 = 0x0;
}
382
/// `CUdevice_P2PAttribute` — attribute selector passed to
/// `cuDeviceGetP2PAttribute`. Values start at one.
#[allow(non_snake_case)]
pub mod CUdevice_P2PAttribute {
    pub const PERFORMANCE_RANK: i32 = 0x1;
    pub const ACCESS_SUPPORTED: i32 = 0x2;
    pub const NATIVE_ATOMIC_SUPPORTED: i32 = 0x3;
    pub const CUDA_ARRAY_ACCESS_SUPPORTED: i32 = 0x4;
}
391
/// `CUflushGPUDirectRDMAWritesTarget`. Only one target is defined here.
#[allow(non_snake_case)]
pub mod CUflushGPUDirectRDMAWritesTarget {
    pub const CURRENT_CTX: i32 = 0x0;
}
397
/// `CUflushGPUDirectRDMAWritesScope`. The two values are 100 apart, not
/// consecutive.
#[allow(non_snake_case)]
pub mod CUflushGPUDirectRDMAWritesScope {
    pub const TO_OWNER: i32 = 100_i32;
    pub const TO_ALL_DEVICES: i32 = 200_i32;
}
404
/// `CUcoredumpSettings` — attribute selectors for
/// `cuCoredumpGet/SetAttribute`. Values are consecutive, starting at one.
#[allow(non_snake_case)]
pub mod CUcoredumpSettings {
    pub const ENABLE_ON_EXCEPTION: i32 = 0x1;
    pub const TRIGGER_HOST: i32 = 0x2;
    pub const LIGHTWEIGHT: i32 = 0x3;
    pub const ENABLE_USER_TRIGGER: i32 = 0x4;
    pub const FILE: i32 = 0x5;
    pub const PIPE: i32 = 0x6;
    pub const GENERATION_FLAGS: i32 = 0x7;
}
416
/// `CUDA_ARRAY_SPARSE_PROPERTIES` — per-array sparse / tiled layout info.
///
/// `#[repr(C)]`, so field order and types are part of the FFI contract and
/// must not be reordered. The derived `Default` zeroes every field.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_ARRAY_SPARSE_PROPERTIES {
    // The three tile-extent components are stored as consecutive `c_uint`s.
    pub tile_extent_width: core::ffi::c_uint,
    pub tile_extent_height: core::ffi::c_uint,
    pub tile_extent_depth: core::ffi::c_uint,
    pub miptail_first_level: core::ffi::c_uint,
    // The only 64-bit field; it gives the struct 8-byte alignment.
    pub miptail_size: u64,
    pub flags: core::ffi::c_uint,
    pub reserved: [core::ffi::c_uint; 4],
}
430
/// `CUDA_ARRAY_MEMORY_REQUIREMENTS` — size/alignment for an array's
/// backing VMM allocation.
///
/// `#[repr(C)]`, so field order and types are part of the FFI contract.
/// The derived `Default` zeroes every field.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_ARRAY_MEMORY_REQUIREMENTS {
    // Both `size` and `alignment` are pointer-width (`usize`) on the Rust side.
    pub size: usize,
    pub alignment: usize,
    pub reserved: [core::ffi::c_uint; 4],
}
441
442// ---- Wave 29-31: graphics interop + Jetson NvSci ------------------------
443
/// `CUgraphicsMapResourceFlags` — access hints supplied at map time.
#[allow(non_snake_case)]
pub mod CUgraphicsMapResourceFlags {
    pub const NONE: u32 = 0x00;
    pub const READ_ONLY: u32 = 0x01;
    pub const WRITE_DISCARD: u32 = 0x02;
}
451
/// `CUgraphicsRegisterFlags` — register-time flags shared across GL /
/// D3D / VDPAU / EGL. Every non-zero value is a distinct single bit.
#[allow(non_snake_case)]
pub mod CUgraphicsRegisterFlags {
    pub const NONE: u32 = 0;
    pub const READ_ONLY: u32 = 1 << 0;
    pub const WRITE_DISCARD: u32 = 1 << 1;
    pub const SURFACE_LDST: u32 = 1 << 2;
    pub const TEXTURE_GATHER: u32 = 1 << 3;
}
462
/// Device-list selector for `cuGLGetDevices_v2` / `cu<API>GetDevices`.
#[allow(non_snake_case)]
pub mod CUGLDeviceList {
    pub const ALL: u32 = 1;
    pub const CURRENT_FRAME: u32 = 2;
    pub const NEXT_FRAME: u32 = 3;
}

/// D3D-family name for the same selector; re-exports [`CUGLDeviceList`]
/// rather than duplicating its constants.
pub use CUGLDeviceList as CUd3dXDeviceList;
473
// --- OpenGL handle types ---
// GL types live in `gl.h`; we use minimum-compatible Rust types. Both
// aliases below resolve to `c_uint`, so they are interchangeable.
pub type GLuint = core::ffi::c_uint;
pub type GLenum = core::ffi::c_uint;

// --- Direct3D handle types ---
// All D3D* device / resource pointers are opaque from CUDA's POV, so both
// are carried as untyped `*mut c_void`.
pub type ID3DDevice = *mut c_void;
pub type ID3DResource = *mut c_void;
483
// --- VDPAU handle types ---
// `VdpDevice`, `VdpVideoSurface` and `VdpOutputSurface` are 32-bit unsigned
// handles in libvdpau and are aliased to `c_uint` here. `VdpGetProcAddress`
// is the exception: it is not an integer handle and is carried as an opaque
// `*mut c_void`.
pub type VdpDevice = core::ffi::c_uint;
pub type VdpGetProcAddress = *mut c_void;
pub type VdpVideoSurface = core::ffi::c_uint;
pub type VdpOutputSurface = core::ffi::c_uint;
491
// --- EGL handle types ---
// All three are carried as untyped `*mut c_void`, so the compiler does not
// tell them apart.
pub type EGLImageKHR = *mut c_void;
pub type EGLStreamKHR = *mut c_void;
pub type EGLSyncKHR = *mut c_void;
496
/// `CUeglFrame` — YUV / RGB frame layout used by EGL stream interop
/// (Jetson video pipelines). Modeled as an opaque 80-byte blob of ten
/// `u64` words so callers can populate it from bindgen-generated layouts.
///
/// NOTE(review): nothing in this file checks the 80-byte size against the
/// CUDA headers — confirm it against the header actually in use.
#[repr(C)]
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub struct CUeglFrame {
    pub raw: [u64; 10],
}

#[allow(clippy::derivable_impls)]
impl Default for CUeglFrame {
    /// An all-zero frame.
    fn default() -> Self {
        CUeglFrame { raw: [0u64; 10] }
    }
}

impl core::fmt::Debug for CUeglFrame {
    /// Prints only the type name; the raw words are elided behind `..`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("CUeglFrame");
        out.finish_non_exhaustive()
    }
}
519
// --- NvSci handle types (Jetson / DRIVE) ---
// `NvSciSyncAttrList` and `NvSciBufObj` are opaque pointers from
// NVIDIA's NvSci libraries. Users working with NvSci pass in pointers
// obtained from libnvsciSync / libnvsciBuf.
// All four aliases below are the same untyped `*mut c_void`.
pub type NvSciSyncAttrList = *mut c_void;
pub type NvSciSyncObj = *mut c_void;
pub type NvSciSyncFence = *mut c_void;
pub type NvSciBufObj = *mut c_void;
528
/// `CUnvSciSyncAttr` — direction flags passed to
/// `cuDeviceGetNvSciSyncAttributes`.
#[allow(non_snake_case)]
pub mod CUnvSciSyncAttr {
    pub const SIGNAL: i32 = 0x1;
    pub const WAIT: i32 = 0x2;
}
536
/// `CUpointer_attribute` — attribute selector for `cuPointerGetAttribute`.
/// Values are consecutive from 1 (`0x01`) through 20 (`0x14`).
#[allow(non_snake_case)]
pub mod CUpointer_attribute {
    pub const CONTEXT: i32 = 0x01;
    pub const MEMORY_TYPE: i32 = 0x02;
    pub const DEVICE_POINTER: i32 = 0x03;
    pub const HOST_POINTER: i32 = 0x04;
    pub const P2P_TOKENS: i32 = 0x05;
    pub const SYNC_MEMOPS: i32 = 0x06;
    pub const BUFFER_ID: i32 = 0x07;
    pub const IS_MANAGED: i32 = 0x08;
    pub const DEVICE_ORDINAL: i32 = 0x09;
    pub const IS_LEGACY_CUDA_IPC_CAPABLE: i32 = 0x0A;
    pub const RANGE_START_ADDR: i32 = 0x0B;
    pub const RANGE_SIZE: i32 = 0x0C;
    pub const MAPPED: i32 = 0x0D;
    pub const ALLOWED_HANDLE_TYPES: i32 = 0x0E;
    pub const IS_GPU_DIRECT_RDMA_CAPABLE: i32 = 0x0F;
    pub const ACCESS_FLAGS: i32 = 0x10;
    pub const MEMPOOL_HANDLE: i32 = 0x11;
    pub const MAPPING_SIZE: i32 = 0x12;
    pub const MAPPING_BASE_ADDR: i32 = 0x13;
    pub const MEMORY_BLOCK_ID: i32 = 0x14;
}
561
/// `CUmemorytype` — the values reported through
/// `CUpointer_attribute::MEMORY_TYPE`.
#[allow(non_snake_case)]
pub mod CUmemorytype {
    pub const HOST: u32 = 1;
    pub const DEVICE: u32 = 2;
    pub const ARRAY: u32 = 3;
    pub const UNIFIED: u32 = 4;
}
570
/// `CUlaunchAttributeID` — selector for entries in a `CUlaunchConfig`'s
/// attribute array (passed to `cuLaunchKernelEx`, CUDA 12.0+).
///
/// Values are consecutive from 0 through 10, then 12 and 13; this module
/// defines no constant for 11.
#[allow(non_snake_case)]
pub mod CUlaunchAttributeID {
    pub const IGNORE: u32 = 0x0;
    pub const ACCESS_POLICY_WINDOW: u32 = 0x1;
    pub const COOPERATIVE: u32 = 0x2;
    pub const SYNCHRONIZATION_POLICY: u32 = 0x3;
    pub const CLUSTER_DIMENSION: u32 = 0x4;
    pub const CLUSTER_SCHEDULING_POLICY_PREFERENCE: u32 = 0x5;
    pub const PROGRAMMATIC_STREAM_SERIALIZATION: u32 = 0x6;
    pub const PROGRAMMATIC_EVENT: u32 = 0x7;
    pub const PRIORITY: u32 = 0x8;
    pub const MEM_SYNC_DOMAIN_MAP: u32 = 0x9;
    pub const MEM_SYNC_DOMAIN: u32 = 0xA;
    pub const LAUNCH_COMPLETION_EVENT: u32 = 0xC;
    pub const DEVICE_UPDATABLE_KERNEL_NODE: u32 = 0xD;
}
589
/// `CUlaunchAttributeValue` — union of payloads for a launch attribute.
/// 64-byte fixed-size union in `cuda.h`; we expose it as an opaque byte
/// array so callers can bit-cast. Zero-initialized for "no payload".
///
/// The type is explicitly 8-byte aligned. A bare `[u8; 64]` has alignment 1,
/// but the payloads that get bit-cast into it contain pointers and `usize`
/// fields (e.g. [`CUaccessPolicyWindow`]). Without the alignment, casting
/// the buffer to a payload type could produce a misaligned reference, and
/// a struct embedding this value would only be 4-byte aligned.
/// The size stays 64 bytes.
#[repr(C, align(8))]
#[derive(Copy, Clone, Debug)]
#[allow(non_camel_case_types)]
pub struct CUlaunchAttributeValue(pub [u8; 64]);

impl Default for CUlaunchAttributeValue {
    /// All 64 bytes zero, i.e. "no payload".
    fn default() -> Self {
        Self([0u8; 64])
    }
}
603
/// `CUaccessProperty` — cache policy applied on a hit or a miss inside a
/// [`CUaccessPolicyWindow`].
#[allow(non_snake_case)]
pub mod CUaccessProperty {
    pub const NORMAL: i32 = 0x0;
    pub const STREAMING: i32 = 0x1;
    pub const PERSISTING: i32 = 0x2;
}
612
613/// `CUaccessPolicyWindow` — describes an L2-persistence hint attached
614/// to a launch via the `ACCESS_POLICY_WINDOW` attribute.
615#[repr(C)]
616#[derive(Copy, Clone, Debug)]
617#[allow(non_camel_case_types)]
618pub struct CUaccessPolicyWindow {
619    pub base_ptr: *mut c_void,
620    pub num_bytes: usize,
621    pub hit_ratio: f32,
622    pub hit_prop: core::ffi::c_int,
623    pub miss_prop: core::ffi::c_int,
624}
625
626impl Default for CUaccessPolicyWindow {
627    fn default() -> Self {
628        Self {
629            base_ptr: core::ptr::null_mut(),
630            num_bytes: 0,
631            hit_ratio: 0.0,
632            hit_prop: CUaccessProperty::NORMAL,
633            miss_prop: CUaccessProperty::NORMAL,
634        }
635    }
636}
637
/// `CUlaunchAttribute` — one entry in a `CUlaunchConfig`'s attribute list.
///
/// Pairs an attribute selector (`id`) with its payload (`value`). The
/// derived `Default` yields `id == 0`, zeroed padding and the default
/// `CUlaunchAttributeValue`.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUlaunchAttribute {
    /// Attribute selector (see `CUlaunchAttributeID`).
    pub id: core::ffi::c_uint,
    /// `cuda.h` inserts 4 bytes of padding before the union.
    pub pad: [u8; 4],
    /// Payload belonging to `id`, stored as opaque bytes.
    pub value: CUlaunchAttributeValue,
}
648
649/// `CUlaunchConfig` — the descriptor passed to `cuLaunchKernelEx`.
650#[repr(C)]
651#[derive(Copy, Clone, Debug)]
652#[allow(non_camel_case_types)]
653pub struct CUlaunchConfig {
654    pub grid_dim_x: core::ffi::c_uint,
655    pub grid_dim_y: core::ffi::c_uint,
656    pub grid_dim_z: core::ffi::c_uint,
657    pub block_dim_x: core::ffi::c_uint,
658    pub block_dim_y: core::ffi::c_uint,
659    pub block_dim_z: core::ffi::c_uint,
660    pub shared_mem_bytes: core::ffi::c_uint,
661    pub stream: CUstream,
662    pub attrs: *mut CUlaunchAttribute,
663    pub num_attrs: core::ffi::c_uint,
664}
665
666// Null-initialized default for CUlaunchConfig so callers can `..Default::default()`.
667impl Default for CUlaunchConfig {
668    fn default() -> Self {
669        Self {
670            grid_dim_x: 1,
671            grid_dim_y: 1,
672            grid_dim_z: 1,
673            block_dim_x: 1,
674            block_dim_y: 1,
675            block_dim_z: 1,
676            shared_mem_bytes: 0,
677            stream: core::ptr::null_mut(),
678            attrs: core::ptr::null_mut(),
679            num_attrs: 0,
680        }
681    }
682}
683
/// `CUfunction_attribute` — attribute selector for `cuFuncGetAttribute` /
/// `cuFuncSetAttribute`. Values are consecutive from 0 through 15 (`0xF`).
#[allow(non_snake_case)]
pub mod CUfunction_attribute {
    pub const MAX_THREADS_PER_BLOCK: i32 = 0x0;
    pub const SHARED_SIZE_BYTES: i32 = 0x1;
    pub const CONST_SIZE_BYTES: i32 = 0x2;
    pub const LOCAL_SIZE_BYTES: i32 = 0x3;
    pub const NUM_REGS: i32 = 0x4;
    pub const PTX_VERSION: i32 = 0x5;
    pub const BINARY_VERSION: i32 = 0x6;
    pub const CACHE_MODE_CA: i32 = 0x7;
    pub const MAX_DYNAMIC_SHARED_SIZE_BYTES: i32 = 0x8;
    pub const PREFERRED_SHARED_MEMORY_CARVEOUT: i32 = 0x9;
    pub const CLUSTER_SIZE_MUST_BE_SET: i32 = 0xA;
    pub const REQUIRED_CLUSTER_WIDTH: i32 = 0xB;
    pub const REQUIRED_CLUSTER_HEIGHT: i32 = 0xC;
    pub const REQUIRED_CLUSTER_DEPTH: i32 = 0xD;
    pub const NON_PORTABLE_CLUSTER_SIZE_ALLOWED: i32 = 0xE;
    pub const CLUSTER_SCHEDULING_POLICY_PREFERENCE: i32 = 0xF;
}
704
705/// `CUDA_MEMCPY2D` — descriptor for 2D memory copies between any combination
706/// of host / device / array memory.
707#[repr(C)]
708#[derive(Copy, Clone, Debug)]
709#[allow(non_camel_case_types)]
710pub struct CUDA_MEMCPY2D {
711    pub src_x_in_bytes: usize,
712    pub src_y: usize,
713    pub src_memory_type: u32,
714    pub src_host: *const c_void,
715    pub src_device: CUdeviceptr,
716    pub src_array: *mut c_void,
717    pub src_pitch: usize,
718
719    pub dst_x_in_bytes: usize,
720    pub dst_y: usize,
721    pub dst_memory_type: u32,
722    pub dst_host: *mut c_void,
723    pub dst_device: CUdeviceptr,
724    pub dst_array: *mut c_void,
725    pub dst_pitch: usize,
726
727    pub width_in_bytes: usize,
728    pub height: usize,
729}
730
731impl Default for CUDA_MEMCPY2D {
732    fn default() -> Self {
733        Self {
734            src_x_in_bytes: 0,
735            src_y: 0,
736            src_memory_type: 0,
737            src_host: core::ptr::null(),
738            src_device: CUdeviceptr(0),
739            src_array: core::ptr::null_mut(),
740            src_pitch: 0,
741            dst_x_in_bytes: 0,
742            dst_y: 0,
743            dst_memory_type: 0,
744            dst_host: core::ptr::null_mut(),
745            dst_device: CUdeviceptr(0),
746            dst_array: core::ptr::null_mut(),
747            dst_pitch: 0,
748            width_in_bytes: 0,
749            height: 0,
750        }
751    }
752}
753
754/// `CUDA_KERNEL_NODE_PARAMS` — shape passed to `cuGraphAddKernelNode` /
755/// `cuGraphKernelNodeSetParams`.
756///
757/// The `kern` / `ctx` fields only exist in CUDA 12.0+. On older drivers
758/// they're silently ignored, so writing zero for both is portable.
759#[repr(C)]
760#[derive(Copy, Clone, Debug)]
761#[allow(non_camel_case_types)]
762pub struct CUDA_KERNEL_NODE_PARAMS {
763    pub func: CUfunction,
764    pub grid_dim_x: core::ffi::c_uint,
765    pub grid_dim_y: core::ffi::c_uint,
766    pub grid_dim_z: core::ffi::c_uint,
767    pub block_dim_x: core::ffi::c_uint,
768    pub block_dim_y: core::ffi::c_uint,
769    pub block_dim_z: core::ffi::c_uint,
770    pub shared_mem_bytes: core::ffi::c_uint,
771    pub kernel_params: *mut *mut c_void,
772    pub extra: *mut *mut c_void,
773    pub kern: CUkernel,
774    pub ctx: CUcontext,
775}
776
777impl Default for CUDA_KERNEL_NODE_PARAMS {
778    fn default() -> Self {
779        Self {
780            func: core::ptr::null_mut(),
781            grid_dim_x: 1,
782            grid_dim_y: 1,
783            grid_dim_z: 1,
784            block_dim_x: 1,
785            block_dim_y: 1,
786            block_dim_z: 1,
787            shared_mem_bytes: 0,
788            kernel_params: core::ptr::null_mut(),
789            extra: core::ptr::null_mut(),
790            kern: core::ptr::null_mut(),
791            ctx: core::ptr::null_mut(),
792        }
793    }
794}
795
/// `CUDA_MEMSET_NODE_PARAMS` — shape passed to `cuGraphAddMemsetNode`.
///
/// `#[repr(C)]`, so field order and types are part of the FFI contract.
/// The derived `Default` zeroes every field (including `dst`).
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_MEMSET_NODE_PARAMS {
    /// Destination device address.
    pub dst: CUdeviceptr,
    pub pitch: usize,
    pub value: core::ffi::c_uint,
    pub element_size: core::ffi::c_uint,
    pub width: usize,
    pub height: usize,
}
808
/// Signature of a host callback for `cuGraphAddHostNode` /
/// `cuLaunchHostFunc`; `None` stands for "no callback".
pub type CUhostFnRaw = Option<unsafe extern "C" fn(user_data: *mut c_void)>;

/// `CUDA_HOST_NODE_PARAMS` — the `{ fn, user_data }` pair taken by
/// `cuGraphAddHostNode`.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct CUDA_HOST_NODE_PARAMS {
    pub fn_: CUhostFnRaw,
    pub user_data: *mut c_void,
}

impl Default for CUDA_HOST_NODE_PARAMS {
    /// No callback and a null `user_data` pointer.
    fn default() -> Self {
        let no_callback: CUhostFnRaw = None;
        CUDA_HOST_NODE_PARAMS {
            fn_: no_callback,
            user_data: core::ptr::null_mut::<c_void>(),
        }
    }
}
829
/// `CUtensorMap` — 128-byte opaque Hopper TMA descriptor, 64-byte aligned.
/// Created via `cuTensorMapEncodeTiled` / `cuTensorMapEncodeIm2col`;
/// consumed by TMA instructions in SM 9.0+ kernels.
#[repr(C, align(64))]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct CUtensorMap {
    pub opaque: [u64; 16],
}

#[allow(clippy::derivable_impls)]
impl Default for CUtensorMap {
    /// A descriptor with all sixteen words zeroed.
    fn default() -> Self {
        CUtensorMap { opaque: [0u64; 16] }
    }
}
846
/// `CUtensorMapDataType` — element-type encoding for TMA descriptors.
/// Values are consecutive from 0 through 12 (`0xC`).
#[allow(non_snake_case)]
pub mod CUtensorMapDataType {
    pub const UINT8: i32 = 0x0;
    pub const UINT16: i32 = 0x1;
    pub const UINT32: i32 = 0x2;
    pub const INT32: i32 = 0x3;
    pub const UINT64: i32 = 0x4;
    pub const INT64: i32 = 0x5;
    pub const FLOAT16: i32 = 0x6;
    pub const FLOAT32: i32 = 0x7;
    pub const FLOAT64: i32 = 0x8;
    pub const BFLOAT16: i32 = 0x9;
    pub const FLOAT32_FTZ: i32 = 0xA;
    pub const TFLOAT32: i32 = 0xB;
    pub const TFLOAT32_FTZ: i32 = 0xC;
}
864
/// `CUtensorMapInterleave` — interleave selector, consecutive from zero.
#[allow(non_snake_case)]
pub mod CUtensorMapInterleave {
    pub const NONE: i32 = 0x0;
    pub const INTERLEAVE_16B: i32 = 0x1;
    pub const INTERLEAVE_32B: i32 = 0x2;
}
872
/// `CUtensorMapSwizzle` — swizzle selector, consecutive from zero.
#[allow(non_snake_case)]
pub mod CUtensorMapSwizzle {
    pub const NONE: i32 = 0x0;
    pub const SWIZZLE_32B: i32 = 0x1;
    pub const SWIZZLE_64B: i32 = 0x2;
    pub const SWIZZLE_128B: i32 = 0x3;
}
881
/// `CUtensorMapL2promotion` — L2 prefetch hint, consecutive from zero.
#[allow(non_snake_case)]
pub mod CUtensorMapL2promotion {
    pub const NONE: i32 = 0x0;
    pub const L2_64B: i32 = 0x1;
    pub const L2_128B: i32 = 0x2;
    pub const L2_256B: i32 = 0x3;
}
890
/// `CUtensorMapFloatOOBfill` — selects the out-of-bounds fill behavior.
#[allow(non_snake_case)]
pub mod CUtensorMapFloatOOBfill {
    pub const NONE: i32 = 0x0;
    pub const NAN_REQUEST_ZERO_FMA: i32 = 0x1;
}
897
898// ---- Wave 20: IPC -------------------------------------------------------
899
/// `CUipcEventHandle` — opaque 64-byte cookie used to share a CUevent
/// across processes (Linux; Windows returns NOT_SUPPORTED).
#[repr(C)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct CUipcEventHandle {
    pub reserved: [core::ffi::c_char; 64],
}

impl Default for CUipcEventHandle {
    /// A handle with all 64 bytes zeroed.
    fn default() -> Self {
        let zero: core::ffi::c_char = 0;
        CUipcEventHandle { reserved: [zero; 64] }
    }
}
914
/// `CUipcMemHandle` — opaque 64-byte cookie used to share a device
/// allocation across processes.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct CUipcMemHandle {
    pub reserved: [core::ffi::c_char; 64],
}

impl Default for CUipcMemHandle {
    /// A handle with all 64 bytes zeroed.
    fn default() -> Self {
        let zero: core::ffi::c_char = 0;
        CUipcMemHandle { reserved: [zero; 64] }
    }
}
929
930// ---- Wave 19: conditional + switch graph nodes --------------------------
931
/// Handle referenced by conditional graph nodes (CUDA 12.3+); a plain
/// 64-bit integer rather than a pointer.
pub type CUgraphConditionalHandle = u64;
934
/// Values of the C enum `CUgraphConditionalNodeType`: the control-flow
/// shape of a conditional graph node.
#[allow(non_snake_case)]
pub mod CUgraphConditionalNodeType {
    /// `if`-style conditional node.
    pub const IF: i32 = 0x0;
    /// `while`-style conditional node.
    pub const WHILE: i32 = 0x1;
    /// `switch`-style conditional node.
    pub const SWITCH: i32 = 0x2;
}
942
943/// `CUDA_CONDITIONAL_NODE_PARAMS` — parameters for a conditional-node
944/// variant inside [`CUgraphNodeParams`].
945#[repr(C)]
946#[derive(Copy, Clone, Debug)]
947#[allow(non_camel_case_types)]
948pub struct CUDA_CONDITIONAL_NODE_PARAMS {
949    pub handle: CUgraphConditionalHandle,
950    pub type_: core::ffi::c_int,
951    pub size: core::ffi::c_uint,
952    pub body_graph_out: *mut CUgraph,
953    pub ctx: CUcontext,
954}
955
956impl Default for CUDA_CONDITIONAL_NODE_PARAMS {
957    fn default() -> Self {
958        Self {
959            handle: 0,
960            type_: CUgraphConditionalNodeType::IF,
961            size: 1,
962            body_graph_out: core::ptr::null_mut(),
963            ctx: core::ptr::null_mut(),
964        }
965    }
966}
967
968/// `CUgraphNodeParams` — generic node-params tagged union. We model the
969/// payload as an opaque `[u64; 30]` (large enough to hold any variant)
970/// plus the discriminant and tail. Safe wrappers populate the payload via
971/// typed helpers; raw users can cast through `payload.as_mut_ptr()`.
972///
973/// Layout (CUDA 13.x):
974/// ```text
975/// struct CUgraphNodeParams {
976///     CUgraphNodeType type;     // 4
977///     int reserved0[3];         // 12
978///     union { ... } payload;    // 232 bytes (29 × c_longlong)
979///     long long reserved2;      // 8
980/// };
981/// ```
982/// Total 256 bytes, alignment 8.
983#[repr(C)]
984#[derive(Copy, Clone)]
985#[allow(non_camel_case_types)]
986pub struct CUgraphNodeParams {
987    pub type_: core::ffi::c_int,
988    pub reserved0: [core::ffi::c_int; 3],
989    pub payload: [u64; 29],
990    pub reserved2: core::ffi::c_longlong,
991}
992
993impl Default for CUgraphNodeParams {
994    fn default() -> Self {
995        Self {
996            type_: CUgraphNodeType::EMPTY,
997            reserved0: [0; 3],
998            payload: [0; 29],
999            reserved2: 0,
1000        }
1001    }
1002}
1003
1004impl core::fmt::Debug for CUgraphNodeParams {
1005    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1006        f.debug_struct("CUgraphNodeParams")
1007            .field("type", &self.type_)
1008            .finish_non_exhaustive()
1009    }
1010}
1011
/// `CUgraphEdgeData` — optional per-edge metadata consumed by the v2
/// add-node / add-dependencies APIs. Four byte-sized fields, 8 bytes total.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUgraphEdgeData {
    /// Port on the upstream node.
    pub from_port: u8,
    /// Port on the downstream node.
    pub to_port: u8,
    /// Edge type code.
    pub type_: u8,
    /// Reserved bytes; zero by default.
    pub reserved: [u8; 5],
}
1023
/// `CUmulticastObjectProp` — creation properties handed to
/// `cuMulticastCreate`. CUDA 12.0+, NVSwitch-only.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUmulticastObjectProp {
    /// Number of devices taking part in the multicast object.
    pub num_devices: core::ffi::c_uint,
    /// Size of the multicast object.
    pub size: usize,
    /// Handle-type bitmask.
    pub handle_types: u64,
    /// Creation flags.
    pub flags: u64,
}
1035
/// Values of `CUmulticastGranularity_flags`, the selector passed to
/// `cuMulticastGetGranularity`.
#[allow(non_snake_case)]
pub mod CUmulticastGranularity_flags {
    /// Ask for the minimum granularity.
    pub const MINIMUM: i32 = 0x0;
    /// Ask for the recommended granularity.
    pub const RECOMMENDED: i32 = 0x1;
}
1042
/// Values of `CUdevResourceType`, the green-context resource-kind enum
/// (CUDA 12.4+).
#[allow(non_snake_case)]
pub mod CUdevResourceType {
    /// Not a valid resource.
    pub const INVALID: i32 = 0x0;
    /// Streaming-multiprocessor resource.
    pub const SM: i32 = 0x1;
}
1049
/// `CUdevSmResource` — SM-count payload of a device resource: three
/// `unsigned int` fields, 12 bytes in total.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUdevSmResource {
    /// Number of SMs in the resource.
    pub sm_count: core::ffi::c_uint,
    /// Minimum SM partition size.
    pub min_sm_partition_size: core::ffi::c_uint,
    /// SM co-scheduling alignment.
    pub sm_coscheduled_alignment: core::ffi::c_uint,
}
1059
1060/// `CUdevResource` — 144-byte resource descriptor. Tagged by `type_`;
1061/// the 48-byte union holds the variant-specific payload (`CUdevSmResource`
1062/// for `SM`). We model the union as a fixed `[u64; 6]` blob and provide
1063/// helpers for the SM case.
1064#[repr(C)]
1065#[derive(Copy, Clone)]
1066#[allow(non_camel_case_types)]
1067pub struct CUdevResource {
1068    pub type_: core::ffi::c_int,
1069    pub internal_padding: [core::ffi::c_uchar; 92],
1070    pub res: [u64; 6], // 48-byte union
1071}
1072
1073impl Default for CUdevResource {
1074    fn default() -> Self {
1075        Self {
1076            type_: CUdevResourceType::INVALID,
1077            internal_padding: [0u8; 92],
1078            res: [0u64; 6],
1079        }
1080    }
1081}
1082
1083impl core::fmt::Debug for CUdevResource {
1084    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1085        f.debug_struct("CUdevResource")
1086            .field("type", &self.type_)
1087            .finish_non_exhaustive()
1088    }
1089}
1090
1091impl CUdevResource {
1092    /// View the SM-specific payload. Only meaningful when `type_ == SM`.
1093    #[inline]
1094    pub fn as_sm(&self) -> CUdevSmResource {
1095        // SAFETY: res is 48 bytes 8-byte aligned; CUdevSmResource is 12
1096        // bytes 4-byte aligned — reading the first 12 bytes is well-defined.
1097        unsafe { core::ptr::read(self.res.as_ptr() as *const CUdevSmResource) }
1098    }
1099}
1100
/// Values of `CUgraphNodeType`, as reported by `cuGraphNodeGetType`.
#[allow(non_snake_case)]
pub mod CUgraphNodeType {
    /// Kernel-launch node.
    pub const KERNEL: i32 = 0x0;
    /// Memcpy node.
    pub const MEMCPY: i32 = 0x1;
    /// Memset node.
    pub const MEMSET: i32 = 0x2;
    /// Host-function node.
    pub const HOST: i32 = 0x3;
    /// Child-graph node.
    pub const GRAPH: i32 = 0x4;
    /// Empty node.
    pub const EMPTY: i32 = 0x5;
    /// Event-wait node.
    pub const WAIT_EVENT: i32 = 0x6;
    /// Event-record node.
    pub const EVENT_RECORD: i32 = 0x7;
    /// External-semaphore signal node.
    pub const EXT_SEMAS_SIGNAL: i32 = 0x8;
    /// External-semaphore wait node.
    pub const EXT_SEMAS_WAIT: i32 = 0x9;
    /// Memory-allocation node.
    pub const MEM_ALLOC: i32 = 0xa;
    /// Memory-free node.
    pub const MEM_FREE: i32 = 0xb;
    /// Batched memory-operation node.
    pub const BATCH_MEM_OP: i32 = 0xc;
    /// Conditional node.
    pub const CONDITIONAL: i32 = 0xd;
}
1119
/// Stream-capture mode (passed to `cuStreamBeginCapture`).
///
/// The mode does not select *which* streams are recorded. It controls
/// whether "potentially unsafe" driver API calls are prohibited on a
/// thread while capture sequences are in flight.
#[allow(non_snake_case)]
pub mod CUstreamCaptureMode {
    /// Strictest mode. Potentially unsafe API calls are prohibited on this
    /// thread while it has an ongoing capture that was not started as
    /// `RELAXED`, and also while any other thread has a concurrent capture
    /// that was started as `GLOBAL`.
    pub const GLOBAL: u32 = 0;
    /// Potentially unsafe API calls are prohibited only while *this*
    /// thread has an ongoing capture that was not started as `RELAXED`;
    /// concurrent capture sequences on other threads are ignored.
    pub const THREAD_LOCAL: u32 = 1;
    /// Permissive mode — this thread is not prohibited from potentially
    /// unsafe API calls. Calls that necessarily conflict with stream
    /// capture are still rejected.
    pub const RELAXED: u32 = 2;
}
1132
/// Capture state of a stream, as reported by `cuStreamIsCapturing`.
#[allow(non_snake_case)]
pub mod CUstreamCaptureStatus {
    /// The stream is not capturing.
    pub const NONE: u32 = 0x0;
    /// The stream is actively capturing.
    pub const ACTIVE: u32 = 0x1;
    /// The capture sequence on the stream has been invalidated.
    pub const INVALIDATED: u32 = 0x2;
}
1140
/// Bit flags accepted by `cuGraphInstantiateWithFlags`. Each constant is a
/// single bit, written as a shift to make that explicit.
#[allow(non_snake_case)]
pub mod CUgraphInstantiate_flags {
    /// Automatically free allocations created in the graph after launch
    /// completes.
    pub const AUTO_FREE_ON_LAUNCH: u64 = 1 << 0;
    /// Upload the executable graph to the device immediately on
    /// instantiate.
    pub const UPLOAD: u64 = 1 << 1;
    /// Use node priorities when scheduling.
    pub const USE_NODE_PRIORITY: u64 = 1 << 3;
}
1151
/// Selector values for device-attribute queries — a subset of the C enum
/// `CUdevice_attribute`, listed here in ascending numeric order.
#[allow(non_snake_case)]
pub mod CUdevice_attribute {
    // --- launch-geometry and per-block limits (1..=12) ---
    pub const MAX_THREADS_PER_BLOCK: i32 = 1;
    pub const MAX_BLOCK_DIM_X: i32 = 2;
    pub const MAX_BLOCK_DIM_Y: i32 = 3;
    pub const MAX_BLOCK_DIM_Z: i32 = 4;
    pub const MAX_GRID_DIM_X: i32 = 5;
    pub const MAX_GRID_DIM_Y: i32 = 6;
    pub const MAX_GRID_DIM_Z: i32 = 7;
    pub const MAX_SHARED_MEMORY_PER_BLOCK: i32 = 8;
    pub const TOTAL_CONSTANT_MEMORY: i32 = 9;
    pub const WARP_SIZE: i32 = 10;
    pub const MAX_PITCH: i32 = 11;
    pub const MAX_REGISTERS_PER_BLOCK: i32 = 12;

    // --- general device properties ---
    pub const CLOCK_RATE: i32 = 13;
    pub const TEXTURE_ALIGNMENT: i32 = 14;
    pub const MULTIPROCESSOR_COUNT: i32 = 16;
    pub const INTEGRATED: i32 = 18;
    pub const CONCURRENT_KERNELS: i32 = 31;
    pub const ECC_ENABLED: i32 = 32;

    // --- PCI location ---
    pub const PCI_BUS_ID: i32 = 33;
    pub const PCI_DEVICE_ID: i32 = 34;
    pub const PCI_DOMAIN_ID: i32 = 50;

    // --- compute capability ---
    pub const COMPUTE_CAPABILITY_MAJOR: i32 = 75;
    pub const COMPUTE_CAPABILITY_MINOR: i32 = 76;
}
1179
1180// ---- Wave 6: arrays, textures, surfaces ----------------------------------
1181
/// Values of `CUarray_format`: the scalar format of an array's texels.
#[allow(non_snake_case)]
pub mod CUarray_format {
    /// Unsigned 8-bit integer texels.
    pub const UNSIGNED_INT8: u32 = 1;
    /// Unsigned 16-bit integer texels.
    pub const UNSIGNED_INT16: u32 = 2;
    /// Unsigned 32-bit integer texels.
    pub const UNSIGNED_INT32: u32 = 3;
    /// Signed 8-bit integer texels.
    pub const SIGNED_INT8: u32 = 8;
    /// Signed 16-bit integer texels.
    pub const SIGNED_INT16: u32 = 9;
    /// Signed 32-bit integer texels.
    pub const SIGNED_INT32: u32 = 10;
    /// Half-precision float texels.
    pub const HALF: u32 = 16;
    /// Single-precision float texels.
    pub const FLOAT: u32 = 32;
}
1194
/// Values of `CUaddress_mode`: how texture sampling treats coordinates
/// that fall outside the texture.
#[allow(non_snake_case)]
pub mod CUaddress_mode {
    /// Wrap out-of-range coordinates around.
    pub const WRAP: u32 = 0x0;
    /// Clamp out-of-range coordinates to the edge.
    pub const CLAMP: u32 = 0x1;
    /// Mirror out-of-range coordinates.
    pub const MIRROR: u32 = 0x2;
    /// Use the border color for out-of-range coordinates.
    pub const BORDER: u32 = 0x3;
}
1203
/// Values of `CUfilter_mode`: point versus linear texture filtering.
#[allow(non_snake_case)]
pub mod CUfilter_mode {
    /// Point (nearest-texel) filtering.
    pub const POINT: u32 = 0x0;
    /// Linear filtering.
    pub const LINEAR: u32 = 0x1;
}
1210
/// Values of `CUresourcetype`: the tag that says which variant a
/// [`CUDA_RESOURCE_DESC`] holds.
#[allow(non_snake_case)]
pub mod CUresourcetype {
    /// CUDA array resource.
    pub const ARRAY: u32 = 0x0;
    /// Mipmapped CUDA array resource.
    pub const MIPMAPPED_ARRAY: u32 = 0x1;
    /// Linear-memory resource.
    pub const LINEAR: u32 = 0x2;
    /// Pitched 2D memory resource.
    pub const PITCH2D: u32 = 0x3;
}
1219
/// `CUDA_ARRAY_DESCRIPTOR` — the array shape handed to `cuArrayCreate_v2`.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUDA_ARRAY_DESCRIPTOR {
    /// Array width.
    pub width: usize,
    /// Array height.
    pub height: usize,
    /// Texel format; presumably one of the `CUarray_format` constants.
    pub format: u32,
    /// Number of channels per texel.
    pub num_channels: core::ffi::c_uint,
}
1230
1231/// `CUDA_RESOURCE_DESC` — untagged union of resource-type variants with a
1232/// leading discriminant.
1233///
1234/// The C layout is:
1235/// ```text
1236/// struct CUDA_RESOURCE_DESC {
1237///     CUresourcetype resType;    // c_int (4 bytes)
1238///     // 4 bytes padding (union is 8-byte aligned due to CUdeviceptr / pointers inside)
1239///     union { ... } res;         // 128 bytes, 8-byte aligned
1240///     unsigned int flags;        // 4 bytes
1241///     // 4 bytes tail padding to keep the struct 8-byte aligned overall
1242/// };
1243/// ```
1244/// Total size: 144 bytes. We model the union as `[u64; 16]` (128 bytes,
1245/// align 8) which reproduces the correct layout for free.
1246#[repr(C)]
1247#[derive(Copy, Clone)]
1248#[allow(non_camel_case_types)]
1249pub struct CUDA_RESOURCE_DESC {
1250    pub res_type: core::ffi::c_int,
1251    _pad0: u32,
1252    /// Variant-specific payload (128 bytes, `int reserved[32]` in `cuda.h`).
1253    pub res: [u64; 16],
1254    pub flags: core::ffi::c_uint,
1255    _pad1: u32,
1256}
1257
1258impl Default for CUDA_RESOURCE_DESC {
1259    fn default() -> Self {
1260        Self {
1261            res_type: CUresourcetype::ARRAY as core::ffi::c_int,
1262            _pad0: 0,
1263            res: [0u64; 16],
1264            flags: 0,
1265            _pad1: 0,
1266        }
1267    }
1268}
1269
1270impl core::fmt::Debug for CUDA_RESOURCE_DESC {
1271    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1272        f.debug_struct("CUDA_RESOURCE_DESC")
1273            .field("res_type", &self.res_type)
1274            .field("flags", &self.flags)
1275            .finish_non_exhaustive()
1276    }
1277}
1278
1279impl CUDA_RESOURCE_DESC {
1280    /// Point this descriptor at a [`CUarray`] (resource-type `ARRAY`). The
1281    /// pointer is only *stored* in the union — never dereferenced by us —
1282    /// so we do not require an `unsafe` boundary here.
1283    #[allow(clippy::not_unsafe_ptr_arg_deref)]
1284    pub fn from_array(array: CUarray) -> Self {
1285        let mut d = Self::default();
1286        // The `array` variant of the union is `struct { CUarray hArray; }`,
1287        // placed at offset 0 of the union (which is offset 8 of the outer
1288        // struct, after `res_type` + padding). [u64; 16] is 8-byte aligned,
1289        // so writing a pointer at `res[0]` is well-defined.
1290        //
1291        // SAFETY: `res` is 128 bytes and 8-byte aligned; `CUarray` is a
1292        // pointer (8 bytes), fits at offset 0.
1293        unsafe {
1294            let p = d.res.as_mut_ptr() as *mut CUarray;
1295            p.write(array);
1296        }
1297        d
1298    }
1299}
1300
1301/// `CUDA_TEXTURE_DESC` — texture-sampling parameters.
1302#[repr(C)]
1303#[derive(Copy, Clone, Debug)]
1304#[allow(non_camel_case_types)]
1305pub struct CUDA_TEXTURE_DESC {
1306    pub address_mode: [u32; 3],
1307    pub filter_mode: u32,
1308    pub flags: core::ffi::c_uint,
1309    pub max_anisotropy: core::ffi::c_uint,
1310    pub mipmap_filter_mode: u32,
1311    pub mipmap_level_bias: f32,
1312    pub min_mipmap_level_clamp: f32,
1313    pub max_mipmap_level_clamp: f32,
1314    pub border_color: [f32; 4],
1315    pub reserved: [core::ffi::c_int; 12],
1316}
1317
1318impl Default for CUDA_TEXTURE_DESC {
1319    fn default() -> Self {
1320        Self {
1321            address_mode: [CUaddress_mode::CLAMP; 3],
1322            filter_mode: CUfilter_mode::POINT,
1323            flags: 0,
1324            max_anisotropy: 0,
1325            mipmap_filter_mode: CUfilter_mode::POINT,
1326            mipmap_level_bias: 0.0,
1327            min_mipmap_level_clamp: 0.0,
1328            max_mipmap_level_clamp: 0.0,
1329            border_color: [0.0; 4],
1330            reserved: [0; 12],
1331        }
1332    }
1333}
1334
1335// ---- Wave 7: virtual memory management (VMM) ----------------------------
1336
/// Values of `CUmemAllocationType`: the kind of physical backing to
/// create.
#[allow(non_snake_case)]
pub mod CUmemAllocationType {
    /// Not a valid allocation type.
    pub const INVALID: i32 = 0x0;
    /// Pinned allocation.
    pub const PINNED: i32 = 0x1;
}
1343
/// Values of `CUmemLocationType`: which device or host the backing
/// memory lives on.
#[allow(non_snake_case)]
pub mod CUmemLocationType {
    /// Not a valid location.
    pub const INVALID: i32 = 0x0;
    /// A device.
    pub const DEVICE: i32 = 0x1;
    /// The host.
    pub const HOST: i32 = 0x2;
    /// A specific host NUMA node.
    pub const HOST_NUMA: i32 = 0x3;
    /// The current host NUMA node.
    pub const HOST_NUMA_CURRENT: i32 = 0x4;
}
1353
/// Values of `CUmemAllocationHandleType`: the OS-level handle shape used
/// for IPC sharing. Every non-zero value is a distinct single bit.
#[allow(non_snake_case)]
pub mod CUmemAllocationHandleType {
    /// No exportable handle.
    pub const NONE: i32 = 0;
    /// POSIX file descriptor.
    pub const POSIX_FILE_DESCRIPTOR: i32 = 1 << 0;
    /// Win32 handle.
    pub const WIN32: i32 = 1 << 1;
    /// Win32 KMT handle.
    pub const WIN32_KMT: i32 = 1 << 2;
    /// Fabric handle.
    pub const FABRIC: i32 = 1 << 3;
}
1363
/// Values of `CUmemAccess_flags`: the access rights that
/// `cuMemSetAccess` grants.
#[allow(non_snake_case)]
pub mod CUmemAccess_flags {
    /// No access.
    pub const NONE: i32 = 0x0;
    /// Read-only access.
    pub const READ: i32 = 0x1;
    /// Read and write access.
    pub const READWRITE: i32 = 0x3;
}
1371
/// Values of `CUmemAllocationGranularity_flags`, the selector passed to
/// `cuMemGetAllocationGranularity`.
#[allow(non_snake_case)]
pub mod CUmemAllocationGranularity_flags {
    /// Ask for the minimum granularity.
    pub const MINIMUM: i32 = 0x0;
    /// Ask for the recommended granularity.
    pub const RECOMMENDED: i32 = 0x1;
}
1379
/// `CUmemLocation` — a `(type, id)` pair naming a device or NUMA node.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUmemLocation {
    /// Location kind; presumably one of the `CUmemLocationType` constants.
    pub type_: core::ffi::c_int,
    /// Identifier within that kind (e.g. a device ordinal).
    pub id: core::ffi::c_int,
}
1388
/// The flag block embedded inside [`CUmemAllocationProp`]; 8 bytes.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Default)]
#[repr(C)]
pub struct CUmemAllocationPropFlags {
    /// Compression type.
    pub compression_type: core::ffi::c_uchar,
    /// Non-zero when the allocation should be GPUDirect-RDMA capable
    /// (inferred from the field name — confirm against `cuda.h`).
    pub gpu_direct_rdma_capable: core::ffi::c_uchar,
    /// Usage bits.
    pub usage: core::ffi::c_ushort,
    /// Reserved bytes; zero by default.
    pub reserved: [core::ffi::c_uchar; 4],
}
1399
1400/// `CUmemAllocationProp` — passed to `cuMemCreate` to describe what kind
1401/// of backing (type, location, IPC handle shape) to produce.
1402#[repr(C)]
1403#[derive(Copy, Clone, Debug, Default)]
1404#[allow(non_camel_case_types)]
1405pub struct CUmemAllocationProp {
1406    pub type_: core::ffi::c_int,
1407    pub requested_handle_types: core::ffi::c_int,
1408    pub location: CUmemLocation,
1409    pub win32_handle_meta_data: *mut c_void,
1410    pub alloc_flags: CUmemAllocationPropFlags,
1411}
1412
1413/// `CUmemAccessDesc` — passed to `cuMemSetAccess` to grant a device
1414/// `flags` access to a virtual-memory range.
1415#[repr(C)]
1416#[derive(Copy, Clone, Debug, Default)]
1417#[allow(non_camel_case_types)]
1418pub struct CUmemAccessDesc {
1419    pub location: CUmemLocation,
1420    pub flags: core::ffi::c_int,
1421}
1422
1423// ---- Wave 8: memory pools -----------------------------------------------
1424
/// Values of `CUmemPool_attribute`, the selector passed to
/// `cuMemPoolSetAttribute` / `GetAttribute`.
#[allow(non_snake_case)]
pub mod CUmemPool_attribute {
    // --- reuse policies ---
    pub const REUSE_FOLLOW_EVENT_DEPENDENCIES: i32 = 0x1;
    pub const REUSE_ALLOW_OPPORTUNISTIC: i32 = 0x2;
    pub const REUSE_ALLOW_INTERNAL_DEPENDENCIES: i32 = 0x3;
    // --- release threshold ---
    pub const RELEASE_THRESHOLD: i32 = 0x4;
    // --- memory statistics: current value and high-water mark ---
    pub const RESERVED_MEM_CURRENT: i32 = 0x5;
    pub const RESERVED_MEM_HIGH: i32 = 0x6;
    pub const USED_MEM_CURRENT: i32 = 0x7;
    pub const USED_MEM_HIGH: i32 = 0x8;
}
1437
1438/// `CUmemPoolProps` — creation props for `cuMemPoolCreate`. 88 bytes in C.
1439#[repr(C)]
1440#[derive(Copy, Clone, Debug)]
1441#[allow(non_camel_case_types)]
1442pub struct CUmemPoolProps {
1443    pub alloc_type: core::ffi::c_int,
1444    pub handle_types: core::ffi::c_int,
1445    pub location: CUmemLocation,
1446    pub win32_security_attributes: *mut c_void,
1447    pub max_size: usize,
1448    pub usage: core::ffi::c_ushort,
1449    pub reserved: [core::ffi::c_uchar; 54],
1450}
1451
1452impl Default for CUmemPoolProps {
1453    fn default() -> Self {
1454        Self {
1455            alloc_type: CUmemAllocationType::PINNED,
1456            handle_types: CUmemAllocationHandleType::NONE,
1457            location: CUmemLocation::default(),
1458            win32_security_attributes: core::ptr::null_mut(),
1459            max_size: 0,
1460            usage: 0,
1461            reserved: [0u8; 54],
1462        }
1463    }
1464}
1465
/// `CUmemPoolPtrExportData` — opaque 64-byte blob produced by
/// `cuMemPoolExportPointer`.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct CUmemPoolPtrExportData {
    /// Raw export bytes; never interpreted on the Rust side.
    pub reserved: [core::ffi::c_uchar; 64],
}

impl Default for CUmemPoolPtrExportData {
    /// Produces a blob whose 64 bytes are all zero.
    fn default() -> Self {
        let reserved = [0; 64];
        Self { reserved }
    }
}
1482
1483// ---- Wave 9: external memory / semaphore interop ------------------------
1484
/// Values of `CUexternalMemoryHandleType`: the OS handle shape being
/// imported as external memory.
#[allow(non_snake_case)]
pub mod CUexternalMemoryHandleType {
    // --- opaque OS handles ---
    pub const OPAQUE_FD: i32 = 0x1;
    pub const OPAQUE_WIN32: i32 = 0x2;
    pub const OPAQUE_WIN32_KMT: i32 = 0x3;
    // --- Direct3D resources ---
    pub const D3D12_HEAP: i32 = 0x4;
    pub const D3D12_RESOURCE: i32 = 0x5;
    pub const D3D11_RESOURCE: i32 = 0x6;
    pub const D3D11_RESOURCE_KMT: i32 = 0x7;
    // --- NvSci ---
    pub const NVSCIBUF: i32 = 0x8;
}
1497
/// Values of `CUexternalSemaphoreHandleType`: the OS handle shape of an
/// imported semaphore.
#[allow(non_snake_case)]
pub mod CUexternalSemaphoreHandleType {
    // --- opaque OS handles ---
    pub const OPAQUE_FD: i32 = 0x1;
    pub const OPAQUE_WIN32: i32 = 0x2;
    pub const OPAQUE_WIN32_KMT: i32 = 0x3;
    // --- Direct3D fences ---
    pub const D3D12_FENCE: i32 = 0x4;
    pub const D3D11_FENCE: i32 = 0x5;
    // --- NvSci ---
    pub const NVSCISYNC: i32 = 0x6;
    // --- keyed mutexes ---
    pub const KEYED_MUTEX: i32 = 0x7;
    pub const KEYED_MUTEX_KMT: i32 = 0x8;
    // --- timeline semaphores ---
    pub const TIMELINE_SEMAPHORE_FD: i32 = 0x9;
    pub const TIMELINE_SEMAPHORE_WIN32: i32 = 0xa;
}
1512
1513// `CUexternalMemory` and `CUexternalSemaphore` are declared near the top
1514// of this module alongside the other opaque handles.
1515
/// Handle to a mipmapped CUDA array (CUDA 5+); an opaque raw pointer.
pub type CUmipmappedArray = *mut c_void;

/// Handle to a user object (CUDA 12.0+): a refcounted RAII slot used to
/// attach external resources to CUDA graphs. Opaque raw pointer.
pub type CUuserObject = *mut c_void;

/// Handle to a registered graphics resource (GL buffer, D3D resource,
/// VDPAU surface, EGL image, ...) as used by the `cuGraphics*` API family.
/// Opaque raw pointer.
pub type CUgraphicsResource = *mut c_void;

/// Handle identifying a CUDA Logs callback registration (CUDA 12.9+).
/// Opaque raw pointer.
pub type CUlogsCallbackHandle = *mut c_void;

/// `CUlogIterator` — a 32-bit cursor into the driver's in-memory log
/// ring; an integer, not a pointer.
pub type CUlogIterator = core::ffi::c_uint;
1532
/// Values of `CUlogLevel`, the severity handed to a [`CUlogsCallback`].
#[allow(non_snake_case)]
pub mod CUlogLevel {
    /// Error message.
    pub const ERROR: i32 = 0x0;
    /// Warning message.
    pub const WARNING: i32 = 0x1;
    /// Informational message.
    pub const INFO: i32 = 0x2;
    /// Trace message.
    pub const TRACE: i32 = 0x3;
}
1541
/// `CUlogsCallback` — the nullable C function pointer handed to
/// `cuLogsRegisterCallback`. `None` stands for a null pointer.
///
/// Arguments, in order: the user `data` pointer, the `log_level`, a pointer
/// to the `message` text, and its length `len`.
pub type CUlogsCallback = Option<
    unsafe extern "C" fn(
        data: *mut c_void,
        log_level: core::ffi::c_int,
        message: *const core::ffi::c_char,
        len: core::ffi::c_uint,
    ),
>;
1551
/// Values of `CUmoduleLoadingMode`, as reported by
/// `cuModuleGetLoadingMode`.
#[allow(non_snake_case)]
pub mod CUmoduleLoadingMode {
    /// Eager loading.
    pub const EAGER_LOADING: i32 = 1;
    /// Lazy loading.
    pub const LAZY_LOADING: i32 = 2;
}
1558
1559// ---- Wave 24: graph memory nodes + graph-exec update --------------------
1560
/// Values of `CUgraphExecUpdateResult`: the outcome code that
/// `cuGraphExecUpdate_v2` reports through
/// [`CUgraphExecUpdateResultInfo::result`].
#[allow(non_snake_case)]
pub mod CUgraphExecUpdateResult {
    /// The update succeeded.
    pub const SUCCESS: i32 = 0x0;
    /// Generic failure.
    pub const ERROR: i32 = 0x1;
    /// The graph topology changed.
    pub const ERROR_TOPOLOGY_CHANGED: i32 = 0x2;
    /// A node's type changed.
    pub const ERROR_NODE_TYPE_CHANGED: i32 = 0x3;
    /// A node's function changed.
    pub const ERROR_FUNCTION_CHANGED: i32 = 0x4;
    /// A node's parameters changed.
    pub const ERROR_PARAMETERS_CHANGED: i32 = 0x5;
    /// The update is not supported.
    pub const ERROR_NOT_SUPPORTED: i32 = 0x6;
    /// The function changed in an unsupported way.
    pub const ERROR_UNSUPPORTED_FUNCTION_CHANGE: i32 = 0x7;
    /// A node's attributes changed.
    pub const ERROR_ATTRIBUTES_CHANGED: i32 = 0x8;
}
1575
1576/// `CUgraphExecUpdateResultInfo` — filled by `cuGraphExecUpdate_v2`
1577/// on (partial) failure to identify which node diverged.
1578#[repr(C)]
1579#[derive(Copy, Clone, Debug)]
1580#[allow(non_camel_case_types)]
1581pub struct CUgraphExecUpdateResultInfo {
1582    pub result: core::ffi::c_int,
1583    pub error_node: CUgraphNode,
1584    pub error_from_node: CUgraphNode,
1585}
1586
1587impl Default for CUgraphExecUpdateResultInfo {
1588    fn default() -> Self {
1589        Self {
1590            result: CUgraphExecUpdateResult::SUCCESS,
1591            error_node: core::ptr::null_mut(),
1592            error_from_node: core::ptr::null_mut(),
1593        }
1594    }
1595}
1596
/// Values of `CUgraphMem_attribute`: the selector for per-device
/// graph-memory limits.
#[allow(non_snake_case)]
pub mod CUgraphMem_attribute {
    /// Memory currently in use.
    pub const USED_MEM_CURRENT: i32 = 0x0;
    /// High-water mark of memory in use.
    pub const USED_MEM_HIGH: i32 = 0x1;
    /// Memory currently reserved.
    pub const RESERVED_MEM_CURRENT: i32 = 0x2;
    /// High-water mark of reserved memory.
    pub const RESERVED_MEM_HIGH: i32 = 0x3;
}
1605
1606/// `CUDA_MEM_ALLOC_NODE_PARAMS` — description passed to
1607/// `cuGraphAddMemAllocNode`. `dptr` is written by CUDA on successful add
1608/// (it's the address the node will allocate when the graph runs).
1609#[repr(C)]
1610#[derive(Copy, Clone, Debug)]
1611#[allow(non_camel_case_types)]
1612pub struct CUDA_MEM_ALLOC_NODE_PARAMS {
1613    pub pool_props: CUmemPoolProps,
1614    pub access_descs: *const CUmemAccessDesc,
1615    pub access_desc_count: usize,
1616    pub bytesize: usize,
1617    pub dptr: CUdeviceptr,
1618}
1619
1620impl Default for CUDA_MEM_ALLOC_NODE_PARAMS {
1621    fn default() -> Self {
1622        Self {
1623            pool_props: CUmemPoolProps::default(),
1624            access_descs: core::ptr::null(),
1625            access_desc_count: 0,
1626            bytesize: 0,
1627            dptr: CUdeviceptr(0),
1628        }
1629    }
1630}
1631
/// Values of `CUstreamBatchMemOpType`: the operation code stored in a
/// batch-memop entry. Listed in ascending numeric order.
#[allow(non_snake_case)]
pub mod CUstreamBatchMemOpType {
    /// 32-bit wait-value operation.
    pub const WAIT_VALUE_32: u32 = 0x1;
    /// 32-bit write-value operation.
    pub const WRITE_VALUE_32: u32 = 0x2;
    /// Flush outstanding remote writes.
    pub const FLUSH_REMOTE_WRITES: u32 = 0x3;
    /// 64-bit wait-value operation.
    pub const WAIT_VALUE_64: u32 = 0x4;
    /// 64-bit write-value operation.
    pub const WRITE_VALUE_64: u32 = 0x5;
    /// Barrier operation.
    pub const BARRIER: u32 = 0x6;
}
1642
/// Values of `CUstreamWaitValue_flags`: the bitmask used by the
/// individual stream wait-value operations and their batch-memop
/// equivalents (the write side lives in `CUstreamWriteValue_flags`).
#[allow(non_snake_case)]
pub mod CUstreamWaitValue_flags {
    /// "Greater than or equal" comparison.
    pub const GEQ: u32 = 0;
    /// "Equal" comparison.
    pub const EQ: u32 = 1;
    /// Bitwise-AND comparison.
    pub const AND: u32 = 2;
    /// Bitwise-NOR comparison.
    pub const NOR: u32 = 3;
    /// Flush modifier; occupies bit 30 so it can be OR-ed with a
    /// comparison code.
    pub const FLUSH: u32 = 0x4000_0000;
}
1653
/// Values of `CUstreamWriteValue_flags`: modifiers for the stream
/// write-value operations and their batch-memop equivalents.
#[allow(non_snake_case)]
pub mod CUstreamWriteValue_flags {
    /// Default behavior.
    pub const DEFAULT: u32 = 0;
    /// Write without the memory barrier (per the flag name; see `cuda.h`
    /// for the exact ordering guarantees).
    pub const NO_MEMORY_BARRIER: u32 = 1;
}
1659
1660/// `CUstreamBatchMemOpParams` — 48-byte tagged-union entry in a batched
1661/// stream memory-op array. We model it as `[u64; 6]` and provide typed
1662/// builders (`wait_value_32`, `write_value_64`, ...).
1663#[repr(C)]
1664#[derive(Copy, Clone)]
1665#[allow(non_camel_case_types)]
1666pub struct CUstreamBatchMemOpParams {
1667    pub raw: [u64; 6],
1668}
1669
1670#[allow(clippy::derivable_impls)]
1671impl Default for CUstreamBatchMemOpParams {
1672    fn default() -> Self {
1673        Self { raw: [0; 6] }
1674    }
1675}
1676
1677impl core::fmt::Debug for CUstreamBatchMemOpParams {
1678    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1679        f.debug_struct("CUstreamBatchMemOpParams")
1680            .field("op", &(self.raw[0] as u32))
1681            .finish_non_exhaustive()
1682    }
1683}
1684
1685/// Fixed layout of a wait-value or write-value entry (32 bytes, with
1686/// padding the rest of the 48-byte union).
1687///
1688/// Field offsets:
1689/// - [0..4]   operation (u32)
1690/// - [4..8]   reserved / pad
1691/// - [8..16]  address (CUdeviceptr)
1692/// - [16..24] value / value64 (u32 or u64)
1693/// - [24..28] flags (u32)
1694/// - [28..32] pad
1695/// - [32..40] alias (CUdeviceptr)
1696/// - [40..48] pad
1697impl CUstreamBatchMemOpParams {
1698    /// Build a `WaitValue32` entry.
1699    pub fn wait_value_32(address: CUdeviceptr, value: u32, flags: u32) -> Self {
1700        let mut s = Self::default();
1701        unsafe {
1702            let p = s.raw.as_mut_ptr() as *mut u8;
1703            (p as *mut u32).write(CUstreamBatchMemOpType::WAIT_VALUE_32);
1704            (p.add(8) as *mut u64).write(address.0);
1705            (p.add(16) as *mut u32).write(value);
1706            (p.add(24) as *mut u32).write(flags);
1707        }
1708        s
1709    }
1710
1711    pub fn wait_value_64(address: CUdeviceptr, value: u64, flags: u32) -> Self {
1712        let mut s = Self::default();
1713        unsafe {
1714            let p = s.raw.as_mut_ptr() as *mut u8;
1715            (p as *mut u32).write(CUstreamBatchMemOpType::WAIT_VALUE_64);
1716            (p.add(8) as *mut u64).write(address.0);
1717            (p.add(16) as *mut u64).write(value);
1718            (p.add(24) as *mut u32).write(flags);
1719        }
1720        s
1721    }
1722
1723    pub fn write_value_32(address: CUdeviceptr, value: u32, flags: u32) -> Self {
1724        let mut s = Self::default();
1725        unsafe {
1726            let p = s.raw.as_mut_ptr() as *mut u8;
1727            (p as *mut u32).write(CUstreamBatchMemOpType::WRITE_VALUE_32);
1728            (p.add(8) as *mut u64).write(address.0);
1729            (p.add(16) as *mut u32).write(value);
1730            (p.add(24) as *mut u32).write(flags);
1731        }
1732        s
1733    }
1734
1735    pub fn write_value_64(address: CUdeviceptr, value: u64, flags: u32) -> Self {
1736        let mut s = Self::default();
1737        unsafe {
1738            let p = s.raw.as_mut_ptr() as *mut u8;
1739            (p as *mut u32).write(CUstreamBatchMemOpType::WRITE_VALUE_64);
1740            (p.add(8) as *mut u64).write(address.0);
1741            (p.add(16) as *mut u64).write(value);
1742            (p.add(24) as *mut u32).write(flags);
1743        }
1744        s
1745    }
1746}
1747
1748/// `CUDA_BATCH_MEM_OP_NODE_PARAMS` — the fields passed to
1749/// `cuGraphAddBatchMemOpNode`.
1750#[repr(C)]
1751#[derive(Copy, Clone, Debug)]
1752#[allow(non_camel_case_types)]
1753pub struct CUDA_BATCH_MEM_OP_NODE_PARAMS {
1754    pub ctx: CUcontext,
1755    pub count: core::ffi::c_uint,
1756    pub param_array: *mut CUstreamBatchMemOpParams,
1757    pub flags: core::ffi::c_uint,
1758}
1759
1760impl Default for CUDA_BATCH_MEM_OP_NODE_PARAMS {
1761    fn default() -> Self {
1762        Self {
1763            ctx: core::ptr::null_mut(),
1764            count: 0,
1765            param_array: core::ptr::null_mut(),
1766            flags: 0,
1767        }
1768    }
1769}
1770
/// Handle to a green context (CUDA 12.4+); an opaque raw pointer.
pub type CUgreenCtx = *mut c_void;

/// Handle to a device-resource descriptor: produced by
/// `cuDevResourceGenerateDesc` and consumed by `cuGreenCtxCreate`.
/// Opaque raw pointer.
pub type CUdevResourceDesc = *mut c_void;
1777
1778/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` — union-bearing import descriptor.
1779/// The 16-byte `handle` slot holds either an `int fd`, a pair of
1780/// `(HANDLE, LPCWSTR)` pointers, or an nvSciBuf object pointer.
1781/// See [`CUDA_EXTERNAL_MEMORY_HANDLE_DESC::from_win32_handle`] and
1782/// [`from_fd`](CUDA_EXTERNAL_MEMORY_HANDLE_DESC::from_fd) for the common cases.
1783#[repr(C)]
1784#[derive(Copy, Clone)]
1785#[allow(non_camel_case_types)]
1786pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
1787    pub type_: core::ffi::c_int,
1788    _pad0: u32,
1789    /// Union payload: max member is `{ HANDLE, LPCWSTR }` = 16 bytes, align 8.
1790    pub handle: [u64; 2],
1791    pub size: u64,
1792    pub flags: core::ffi::c_uint,
1793    pub reserved: [core::ffi::c_uint; 16],
1794}
1795
1796#[allow(clippy::derivable_impls)]
1797impl Default for CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
1798    fn default() -> Self {
1799        Self {
1800            type_: 0,
1801            _pad0: 0,
1802            handle: [0; 2],
1803            size: 0,
1804            flags: 0,
1805            reserved: [0; 16],
1806        }
1807    }
1808}
1809
1810impl core::fmt::Debug for CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
1811    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1812        f.debug_struct("CUDA_EXTERNAL_MEMORY_HANDLE_DESC")
1813            .field("type", &self.type_)
1814            .field("size", &self.size)
1815            .field("flags", &self.flags)
1816            .finish_non_exhaustive()
1817    }
1818}
1819
1820impl CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
1821    /// Import from a POSIX file descriptor (Linux / NvSci).
1822    pub fn from_fd(fd: core::ffi::c_int, size: u64) -> Self {
1823        let mut d = Self {
1824            type_: CUexternalMemoryHandleType::OPAQUE_FD,
1825            size,
1826            ..Default::default()
1827        };
1828        // handle.fd lives in the first 4 bytes of the union.
1829        let slot = d.handle.as_mut_ptr() as *mut core::ffi::c_int;
1830        unsafe { slot.write(fd) };
1831        d
1832    }
1833
1834    /// Import from a Windows NT HANDLE (or optional Unicode name for a
1835    /// named object). Leaves `name` as null when unused.
1836    ///
1837    /// # Safety
1838    ///
1839    /// `handle` and (if non-null) `name` must be live OS objects for the
1840    /// duration of the resulting `cuImportExternalMemory` call.
1841    pub unsafe fn from_win32_handle(
1842        type_: core::ffi::c_int,
1843        handle: *mut c_void,
1844        name: *const c_void,
1845        size: u64,
1846    ) -> Self {
1847        let mut d = Self {
1848            type_,
1849            size,
1850            ..Default::default()
1851        };
1852        // handle.win32 = { HANDLE, LPCWSTR } at offset 0 of the union.
1853        let p = d.handle.as_mut_ptr() as *mut [*mut c_void; 2];
1854        unsafe { p.write([handle, name as *mut c_void]) };
1855        d
1856    }
1857}
1858
/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` — offset + size subregion of an
/// imported external memory to expose as a device pointer.
///
/// `#[repr(C)]`: field order is part of the ABI. [`Default`] yields the
/// all-zero descriptor.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
    /// Start of the subregion, as an offset into the imported memory.
    pub offset: u64,
    /// Length of the subregion.
    pub size: u64,
    /// Mapping flags; `0` in the default descriptor.
    pub flags: core::ffi::c_uint,
    /// Reserved words; zero in the default descriptor.
    pub reserved: [core::ffi::c_uint; 16],
}
1870
/// `CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC` — same shape as
/// [`CUDA_EXTERNAL_MEMORY_HANDLE_DESC`] but without the `size` field.
///
/// The 16-byte `handle` slot is a C union; use
/// [`from_fd`](CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::from_fd) or
/// [`from_win32_handle`](CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::from_win32_handle)
/// to fill it. The derived [`Default`] is the all-zero descriptor.
#[repr(C)]
#[derive(Copy, Clone, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC {
    /// Handle kind (the C field is named `type`); identifies which member of
    /// the `handle` union has been filled in.
    pub type_: core::ffi::c_int,
    /// Explicit stand-in for the padding that places the 8-byte-aligned
    /// `handle` union at offset 8.
    _pad0: u32,
    /// Union payload: 16 bytes, align 8 (an `int fd`, or a
    /// `{ HANDLE, LPCWSTR }` pointer pair, at offset 0).
    pub handle: [u64; 2],
    /// Import flags; `0` in the default descriptor.
    pub flags: core::ffi::c_uint,
    /// Reserved words; zero in the default descriptor.
    pub reserved: [core::ffi::c_uint; 16],
}

impl core::fmt::Debug for CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC {
    /// Prints only `type` and `flags`; the raw `handle` payload, padding and
    /// `reserved` words are elided as `..`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC");
        out.field("type", &self.type_);
        out.field("flags", &self.flags);
        out.finish_non_exhaustive()
    }
}

impl CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC {
    /// Builds an import descriptor of kind `type_` around a POSIX file
    /// descriptor.
    ///
    /// `fd` is written into the first bytes of the `handle` union; every
    /// other field keeps its zeroed [`Default`] value.
    pub fn from_fd(fd: core::ffi::c_int, type_: core::ffi::c_int) -> Self {
        let mut desc = Self {
            type_,
            ..Self::default()
        };
        let fd_slot = desc.handle.as_mut_ptr().cast::<core::ffi::c_int>();
        // SAFETY: `fd_slot` points at the start of `desc.handle`, a live,
        // exclusively borrowed `[u64; 2]` that is at least as large and as
        // strictly aligned as a `c_int`.
        unsafe { fd_slot.write(fd) };
        desc
    }

    /// Builds an import descriptor of kind `type_` from a Windows NT
    /// `HANDLE` and/or the name of a named object. Pass a null `name` when
    /// it is unused.
    ///
    /// The two pointers are written as a `{ HANDLE, LPCWSTR }` pair at
    /// offset 0 of the `handle` union; every other field keeps its zeroed
    /// [`Default`] value.
    ///
    /// # Safety
    ///
    /// `handle` and (if non-null) `name` must be live OS objects.
    pub unsafe fn from_win32_handle(
        type_: core::ffi::c_int,
        handle: *mut c_void,
        name: *const c_void,
    ) -> Self {
        let mut desc = Self {
            type_,
            ..Self::default()
        };
        let pair_slot = desc.handle.as_mut_ptr().cast::<[*mut c_void; 2]>();
        // SAFETY: `pair_slot` points at the start of `desc.handle`, a live,
        // exclusively borrowed `[u64; 2]`; two raw pointers are never larger
        // or more strictly aligned than two `u64`s.
        unsafe { pair_slot.write([handle, name as *mut c_void]) };
        desc
    }
}
1934
/// `CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` — which value to signal.
///
/// The nested `params` struct has three members laid out one after another
/// (fence / nvSciSync / keyedMutex) plus reserved words, totalling 72 bytes
/// (8 for value + 8 nvSci + 8 key + 48 reserved). It is stored here as nine
/// opaque `u64` words. The derived [`Default`] is all zeros.
#[repr(C)]
#[derive(Copy, Clone, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS {
    /// Layout:
    /// - [0..8]   fence.value (u64) — the fence value to signal.
    /// - [8..16]  nvSciSync.{fence|reserved} (pointer or u64).
    /// - [16..24] keyedMutex.key (u64).
    /// - [24..72] reserved[12] u32.
    pub params: [u64; 9],
    /// Signal flags; `0` in the default value.
    pub flags: core::ffi::c_uint,
    /// Reserved words; zero in the default value.
    pub reserved: [core::ffi::c_uint; 16],
}

impl core::fmt::Debug for CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS {
    /// Prints `params[0]` as `fence_value` plus `flags`; the remaining
    /// words are elided as `..`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS");
        out.field("fence_value", &self.params[0]);
        out.field("flags", &self.flags);
        out.finish_non_exhaustive()
    }
}

impl CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS {
    /// Signal-fence-value helper for D3D12/Vulkan timeline semaphores.
    ///
    /// Stores `value` in the `fence.value` slot (`params[0]`) and leaves
    /// everything else zeroed.
    pub fn fence_value(value: u64) -> Self {
        let mut params = [0u64; 9];
        params[0] = value;
        Self {
            params,
            ..Self::default()
        }
    }
}
1981
/// `CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` — same layout as
/// [`CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`] but `keyedMutex` has an extra
/// `timeoutMs: u32` (so the overall params size is the same 72 bytes, just
/// a different reserved count).
///
/// The nested `params` struct is stored here as nine opaque `u64` words.
/// The derived [`Default`] is all zeros.
#[repr(C)]
#[derive(Copy, Clone, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS {
    /// Layout (per `cuda.h`):
    /// - [0..8]   fence.value (u64) — the fence value to wait for.
    /// - [8..16]  nvSciSync.{fence|reserved} (pointer or u64).
    /// - [16..24] keyedMutex.key (u64).
    /// - [24..28] keyedMutex.timeoutMs (u32), followed by 4 bytes of padding.
    /// - [32..72] reserved[10] u32.
    pub params: [u64; 9],
    /// Wait flags; `0` in the default value.
    pub flags: core::ffi::c_uint,
    /// Reserved words; zero in the default value.
    pub reserved: [core::ffi::c_uint; 16],
}

impl core::fmt::Debug for CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS {
    /// Prints `params[0]` as `fence_value` plus `flags`; the remaining
    /// words are elided as `..`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS");
        out.field("fence_value", &self.params[0]);
        out.field("flags", &self.flags);
        out.finish_non_exhaustive()
    }
}

impl CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS {
    /// Wait-for-fence-value helper.
    ///
    /// Stores `value` in the `fence.value` slot (`params[0]`) and leaves
    /// everything else zeroed.
    pub fn fence_value(value: u64) -> Self {
        let mut params = [0u64; 9];
        params[0] = value;
        Self {
            params,
            ..Self::default()
        }
    }
}
2021
2022// ---- Wave 10: 3D memcpy + 3D arrays + mipmapped arrays ------------------
2023
/// `CUDA_ARRAY3D_DESCRIPTOR` — shape passed to `cuArray3DCreate_v2` and
/// `cuMipmappedArrayCreate`.
///
/// `#[repr(C)]`: field order is part of the ABI. [`Default`] yields the
/// all-zero descriptor.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(non_camel_case_types)]
pub struct CUDA_ARRAY3D_DESCRIPTOR {
    /// Extent of the array along its first dimension.
    pub width: usize,
    /// Extent of the array along its second dimension.
    pub height: usize,
    /// Extent of the array along its third dimension.
    pub depth: usize,
    /// Element format code (presumably a `CUarray_format` value — confirm
    /// against `cuda.h`).
    pub format: u32,
    /// Number of channels per element.
    pub num_channels: core::ffi::c_uint,
    /// Creation-time flags; see [`CUarray3D_flags`].
    pub flags: core::ffi::c_uint,
}
2037
/// `CUarray3D_flags` — creation-time flags for 3D / mipmapped arrays.
///
/// Every constant is a distinct single bit, written here as a shift so the
/// bit position is explicit.
#[allow(non_snake_case)]
pub mod CUarray3D_flags {
    /// Bit 0 (`0x01`).
    pub const LAYERED: u32 = 1 << 0;
    /// Bit 1 (`0x02`).
    pub const SURFACE_LDST: u32 = 1 << 1;
    /// Bit 2 (`0x04`).
    pub const CUBEMAP: u32 = 1 << 2;
    /// Bit 3 (`0x08`).
    pub const TEXTURE_GATHER: u32 = 1 << 3;
    /// Bit 4 (`0x10`).
    pub const DEPTH_TEXTURE: u32 = 1 << 4;
    /// Bit 5 (`0x20`).
    pub const COLOR_ATTACHMENT: u32 = 1 << 5;
    /// Bit 6 (`0x40`).
    pub const SPARSE: u32 = 1 << 6;
    /// Bit 7 (`0x80`).
    pub const DEFERRED_MAPPING: u32 = 1 << 7;
}
2050
/// `CUDA_MEMCPY3D` — 3-D memcpy descriptor. 200 bytes.
///
/// Populate `src_*` and `dst_*` fields according to each side's
/// [`CUmemorytype`]: `HOST` uses the host-pointer fields, `DEVICE` uses
/// the device-pointer + pitch, `ARRAY` uses the array-handle fields.
///
/// The struct is `#[repr(C)]`, so field order and the two private padding
/// words are part of the ABI and must not be rearranged.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[allow(non_camel_case_types)]
pub struct CUDA_MEMCPY3D {
    // ---- Source side ----
    pub src_x_in_bytes: usize,
    pub src_y: usize,
    pub src_z: usize,
    pub src_lod: usize,
    /// Memory kind of the source; selects which of the `src_*` location
    /// fields below is meaningful.
    pub src_memory_type: u32,
    /// Explicit padding so the following pointer sits on an 8-byte boundary.
    _pad0: u32,
    /// Source host pointer (used for `HOST`).
    pub src_host: *const c_void,
    /// Source device address (used for `DEVICE`).
    pub src_device: CUdeviceptr,
    /// Source array handle (used for `ARRAY`).
    pub src_array: *mut c_void,
    /// Reserved pointer-sized slot in the C layout.
    pub reserved0: *mut c_void,
    pub src_pitch: usize,
    pub src_height: usize,

    // ---- Destination side (same shape as the source side) ----
    pub dst_x_in_bytes: usize,
    pub dst_y: usize,
    pub dst_z: usize,
    pub dst_lod: usize,
    /// Memory kind of the destination; selects which of the `dst_*`
    /// location fields below is meaningful.
    pub dst_memory_type: u32,
    /// Explicit padding so the following pointer sits on an 8-byte boundary.
    _pad1: u32,
    /// Destination host pointer (used for `HOST`).
    pub dst_host: *mut c_void,
    /// Destination device address (used for `DEVICE`).
    pub dst_device: CUdeviceptr,
    /// Destination array handle (used for `ARRAY`).
    pub dst_array: *mut c_void,
    /// Reserved pointer-sized slot in the C layout.
    pub reserved1: *mut c_void,
    pub dst_pitch: usize,
    pub dst_height: usize,

    // ---- Extent of the copy ----
    pub width_in_bytes: usize,
    pub height: usize,
    pub depth: usize,
}
2090
2091impl Default for CUDA_MEMCPY3D {
2092    fn default() -> Self {
2093        Self {
2094            src_x_in_bytes: 0,
2095            src_y: 0,
2096            src_z: 0,
2097            src_lod: 0,
2098            src_memory_type: 0,
2099            _pad0: 0,
2100            src_host: core::ptr::null(),
2101            src_device: CUdeviceptr(0),
2102            src_array: core::ptr::null_mut(),
2103            reserved0: core::ptr::null_mut(),
2104            src_pitch: 0,
2105            src_height: 0,
2106            dst_x_in_bytes: 0,
2107            dst_y: 0,
2108            dst_z: 0,
2109            dst_lod: 0,
2110            dst_memory_type: 0,
2111            _pad1: 0,
2112            dst_host: core::ptr::null_mut(),
2113            dst_device: CUdeviceptr(0),
2114            dst_array: core::ptr::null_mut(),
2115            reserved1: core::ptr::null_mut(),
2116            dst_pitch: 0,
2117            dst_height: 0,
2118            width_in_bytes: 0,
2119            height: 0,
2120            depth: 0,
2121        }
2122    }
2123}