// oxicuda_driver/ffi.rs

//! Raw CUDA Driver API FFI types, constants, and enums.
//!
//! This module provides the low-level type definitions that mirror the CUDA Driver API
//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
//! entry points.
//!
//! # Safety
//!
//! All pointer types in this module are raw pointers intended for FFI use.
//! They must only be used through the safe wrappers provided by higher-level
//! modules in `oxicuda-driver`.
use std::ffi::c_void;
use std::fmt;

// ---------------------------------------------------------------------------
// Core scalar type aliases
// ---------------------------------------------------------------------------

/// Return code from every CUDA Driver API call.
///
/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
/// error code. See the `CUDA_*` constants below for the full catalogue.
pub type CUresult = u32;

/// Ordinal identifier for a CUDA-capable device (0-based).
///
/// Mirrors the `CUdevice` typedef in `cuda.h` (a plain C `int`).
pub type CUdevice = i32;

/// Device-side pointer (64-bit address in GPU virtual memory).
///
/// Never dereference this on the host; it is only meaningful when passed
/// back to driver API entry points.
pub type CUdeviceptr = u64;

// ---------------------------------------------------------------------------
// Opaque handle helpers
// ---------------------------------------------------------------------------

/// Declares a `#[repr(transparent)]` newtype over `*mut c_void` that models an
/// opaque CUDA driver handle, together with the impls every handle shares:
/// `Debug` (type name plus pointer value), `Default` (null handle),
/// `is_null()`, and unsafe `Send`/`Sync`.
macro_rules! define_handle {
    ($(#[$attr:meta])* $handle:ident) => {
        $(#[$attr])*
        #[repr(transparent)]
        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
        pub struct $handle(pub *mut c_void);

        impl $handle {
            /// Returns `true` if the handle is null (uninitialised).
            #[inline]
            pub fn is_null(self) -> bool {
                self.0.is_null()
            }
        }

        impl Default for $handle {
            /// A null handle, representing "no object".
            fn default() -> Self {
                Self(std::ptr::null_mut())
            }
        }

        impl fmt::Debug for $handle {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Render as `Name(0x…)` so logs show which handle type it is.
                write!(f, "{}({:p})", stringify!($handle), self.0)
            }
        }

        // SAFETY: CUDA handles are opaque driver-side tokens; the driver API
        // allows them to be used from any thread provided callers apply the
        // synchronisation the driver requires.
        unsafe impl Send for $handle {}
        unsafe impl Sync for $handle {}
    };
}

// ---------------------------------------------------------------------------
// Handle types
// ---------------------------------------------------------------------------

define_handle! {
    /// Opaque handle to a CUDA context.
    CUcontext
}

define_handle! {
    /// Opaque handle to a loaded CUDA module (PTX / cubin).
    CUmodule
}

define_handle! {
    /// Opaque handle to a CUDA kernel function within a module.
    CUfunction
}

define_handle! {
    /// Opaque handle to a CUDA stream (command queue).
    CUstream
}

define_handle! {
    /// Opaque handle to a CUDA event (used for timing and synchronisation).
    CUevent
}

define_handle! {
    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
    CUmemoryPool
}

define_handle! {
    /// Opaque handle to a CUDA texture reference (legacy API).
    CUtexref
}

define_handle! {
    /// Opaque handle to a CUDA surface reference (legacy API).
    CUsurfref
}

define_handle! {
    /// Opaque handle to a CUDA texture object (modern bindless API).
    CUtexObject
}

define_handle! {
    /// Opaque handle to a CUDA surface object (modern bindless API).
    CUsurfObject
}

define_handle! {
    /// Opaque handle to a CUDA kernel (CUDA 12.8+ library-based kernels).
    ///
    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
    /// belongs to.
    CUkernel
}

define_handle! {
    /// Opaque handle to a CUDA library (CUDA 12.8+ JIT library API).
    ///
    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
    /// library that contains a given kernel.
    CUlibrary
}

define_handle! {
    /// Opaque handle to an NVLink multicast object (CUDA 12.8+).
    ///
    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
    /// functions to manage NVLink multicast memory regions across devices.
    CUmulticastObject
}

define_handle! {
    /// Opaque handle to a CUDA JIT linker state (`CUlinkState`).
    ///
    /// Created by `cuLinkCreate_v2`, populated by repeated calls to
    /// `cuLinkAddData_v2`, finalised by `cuLinkComplete`, and freed by
    /// `cuLinkDestroy`.
    CUlinkState
}

// =========================================================================
// CUmemGenericAllocationHandle — VMM allocation handle (CUDA 11.2+)
// =========================================================================

/// Opaque handle to a generic memory allocation managed by the CUDA virtual
/// memory management (VMM) APIs (`cuMemCreate`, `cuMemRelease`, `cuMemMap`).
///
/// Although the CUDA header types this as `unsigned long long`, it is an opaque
/// driver-side identifier and must not be interpreted as a numeric address.
pub type CUmemGenericAllocationHandle = u64;

// =========================================================================
// CUmemorytype — memory type identifiers
// =========================================================================

/// Memory type identifiers returned by pointer attribute queries.
///
/// Discriminants mirror the `CU_MEMORYTYPE_*` values in `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemorytype {
    /// Host (system) memory.
    Host = 1,
    /// Device (GPU) memory.
    Device = 2,
    /// Array memory (CUDA arrays / texture memory).
    Array = 3,
    /// Unified (managed) memory.
    Unified = 4,
}

// =========================================================================
// CUpointer_attribute — pointer attribute query keys
// =========================================================================

/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
///
/// Discriminants mirror the `CU_POINTER_ATTRIBUTE_*` values in `cuda.h`.
/// The gaps (5–7) correspond to attributes this crate does not expose
/// (`P2P_TOKENS`, `SYNC_MEMOPS`, `BUFFER_ID`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUpointer_attribute {
    /// Query the CUDA context associated with a pointer.
    Context = 1,
    /// Query the memory type (host / device / unified) of a pointer.
    MemoryType = 2,
    /// Query the device pointer corresponding to a host pointer.
    DevicePointer = 3,
    /// Query the host pointer corresponding to a device pointer.
    HostPointer = 4,
    /// Query whether the memory is managed (unified).
    ///
    /// Fixed: `CU_POINTER_ATTRIBUTE_IS_MANAGED` is 8 in `cuda.h` (the
    /// previous value, 9, would have queried the device ordinal instead).
    IsManaged = 8,
    /// Query the device ordinal for the pointer.
    ///
    /// Fixed: `CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL` is 9 in `cuda.h` (the
    /// previous value, 10, is `IS_LEGACY_CUDA_IPC_CAPABLE`).
    DeviceOrdinal = 9,
}

// =========================================================================
// CUlimit — context limit identifiers
// =========================================================================

/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
///
/// Discriminants mirror the `CU_LIMIT_*` values in `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUlimit {
    /// Stack size (bytes) for each GPU thread.
    StackSize = 0,
    /// Size (bytes) of the FIFO used by device-side `printf()`.
    PrintfFifoSize = 1,
    /// Size (bytes) of the heap used by `malloc()` on the device.
    MallocHeapSize = 2,
    /// Maximum nesting depth of a device runtime launch at which a thread
    /// may call `cudaDeviceSynchronize`.
    DevRuntimeSyncDepth = 3,
    /// Maximum number of outstanding device runtime launches.
    DevRuntimePendingLaunchCount = 4,
    /// L2 cache fetch granularity (bytes, hint only).
    MaxL2FetchGranularity = 5,
    /// Maximum persisting L2 cache size (bytes).
    PersistingL2CacheSize = 6,
}

// =========================================================================
// CUfunction_attribute — function attribute query keys
// =========================================================================

/// Function attribute identifiers passed to `cuFuncGetAttribute`.
///
/// Discriminants mirror the `CU_FUNC_ATTRIBUTE_*` values in `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUfunction_attribute {
    /// Maximum threads per block for this function.
    MaxThreadsPerBlock = 0,
    /// Statically allocated shared memory used by this function (bytes).
    SharedSizeBytes = 1,
    /// Size of user-allocated constant memory (bytes).
    ConstSizeBytes = 2,
    /// Size of local memory used by each thread (bytes).
    LocalSizeBytes = 3,
    /// Number of registers used by each thread.
    NumRegs = 4,
    /// PTX virtual architecture version the function was compiled for.
    PtxVersion = 5,
    /// Binary architecture version the function was compiled for.
    BinaryVersion = 6,
    /// Whether the function was compiled with the `ca` cache mode
    /// (`-Xptxas --dlcm=ca`). The previous doc text ("has been cached")
    /// was misleading.
    CacheModeCa = 7,
    /// Maximum dynamic shared memory size (bytes); settable.
    MaxDynamicSharedSizeBytes = 8,
    /// Preferred shared memory carve-out (percentage); settable.
    PreferredSharedMemoryCarveout = 9,
}

// =========================================================================
// CUresult constants — every documented CUDA Driver API error code
// =========================================================================

/// The API call returned with no errors.
pub const CUDA_SUCCESS: CUresult = 0;

/// One or more parameters passed to the API call are not acceptable.
pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;

/// The API call failed because it was unable to allocate enough memory.
pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;

/// The CUDA driver has not been initialised via `cuInit`.
pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;

/// The CUDA driver is shutting down.
pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;

/// Profiler is not initialised for this run.
pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;

/// (Deprecated) Profiler not started.
pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;

/// (Deprecated) Profiler already started.
pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;

/// (Deprecated) Profiler already stopped.
pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;

/// A stub library was loaded instead of the real driver.
pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;

/// The requested device is currently unavailable.
///
/// NOTE(review): the previous doc line ("device-side assert triggered")
/// describes `CUDA_ERROR_ASSERT` (710), not this code.
pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;

/// No CUDA-capable device is detected.
pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;

/// The device ordinal supplied is out of range.
pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;

/// The device does not have a valid licence.
pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;

/// The PTX or cubin image is invalid.
pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;

/// The supplied context is not valid.
pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;

/// (Deprecated) Context already current.
pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;

/// A map or register operation has failed.
pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;

/// An unmap or unregister operation has failed.
pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;

/// The specified array is currently mapped.
pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;

/// The resource is already mapped.
pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;

/// There is no kernel image available for execution on the device.
pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;

/// A resource has already been acquired.
pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;

/// The resource is not mapped.
pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;

/// A mapped resource is not available for access as an array.
pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;

/// A mapped resource is not available for access as a pointer.
pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;

/// An uncorrectable ECC error was detected.
pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;

/// The requested `CUlimit` is not supported by the device.
///
/// NOTE(review): the previous doc line ("PTX JIT limit reached") described
/// a different condition.
pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;

/// The context is already in use by a different thread.
pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;

/// Peer access is not supported across the given devices.
pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;

/// The PTX JIT compilation was disabled or the PTX is invalid.
pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;

/// Invalid graphics context.
pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;

/// An uncorrectable NVLink error was detected.
pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;

/// JIT compiler not found.
pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;

/// Unsupported PTX version.
pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;

/// JIT compilation disabled.
pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;

/// Unsupported exec-affinity type.
pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;

/// Unsupported device-side synchronisation on this device.
pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;

/// The requested source is invalid.
pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;

/// The named file was not found.
pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;

/// A shared-object symbol lookup failed.
pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;

/// The shared-object init function failed.
pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;

/// An OS call failed.
pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;

/// The supplied handle is invalid.
pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;

/// The requested resource is in an illegal state.
pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;

/// A loss-less compression buffer was detected while doing uncompressed access.
pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;

/// A named symbol was not found.
pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;

/// The operation is not ready (asynchronous).
pub const CUDA_ERROR_NOT_READY: CUresult = 600;

/// An illegal memory address was encountered.
pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;

/// The kernel launch uses too many resources (registers / shared memory).
pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;

/// The kernel launch exceeded the time-out enforced by the driver.
pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;

/// A launch did not occur on a compatible texturing mode.
pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;

/// Peer access already enabled.
pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;

/// Peer access has not been enabled.
pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;

/// The primary context has already been initialised.
pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;

/// The context is being destroyed.
pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;

/// A device-side `assert` triggered during kernel execution.
///
/// NOTE(review): the previous doc line ("64-bit device assertion") had no
/// basis in the CUDA documentation.
pub const CUDA_ERROR_ASSERT: CUresult = 710;

/// Hardware resources to enable peer access are exhausted.
pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;

/// The host-side memory region is already registered.
pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;

/// The host-side memory region is not registered.
pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;

/// Hardware stack overflow on the device.
pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;

/// Illegal instruction encountered on the device.
pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;

/// Misaligned address on the device.
pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;

/// Invalid address space.
pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;

/// Invalid program counter on the device.
pub const CUDA_ERROR_INVALID_PC: CUresult = 718;

/// The kernel launch failed.
pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;

/// Cooperative launch is too large for the device/kernel.
pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;

/// The API call is not permitted in the active context.
pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;

/// The API call is not supported by the current driver/device combination.
pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;

/// System not ready for CUDA operations.
pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;

/// System driver mismatch.
pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;

/// Forward-compatibility mode is not supported on this device.
///
/// NOTE(review): the previous doc line ("old-style context incompatible")
/// did not match the documented meaning of this code.
pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;

/// MPS connection failed.
pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;

/// MPS RPC failure.
pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;

/// MPS server not ready.
pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;

/// MPS maximum clients reached.
pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;

/// MPS maximum connections reached.
pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;

/// MPS client terminated.
pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;

/// CDP (CUDA dynamic parallelism) not supported.
pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;

/// CDP version mismatch.
pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;

/// Stream capture unsupported.
pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;

/// Stream capture invalidated.
pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;

/// Stream capture merge not permitted.
pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;

/// Stream capture unmatched.
pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;

/// Stream capture unjoined.
pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;

/// Stream capture isolation violation.
pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;

/// Implicit stream in graph capture.
pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;

/// Captured event error.
pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;

/// Stream capture wrong thread.
pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;

/// The async operation timed out.
pub const CUDA_ERROR_TIMEOUT: CUresult = 909;

/// The graph update failed.
pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;

/// External device error.
pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;

/// Invalid cluster size.
pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;

/// Function not loaded.
pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;

/// Invalid resource type.
pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;

/// Invalid resource configuration.
pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;

/// An unknown internal error occurred.
pub const CUDA_ERROR_UNKNOWN: CUresult = 999;

// =========================================================================
// CUdevice_attribute — device property query keys
// =========================================================================

/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
///
/// Discriminants are intended to mirror `CU_DEVICE_ATTRIBUTE_*` in `cuda.h`.
///
/// NOTE(review): several name/value pairings in the 52–80 range (e.g.
/// `MaxTexture1DMipmappedWidth2 = 52`, `MaxTextureCubemapWidth = 54`,
/// `StreamPrioritiesSupported = 80`) do not appear to match the numbering in
/// recent `cuda.h` headers — audit this enum against the exact CUDA version
/// being targeted before relying on those queries.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUdevice_attribute {
    /// Maximum number of threads per block.
    MaxThreadsPerBlock = 1,
    /// Maximum x-dimension of a block.
    MaxBlockDimX = 2,
    /// Maximum y-dimension of a block.
    MaxBlockDimY = 3,
    /// Maximum z-dimension of a block.
    MaxBlockDimZ = 4,
    /// Maximum x-dimension of a grid.
    MaxGridDimX = 5,
    /// Maximum y-dimension of a grid.
    MaxGridDimY = 6,
    /// Maximum z-dimension of a grid.
    MaxGridDimZ = 7,
    /// Maximum shared memory available per block (bytes).
    MaxSharedMemoryPerBlock = 8,
    /// Total amount of constant memory on the device (bytes).
    TotalConstantMemory = 9,
    /// Warp size in threads.
    WarpSize = 10,
    /// Maximum pitch allowed by memory copies (bytes).
    MaxPitch = 11,
    /// Maximum number of 32-bit registers per block.
    MaxRegistersPerBlock = 12,
    /// Peak clock frequency in kHz.
    ClockRate = 13,
    /// Alignment requirement for textures.
    TextureAlignment = 14,
    /// Device can possibly copy memory and execute a kernel concurrently.
    GpuOverlap = 15,
    /// Number of multiprocessors on the device.
    MultiprocessorCount = 16,
    /// Whether there is a run-time limit on kernels.
    KernelExecTimeout = 17,
    /// Device is integrated (shares host memory).
    Integrated = 18,
    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
    CanMapHostMemory = 19,
    /// Compute mode: default, exclusive, prohibited, etc.
    ComputeMode = 20,
    /// Maximum 1D texture width.
    MaxTexture1DWidth = 21,
    /// Maximum 2D texture width.
    MaxTexture2DWidth = 22,
    /// Maximum 2D texture height.
    MaxTexture2DHeight = 23,
    /// Maximum 3D texture width.
    MaxTexture3DWidth = 24,
    /// Maximum 3D texture height.
    MaxTexture3DHeight = 25,
    /// Maximum 3D texture depth.
    MaxTexture3DDepth = 26,
    /// Maximum 2D layered texture width.
    MaxTexture2DLayeredWidth = 27,
    /// Maximum 2D layered texture height.
    MaxTexture2DLayeredHeight = 28,
    /// Maximum layers in a 2D layered texture.
    MaxTexture2DLayeredLayers = 29,
    /// Alignment requirement for surfaces.
    SurfaceAlignment = 30,
    /// Device can execute multiple kernels concurrently.
    ConcurrentKernels = 31,
    /// Device has ECC support enabled.
    EccEnabled = 32,
    /// PCI bus ID of the device.
    PciBusId = 33,
    /// PCI device ID of the device.
    PciDeviceId = 34,
    /// Device is using TCC (Tesla Compute Cluster) driver model.
    TccDriver = 35,
    /// Peak memory clock frequency in kHz.
    MemoryClockRate = 36,
    /// Global memory bus width in bits.
    GlobalMemoryBusWidth = 37,
    /// Size of L2 cache in bytes.
    L2CacheSize = 38,
    /// Maximum resident threads per multiprocessor.
    MaxThreadsPerMultiprocessor = 39,
    /// Number of asynchronous engines.
    AsyncEngineCount = 40,
    /// Device shares a unified address space with the host.
    UnifiedAddressing = 41,
    /// Maximum 1D layered texture width.
    MaxTexture1DLayeredWidth = 42,
    /// Maximum layers in a 1D layered texture.
    MaxTexture1DLayeredLayers = 43,
    /// Maximum 2D texture gather width.
    MaxTexture2DGatherWidth = 44,
    /// Maximum 2D texture gather height.
    MaxTexture2DGatherHeight = 45,
    /// Alternate maximum 3D texture width.
    MaxTexture3DWidthAlt = 47,
    /// Alternate maximum 3D texture height.
    MaxTexture3DHeightAlt = 48,
    /// Alternate maximum 3D texture depth.
    MaxTexture3DDepthAlt = 49,
    /// PCI domain ID.
    PciDomainId = 50,
    /// Texture pitch alignment.
    TexturePitchAlignment = 51,
    /// Maximum 1D mipmapped texture width.
    ///
    /// NOTE(review): in `cuda.h`, value 52 is
    /// `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH` — verify.
    MaxTexture1DMipmappedWidth2 = 52,
    /// Maximum width for a cubemap texture — verify against `cuda.h` (see
    /// enum-level note).
    MaxTextureCubemapWidth = 54,
    /// Maximum width for a cubemap layered texture.
    MaxTextureCubemapLayeredWidth = 55,
    /// Maximum layers in a cubemap layered texture.
    MaxTextureCubemapLayeredLayers = 56,
    /// Maximum 1D surface width.
    MaxSurface1DWidth = 57,
    /// Maximum 2D surface width.
    MaxSurface2DWidth = 58,
    /// Maximum 2D surface height.
    MaxSurface2DHeight = 59,
    /// Maximum 3D surface width.
    MaxSurface3DWidth = 60,
    /// Maximum 3D surface height.
    MaxSurface3DHeight = 61,
    /// Maximum 3D surface depth.
    MaxSurface3DDepth = 62,
    /// Maximum cubemap surface width.
    MaxSurfaceCubemapWidth = 63,
    /// Maximum 1D layered surface width.
    MaxSurface1DLayeredWidth = 64,
    /// Maximum layers in a 1D layered surface.
    MaxSurface1DLayeredLayers = 65,
    /// Maximum 2D layered surface width.
    MaxSurface2DLayeredWidth = 66,
    /// Maximum 2D layered surface height.
    MaxSurface2DLayeredHeight = 67,
    /// Maximum layers in a 2D layered surface.
    MaxSurface2DLayeredLayers = 68,
    /// Maximum cubemap layered surface width.
    MaxSurfaceCubemapLayeredWidth = 69,
    /// Maximum layers in a cubemap layered surface.
    MaxSurfaceCubemapLayeredLayers = 70,
    /// Maximum 1D linear texture width (deprecated).
    MaxTexture1DLinearWidth = 71,
    /// Maximum 2D linear texture width.
    MaxTexture2DLinearWidth = 72,
    /// Maximum 2D linear texture height.
    MaxTexture2DLinearHeight = 73,
    /// Maximum 2D linear texture pitch (bytes).
    MaxTexture2DLinearPitch = 74,
    /// Major compute capability version number.
    ComputeCapabilityMajor = 75,
    /// Minor compute capability version number.
    ComputeCapabilityMinor = 76,
    /// Maximum mipmapped 2D texture width.
    MaxTexture2DMipmappedWidth = 77,
    /// Maximum mipmapped 2D texture height.
    MaxTexture2DMipmappedHeight = 78,
    /// Maximum mipmapped 1D texture width.
    MaxTexture1DMipmappedWidth = 79,
    /// Device supports stream priorities.
    StreamPrioritiesSupported = 80,
    /// Maximum shared memory per multiprocessor (bytes).
    MaxSharedMemoryPerMultiprocessor = 81,
    /// Maximum registers per multiprocessor.
    MaxRegistersPerMultiprocessor = 82,
    /// Device supports managed memory.
    ManagedMemory = 83,
    /// Device is on a multi-GPU board.
    IsMultiGpuBoard = 84,
    /// Unique identifier for the multi-GPU board group.
    MultiGpuBoardGroupId = 85,
    /// Link between host and device supports native atomic operations.
    HostNativeAtomicSupported = 86,
    /// Ratio of single-to-double precision performance.
    SingleToDoublePrecisionPerfRatio = 87,
    /// Device supports coherent pageable memory access.
    PageableMemoryAccess = 88,
    /// Device can coherently access managed memory concurrently with the CPU.
    ConcurrentManagedAccess = 89,
    /// Device supports compute preemption.
    ComputePreemptionSupported = 90,
    /// Device can use a host pointer for memory registered via
    /// `cuMemHostRegister` at the same virtual address.
    CanUseHostPointerForRegisteredMem = 91,
    /// Reserved attribute (CUDA internal, value 92).
    Reserved92 = 92,
    /// Reserved attribute (CUDA internal, value 93).
    Reserved93 = 93,
    /// Reserved attribute (CUDA internal, value 94).
    Reserved94 = 94,
    /// Device supports cooperative kernel launches.
    CooperativeLaunch = 95,
    /// Device supports cooperative kernel launches across multiple GPUs.
    CooperativeMultiDeviceLaunch = 96,
    /// Maximum opt-in shared memory per block.
    MaxSharedMemoryPerBlockOptin = 97,
    /// Device supports flushing of outstanding remote writes.
    CanFlushRemoteWrites = 98,
    /// Device supports host-side memory-register functions.
    HostRegisterSupported = 99,
    /// Device supports pageable memory access using host page tables.
    PageableMemoryAccessUsesHostPageTables = 100,
    /// Device supports direct access to managed memory on the host.
    DirectManagedMemAccessFromHost = 101,
    /// Device supports virtual memory management APIs.
    VirtualMemoryManagementSupported = 102,
    /// Device supports handle-type POSIX file descriptors for IPC.
    HandleTypePosixFileDescriptorSupported = 103,
    /// Device supports handle-type Win32 handles for IPC.
    HandleTypeWin32HandleSupported = 104,
    /// Device supports handle-type Win32 KMT handles for IPC.
    HandleTypeWin32KmtHandleSupported = 105,
    /// Maximum blocks per multiprocessor.
    MaxBlocksPerMultiprocessor = 106,
    /// Device supports generic compression for memory.
    GenericCompressionSupported = 107,
    /// Maximum persisting L2 cache size (bytes).
    MaxPersistingL2CacheSize = 108,
    /// Maximum access-policy window size for L2 cache.
    MaxAccessPolicyWindowSize = 109,
    /// Device supports GPUDirect RDMA with the CUDA VMM APIs.
    GpuDirectRdmaWithCudaVmmSupported = 110,
    /// NOTE(review): the original doc ("free/total memory via
    /// `cuMemGetInfo`") was unrelated; per the variant name this looks like
    /// an access-policy window-size attribute — confirm against `cuda.h`.
    AccessPolicyMaxWindowSize = 111,
    /// Shared memory per block reserved by the CUDA driver (bytes).
    ReservedSharedMemoryPerBlock = 112,
    /// Device supports timeline semaphore interop.
    TimelineSemaphoreInteropSupported = 113,
    /// Device supports memory pools (`cudaMallocAsync`).
    MemoryPoolsSupported = 115,
    /// GPUDirect RDMA is supported.
    GpuDirectRdmaSupported = 116,
    /// GPUDirect RDMA flush-writes options.
    GpuDirectRdmaFlushWritesOptions = 117,
    /// GPUDirect RDMA writes ordering.
    GpuDirectRdmaWritesOrdering = 118,
    /// Memory pool supported handle types.
    MemoryPoolSupportedHandleTypes = 119,
    /// Device supports cluster launch.
    ClusterLaunch = 120,
    /// Deferred mapping CUDA array supported.
    DeferredMappingCudaArraySupported = 121,
    /// Device supports IPC event handles.
    IpcEventSupported = 122,
    /// Number of memory-synchronisation domains on the device.
    MemSyncDomainCount = 123,
    /// Device supports tensor-map access to data.
    TensorMapAccessSupported = 124,
    /// Unified function pointers supported.
    UnifiedFunctionPointers = 125,
    /// NUMA configuration of the device.
    NumaConfig = 127,
    /// NUMA node ID of the device.
    NumaId = 128,
    /// Timeline-semaphore interop attribute.
    ///
    /// NOTE(review): the original doc comment here was garbled (it mixed
    /// "multicast supported" with cooperative-launch text); the meaning of
    /// value 129 should be confirmed against `cuda.h`.
    MaxTimelineSemaphoreInteropSupported = 129,
    /// Device supports memory sync domain operations.
    MemSyncDomainSupported = 130,
    /// Device supports GPUDirect RDMA over fabric.
    GpuDirectRdmaFabricSupported = 131,
    /// Device supports multicast.
    MulticastSupported = 132,
    /// Device is running under MPS.
    MpsEnabled = 133,
    /// Host-NUMA identifier.
    HostNumaId = 134,
}

// =========================================================================
// CUjit_option — options for the JIT compiler
// =========================================================================

/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
///
/// Mirrors `CUjit_option` in `cuda.h`; the discriminant values are unchanged.
/// Options are supplied to the driver as two parallel arrays (option keys and
/// option values) by `cuModuleLoadDataEx`, `cuLinkCreate` and `cuLinkAddData`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUjit_option {
    /// Maximum number of registers that a thread may use.
    MaxRegisters = 0,
    /// Number of threads per block for the JIT target.
    ThreadsPerBlock = 1,
    /// Wall-clock time (ms) for compilation.
    /// NOTE(review): in `cuda.h` this is an *output* option — the driver
    /// overwrites the value slot with a float — confirm callers treat it so.
    WallTime = 2,
    /// Pointer to a buffer for info log output.
    InfoLogBuffer = 3,
    /// Size (bytes) of the info-log buffer.
    InfoLogBufferSizeBytes = 4,
    /// Pointer to a buffer for error log output.
    ErrorLogBuffer = 5,
    /// Size (bytes) of the error-log buffer.
    ErrorLogBufferSizeBytes = 6,
    /// Optimisation level (0-4).
    OptimizationLevel = 7,
    /// Determines the target based on the current attached context.
    TargetFromCuContext = 8,
    /// Specific compute target (sm_XX).
    Target = 9,
    /// Fallback strategy when exact match is not found.
    FallbackStrategy = 10,
    /// Specifies whether to generate debug info.
    GenerateDebugInfo = 11,
    /// Generate verbose log messages.
    LogVerbose = 12,
    /// Generate line-number information.
    GenerateLineInfo = 13,
    /// Cache mode (on / off).
    CacheMode = 14,
    /// (Internal) New SM3X option.
    Sm3xOpt = 15,
    /// Fast compile flag.
    FastCompile = 16,
    /// Global symbol names.
    GlobalSymbolNames = 17,
    /// Global symbol addresses.
    GlobalSymbolAddresses = 18,
    /// Number of global symbols.
    GlobalSymbolCount = 19,
    /// LTO flag.
    Lto = 20,
    /// FTZ (flush-to-zero) flag.
    Ftz = 21,
    /// Prec-div flag.
    PrecDiv = 22,
    /// Prec-sqrt flag.
    PrecSqrt = 23,
    /// FMA flag.
    Fma = 24,
    /// Referenced kernel names.
    ReferencedKernelNames = 25,
    /// Referenced kernel count.
    ReferencedKernelCount = 26,
    /// Referenced variable names.
    ReferencedVariableNames = 27,
    /// Referenced variable count.
    ReferencedVariableCount = 28,
    /// Optimise unused device variables.
    OptimizeUnusedDeviceVariables = 29,
    /// Position-independent code.
    PositionIndependentCode = 30,
}
911
912// =========================================================================
913// CUjitInputType — input types for the linker
914// =========================================================================
915
/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
///
/// Mirrors `CUjitInputType` in `cuda.h`.  The discriminant values must match
/// the driver ABI exactly: the header starts at `CU_JIT_INPUT_CUBIN = 0`.
// NOTE: an earlier revision used 1-based values here, which disagreed with
// `cuda.h` and would make the driver misinterpret every input kind (e.g.
// `Cubin` was 2, which the driver reads as FATBINARY).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUjitInputType {
    /// Compiled device code (cubin); `CU_JIT_INPUT_CUBIN`.
    Cubin = 0,
    /// PTX source code; `CU_JIT_INPUT_PTX`.
    Ptx = 1,
    /// Fat binary bundle; `CU_JIT_INPUT_FATBINARY`.
    Fatbin = 2,
    /// Relocatable device object; `CU_JIT_INPUT_OBJECT`.
    Object = 3,
    /// Device code library; `CU_JIT_INPUT_LIBRARY`.
    Library = 4,
    /// NVVM intermediate representation; `CU_JIT_INPUT_NVVM`.
    Nvvm = 5,
}
932
933// =========================================================================
934// CUmemLocationType — location-type discriminant (CUDA 11.2+ VMM)
935// =========================================================================
936
/// Specifies the kind of location described by a [`CUmemLocation`].
///
/// Mirrors `CUmemLocationType` in `cuda.h`.  Used by the virtual-memory
/// management APIs to identify where a memory allocation resides or which
/// device should be granted access.
///
/// Marked `#[non_exhaustive]` because newer drivers may define additional
/// location types; unknown raw values are kept as `u32` in
/// [`CUmemLocation::loc_type`] rather than transmuted into this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemLocationType {
    /// Invalid / uninitialised location type.
    Invalid = 0,
    /// Location is a CUDA device (the `id` field is a device ordinal).
    Device = 1,
    /// Location is the host (CPU) memory.
    Host = 2,
    /// Location is a specific NUMA node on the host.
    HostNuma = 3,
    /// Location is the NUMA node currently bound to the calling thread.
    HostNumaCurrent = 4,
}
957
958// =========================================================================
959// CUmemAllocationType — allocation-kind discriminant (CUDA 11.2+ VMM)
960// =========================================================================
961
/// Type of memory allocation requested via the VMM APIs.
///
/// Mirrors `CUmemAllocationType` in `cuda.h`.  Currently only `Pinned`
/// requests a real allocation; `Invalid` is the zero default and `Max` is a
/// forward-compatibility sentinel defined by the header.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAllocationType {
    /// Invalid / uninitialised allocation type.
    Invalid = 0,
    /// Pinned (page-locked) GPU memory backed by physical device frames.
    Pinned = 1,
    /// Sentinel value used by the CUDA driver to mark forward-compatible
    /// extensions; always equal to the maximum 32-bit signed integer.
    Max = 0x7fff_ffff,
}
977
978// =========================================================================
979// CUmemAllocationHandleType — exportable handle bitfield (CUDA 11.2+ VMM)
980// =========================================================================
981
/// Set of operating-system handle types that the driver may export for a
/// VMM allocation.  Treated as a bitfield in the CUDA C API: the variants
/// are distinct powers of two so raw `u32` values may OR several together
/// (which is why structs store this field as `u32`, not as this enum).
///
/// Mirrors `CUmemAllocationHandleType` in `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAllocationHandleType {
    /// No exportable handle is requested.
    None = 0,
    /// POSIX file descriptor (Linux).
    PosixFileDescriptor = 1,
    /// Win32 NT handle.
    Win32 = 2,
    /// Win32 KMT handle (legacy kernel-mode-thunk).
    Win32Kmt = 4,
    /// Fabric handle for multi-host shared memory (CUDA 12.0+).
    Fabric = 8,
}
1001
1002// =========================================================================
1003// CUmemAccessFlags — peer-access permissions for VMM allocations
1004// =========================================================================
1005
/// Access flags applied via `cuMemSetAccess` to a VMM allocation, controlling
/// whether a particular [`CUmemLocation`] may read or write the mapping.
///
/// Mirrors `CUmemAccess_flags` in `cuda.h`.  Renamed to follow Rust naming
/// conventions; the discriminant values are unchanged.  Note `ReadWrite` is
/// `3` (read bit | write bit); the header defines no write-only flag.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAccessFlags {
    /// No access permitted from the location.
    None = 0,
    /// Read-only access permitted.
    Read = 1,
    /// Read-write access permitted.
    ReadWrite = 3,
    /// Sentinel value used by the CUDA driver for forward compatibility.
    Max = 0x7fff_ffff,
}
1024
1025// =========================================================================
1026// CUmemLocation — memory-location descriptor (CUDA 11.2+ VMM)
1027// =========================================================================
1028
/// Describes a physical memory location for the VMM and pool APIs.
///
/// Mirrors `CUmemLocation` in `cuda.h`.  The interpretation of `id` depends on
/// `loc_type`: for [`CUmemLocationType::Device`] it is a device ordinal, for
/// [`CUmemLocationType::HostNuma`] it is a NUMA node identifier, and for the
/// other variants it must be set to `0`.
///
/// The `loc_type` field is stored as a raw `u32` so that any forward-compatible
/// value emitted by a future driver can be round-tripped without UB; convert
/// to / from [`CUmemLocationType`] manually when interpreting it.
///
/// The derived `Default` is all-zero, i.e. `loc_type == 0`
/// ([`CUmemLocationType::Invalid`]) and `id == 0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(C)]
pub struct CUmemLocation {
    /// Location type; see [`CUmemLocationType`].
    pub loc_type: u32,
    /// Identifier whose meaning depends on `loc_type`.
    pub id: i32,
}
1047
1048// =========================================================================
1049// CUmemAllocationProp — properties of a VMM allocation request
1050// =========================================================================
1051
/// Properties passed to `cuMemCreate` to describe a new VMM allocation.
///
/// Mirrors `CUmemAllocationProp` in `cuda.h`; field order is part of the
/// `#[repr(C)]` ABI and must not change.
///
/// The `alloc_type`, `requested_handle_types` and `alloc_flags` fields are
/// stored as raw integers so that future driver extensions cannot trigger UB
/// via unknown discriminants; convert them to / from
/// [`CUmemAllocationType`] / [`CUmemAllocationHandleType`] when interpreting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUmemAllocationProp {
    /// Allocation type; see [`CUmemAllocationType`].
    pub alloc_type: u32,
    /// Bitfield of OS handle types to export; see
    /// [`CUmemAllocationHandleType`].
    pub requested_handle_types: u32,
    /// Physical location of the allocation.
    pub location: CUmemLocation,
    /// Win32 security attributes pointer; null on non-Windows platforms or
    /// when no specific security descriptor is required.
    pub win32_handle_meta_data: *mut c_void,
    /// Reserved for future allocation flags; must be `0` on current drivers.
    pub alloc_flags: u64,
}
1076
// SAFETY: The struct contains a raw pointer (`win32_handle_meta_data`) that
// callers are responsible for managing.  The CUDA driver treats the pointer
// as opaque, so the struct itself is logically Send+Sync.  (Raw pointers
// suppress the auto-impls, hence the explicit unsafe impls.)
unsafe impl Send for CUmemAllocationProp {}
unsafe impl Sync for CUmemAllocationProp {}
1082
1083impl Default for CUmemAllocationProp {
1084    fn default() -> Self {
1085        Self {
1086            alloc_type: 0,
1087            requested_handle_types: 0,
1088            location: CUmemLocation::default(),
1089            win32_handle_meta_data: std::ptr::null_mut(),
1090            alloc_flags: 0,
1091        }
1092    }
1093}
1094
1095// =========================================================================
1096// CUmemAccessDesc — per-location access permissions for `cuMemSetAccess`
1097// =========================================================================
1098
/// Per-location access descriptor for `cuMemSetAccess`.
///
/// Mirrors `CUmemAccessDesc` in `cuda.h`.  The `flags` field stores a
/// [`CUmemAccessFlags`] value as a raw `u32` for FFI safety.  The derived
/// `Default` is all-zero (invalid location, no access).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(C)]
pub struct CUmemAccessDesc {
    /// Memory location whose access permission is being changed.
    pub location: CUmemLocation,
    /// Access flags; see [`CUmemAccessFlags`].
    pub flags: u32,
}
1111
1112// =========================================================================
1113// CUmemPoolProps — properties of a stream-ordered memory pool
1114// =========================================================================
1115
/// Properties passed to `cuMemPoolCreate`.
///
/// Mirrors `CUmemPoolProps` in `cuda.h` (CUDA 12.2 layout, which includes
/// `max_size`).  The trailing `reserved` field is part of the public ABI: the
/// CUDA driver expects 56 zero bytes there to preserve forward compatibility.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUmemPoolProps {
    /// Allocation type to use when servicing pool requests; see
    /// [`CUmemAllocationType`].
    pub alloc_type: u32,
    /// Bitfield of OS handle types to export; see
    /// [`CUmemAllocationHandleType`].
    pub handle_types: u32,
    /// Physical location backing the pool.
    pub location: CUmemLocation,
    /// Win32 security-attributes pointer; null on non-Windows platforms or
    /// when no specific security descriptor is required.
    pub win32_security_attributes: *mut c_void,
    /// Maximum aggregate size (bytes) the pool may hold.  `0` means
    /// unlimited.
    pub max_size: usize,
    /// Reserved padding required by the CUDA ABI; must remain zeroed.
    pub reserved: [u8; 56],
}
1141
// SAFETY: The struct contains a raw pointer (`win32_security_attributes`) that
// callers are responsible for managing.  The CUDA driver treats the pointer
// as opaque, so the struct itself is logically Send+Sync.  (Raw pointers
// suppress the auto-impls, hence the explicit unsafe impls.)
unsafe impl Send for CUmemPoolProps {}
unsafe impl Sync for CUmemPoolProps {}
1147
1148impl Default for CUmemPoolProps {
1149    fn default() -> Self {
1150        Self {
1151            alloc_type: 0,
1152            handle_types: 0,
1153            location: CUmemLocation::default(),
1154            win32_security_attributes: std::ptr::null_mut(),
1155            max_size: 0,
1156            reserved: [0u8; 56],
1157        }
1158    }
1159}
1160
1161// =========================================================================
1162// CUDA_MEMCPY2D — descriptor for `cuMemcpy2D_v2`
1163// =========================================================================
1164
/// Descriptor for a 2-D memory copy executed via `cuMemcpy2D_v2`.
///
/// Mirrors `CUDA_MEMCPY2D` in `cuda.h`; field order is part of the
/// `#[repr(C)]` ABI and must not change.  The CUDA driver inspects only the
/// fields appropriate for the source / destination memory types; the
/// remaining fields **must** be zeroed.  Use [`CUDA_MEMCPY2D::default`] to
/// obtain a zero-initialised descriptor and only set the fields you need.
///
/// `src_memory_type` and `dst_memory_type` are stored as raw `u32` for FFI
/// safety; convert to / from [`CUmemorytype`] manually.
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct CUDA_MEMCPY2D {
    /// Source X offset in bytes.
    pub src_x_in_bytes: usize,
    /// Source Y offset in rows.
    pub src_y: usize,
    /// Source memory type; see [`CUmemorytype`].
    pub src_memory_type: u32,
    /// Source host pointer (only valid when `src_memory_type == Host`).
    pub src_host: *const c_void,
    /// Source device pointer (only valid when `src_memory_type == Device`).
    pub src_device: CUdeviceptr,
    /// Source CUDA array (only valid when `src_memory_type == Array`).
    pub src_array: crate::ffi::CUarray,
    /// Source pitch in bytes (`0` selects a tightly-packed layout).
    pub src_pitch: usize,
    /// Destination X offset in bytes.
    pub dst_x_in_bytes: usize,
    /// Destination Y offset in rows.
    pub dst_y: usize,
    /// Destination memory type; see [`CUmemorytype`].
    pub dst_memory_type: u32,
    /// Destination host pointer (only valid when `dst_memory_type == Host`).
    pub dst_host: *mut c_void,
    /// Destination device pointer (only valid when `dst_memory_type == Device`).
    pub dst_device: CUdeviceptr,
    /// Destination CUDA array (only valid when `dst_memory_type == Array`).
    pub dst_array: crate::ffi::CUarray,
    /// Destination pitch in bytes (`0` selects a tightly-packed layout).
    pub dst_pitch: usize,
    /// Width of the copied region in bytes.
    pub width_in_bytes: usize,
    /// Height of the copied region in rows.
    pub height: usize,
}
1210
// SAFETY: The struct contains raw pointers and a CUDA array handle; callers
// are responsible for managing the underlying memory and handles.  Treating
// the descriptor itself as Send+Sync mirrors the C-side struct, which the
// driver may inspect from any thread.  (Raw pointers suppress the
// auto-impls, hence the explicit unsafe impls.)
unsafe impl Send for CUDA_MEMCPY2D {}
unsafe impl Sync for CUDA_MEMCPY2D {}
1217
1218impl Default for CUDA_MEMCPY2D {
1219    fn default() -> Self {
1220        Self {
1221            src_x_in_bytes: 0,
1222            src_y: 0,
1223            src_memory_type: 0,
1224            src_host: std::ptr::null(),
1225            src_device: 0,
1226            src_array: crate::ffi::CUarray::default(),
1227            src_pitch: 0,
1228            dst_x_in_bytes: 0,
1229            dst_y: 0,
1230            dst_memory_type: 0,
1231            dst_host: std::ptr::null_mut(),
1232            dst_device: 0,
1233            dst_array: crate::ffi::CUarray::default(),
1234            dst_pitch: 0,
1235            width_in_bytes: 0,
1236            height: 0,
1237        }
1238    }
1239}
1240
1241// =========================================================================
1242// Submodules — extracted per refactoring policy (<2000 lines per file)
1243// =========================================================================
1244
1245#[path = "ffi_constants.rs"]
1246mod ffi_constants;
1247pub use ffi_constants::*;
1248
1249#[path = "ffi_launch.rs"]
1250mod ffi_launch;
1251pub use ffi_launch::*;
1252
1253#[path = "ffi_descriptors.rs"]
1254mod ffi_descriptors;
1255pub use ffi_descriptors::*;
1256
1257// =========================================================================
1258// Tests
1259// =========================================================================
1260
#[cfg(test)]
mod tests {
    //! Discriminant, layout and default-value checks for the FFI surface.
    //! These pin the ABI: handle types must stay pointer-sized, `#[repr]`
    //! enums must keep their `cuda.h` values, and `Default`s must be zeroed.
    use super::*;

    // `CUresult` success sentinel must be zero across all driver versions.
    #[test]
    fn test_cuda_success_is_zero() {
        assert_eq!(CUDA_SUCCESS, 0);
    }

    // Opaque handles are `#[repr(transparent)]` wrappers over `*mut c_void`,
    // so each must be exactly pointer-sized.
    #[test]
    fn test_opaque_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUcontext>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmodule>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUstream>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUevent>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUfunction>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmemoryPool>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_handle_default_is_null() {
        assert!(CUcontext::default().is_null());
        assert!(CUmodule::default().is_null());
        assert!(CUfunction::default().is_null());
        assert!(CUstream::default().is_null());
        assert!(CUevent::default().is_null());
        assert!(CUmemoryPool::default().is_null());
    }

    // Pins `CUdevice_attribute` discriminants against the values used by
    // `cuDeviceGetAttribute`; changing any of these breaks the driver ABI.
    #[test]
    fn test_device_attribute_repr() {
        // Original variants
        assert_eq!(CUdevice_attribute::MaxThreadsPerBlock as i32, 1);
        assert_eq!(CUdevice_attribute::WarpSize as i32, 10);
        assert_eq!(CUdevice_attribute::MultiprocessorCount as i32, 16);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMajor as i32, 75);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMinor as i32, 76);
        assert_eq!(CUdevice_attribute::MaxBlocksPerMultiprocessor as i32, 106);
        assert_eq!(CUdevice_attribute::L2CacheSize as i32, 38);
        assert_eq!(
            CUdevice_attribute::MaxSharedMemoryPerMultiprocessor as i32,
            81
        );
        assert_eq!(CUdevice_attribute::ManagedMemory as i32, 83);

        // New variants
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherWidth as i32, 44);
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherHeight as i32, 45);
        assert_eq!(CUdevice_attribute::MaxTexture3DWidthAlt as i32, 47);
        assert_eq!(CUdevice_attribute::MaxTexture3DHeightAlt as i32, 48);
        assert_eq!(CUdevice_attribute::MaxTexture3DDepthAlt as i32, 49);
        assert_eq!(CUdevice_attribute::MaxTexture1DMipmappedWidth2 as i32, 52);
        assert_eq!(CUdevice_attribute::Reserved92 as i32, 92);
        assert_eq!(CUdevice_attribute::Reserved93 as i32, 93);
        assert_eq!(CUdevice_attribute::Reserved94 as i32, 94);
        assert_eq!(
            CUdevice_attribute::VirtualMemoryManagementSupported as i32,
            102
        );
        assert_eq!(
            CUdevice_attribute::HandleTypePosixFileDescriptorSupported as i32,
            103
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32HandleSupported as i32,
            104
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32KmtHandleSupported as i32,
            105
        );
        assert_eq!(CUdevice_attribute::AccessPolicyMaxWindowSize as i32, 111);
        assert_eq!(CUdevice_attribute::ReservedSharedMemoryPerBlock as i32, 112);
        assert_eq!(
            CUdevice_attribute::TimelineSemaphoreInteropSupported as i32,
            113
        );
        assert_eq!(CUdevice_attribute::MemoryPoolsSupported as i32, 115);
        assert_eq!(CUdevice_attribute::ClusterLaunch as i32, 120);
        assert_eq!(CUdevice_attribute::UnifiedFunctionPointers as i32, 125);
        assert_eq!(
            CUdevice_attribute::MaxTimelineSemaphoreInteropSupported as i32,
            129
        );
        assert_eq!(CUdevice_attribute::MemSyncDomainSupported as i32, 130);
        assert_eq!(CUdevice_attribute::GpuDirectRdmaFabricSupported as i32, 131);
    }

    // Spot-checks `CUjit_option` against the `cuda.h` option keys.
    #[test]
    fn test_jit_option_repr() {
        assert_eq!(CUjit_option::MaxRegisters as u32, 0);
        assert_eq!(CUjit_option::ThreadsPerBlock as u32, 1);
        assert_eq!(CUjit_option::WallTime as u32, 2);
        assert_eq!(CUjit_option::InfoLogBuffer as u32, 3);
        assert_eq!(CUjit_option::InfoLogBufferSizeBytes as u32, 4);
        assert_eq!(CUjit_option::ErrorLogBuffer as u32, 5);
        assert_eq!(CUjit_option::ErrorLogBufferSizeBytes as u32, 6);
        assert_eq!(CUjit_option::OptimizationLevel as u32, 7);
        assert_eq!(CUjit_option::Target as u32, 9);
        assert_eq!(CUjit_option::FallbackStrategy as u32, 10);
    }

    // Error constants (defined in `ffi_constants`) fall into the numeric
    // bands the CUDA documentation groups them by.
    #[test]
    #[allow(clippy::assertions_on_constants)]
    fn test_error_code_ranges() {
        // Basic errors: 1-8
        assert!(CUDA_ERROR_INVALID_VALUE < 10);
        // Device errors: 100-102
        assert!((100..=102).contains(&CUDA_ERROR_NO_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_INVALID_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_DEVICE_NOT_LICENSED));
        // Image/context errors: 200+
        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
        // Launch errors: 700+
        assert!(CUDA_ERROR_LAUNCH_FAILED >= 700);
        assert!(CUDA_ERROR_ILLEGAL_ADDRESS >= 700);
        assert!(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES >= 700);
        // Stream capture errors: 900+
        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
        // Unknown is 999
        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
    }

    // The `define_handle!` macro emits a custom Debug impl of the form
    // `CUcontext(0x…)`; pin the prefix.
    #[test]
    fn test_handle_debug_format() {
        let ctx = CUcontext::default();
        let debug_str = format!("{ctx:?}");
        assert!(debug_str.starts_with("CUcontext("));
    }

    #[test]
    fn test_handle_equality() {
        let a = CUcontext::default();
        let b = CUcontext::default();
        assert_eq!(a, b);
    }

    #[test]
    fn test_new_handle_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUtexref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUtexObject>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfObject>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_new_handle_defaults_are_null() {
        assert!(CUtexref::default().is_null());
        assert!(CUsurfref::default().is_null());
        assert!(CUtexObject::default().is_null());
        assert!(CUsurfObject::default().is_null());
    }

    #[test]
    fn test_memory_type_enum() {
        assert_eq!(CUmemorytype::Host as u32, 1);
        assert_eq!(CUmemorytype::Device as u32, 2);
        assert_eq!(CUmemorytype::Array as u32, 3);
        assert_eq!(CUmemorytype::Unified as u32, 4);
    }

    #[test]
    fn test_pointer_attribute_enum() {
        assert_eq!(CUpointer_attribute::Context as u32, 1);
        assert_eq!(CUpointer_attribute::MemoryType as u32, 2);
        assert_eq!(CUpointer_attribute::DevicePointer as u32, 3);
        assert_eq!(CUpointer_attribute::HostPointer as u32, 4);
        assert_eq!(CUpointer_attribute::IsManaged as u32, 9);
        assert_eq!(CUpointer_attribute::DeviceOrdinal as u32, 10);
    }

    #[test]
    fn test_limit_enum() {
        assert_eq!(CUlimit::StackSize as u32, 0);
        assert_eq!(CUlimit::PrintfFifoSize as u32, 1);
        assert_eq!(CUlimit::MallocHeapSize as u32, 2);
        assert_eq!(CUlimit::DevRuntimeSyncDepth as u32, 3);
        assert_eq!(CUlimit::DevRuntimePendingLaunchCount as u32, 4);
        assert_eq!(CUlimit::MaxL2FetchGranularity as u32, 5);
        assert_eq!(CUlimit::PersistingL2CacheSize as u32, 6);
    }

    #[test]
    fn test_function_attribute_enum() {
        assert_eq!(CUfunction_attribute::MaxThreadsPerBlock as i32, 0);
        assert_eq!(CUfunction_attribute::SharedSizeBytes as i32, 1);
        assert_eq!(CUfunction_attribute::NumRegs as i32, 4);
        assert_eq!(CUfunction_attribute::PtxVersion as i32, 5);
        assert_eq!(CUfunction_attribute::BinaryVersion as i32, 6);
        assert_eq!(CUfunction_attribute::MaxDynamicSharedSizeBytes as i32, 8);
        assert_eq!(
            CUfunction_attribute::PreferredSharedMemoryCarveout as i32,
            9
        );
    }

    // ---------------------------------------------------------------------
    // VMM / Pool / Linker FFI types — added by Wave 1
    // ---------------------------------------------------------------------

    #[test]
    fn test_link_state_handle_is_pointer_sized_and_default_null() {
        assert_eq!(
            std::mem::size_of::<CUlinkState>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert!(CUlinkState::default().is_null());
    }

    #[test]
    fn test_mem_generic_allocation_handle_is_u64() {
        assert_eq!(
            std::mem::size_of::<CUmemGenericAllocationHandle>(),
            std::mem::size_of::<u64>()
        );
        let _: CUmemGenericAllocationHandle = 0u64;
    }

    #[test]
    fn test_mem_location_type_repr() {
        assert_eq!(CUmemLocationType::Invalid as u32, 0);
        assert_eq!(CUmemLocationType::Device as u32, 1);
        assert_eq!(CUmemLocationType::Host as u32, 2);
        assert_eq!(CUmemLocationType::HostNuma as u32, 3);
        assert_eq!(CUmemLocationType::HostNumaCurrent as u32, 4);
    }

    #[test]
    fn test_mem_allocation_type_repr() {
        assert_eq!(CUmemAllocationType::Invalid as u32, 0);
        assert_eq!(CUmemAllocationType::Pinned as u32, 1);
        assert_eq!(CUmemAllocationType::Max as u32, 0x7fff_ffff);
    }

    // Handle types are a bitfield: each variant is a distinct power of two.
    #[test]
    fn test_mem_allocation_handle_type_repr() {
        assert_eq!(CUmemAllocationHandleType::None as u32, 0);
        assert_eq!(CUmemAllocationHandleType::PosixFileDescriptor as u32, 1);
        assert_eq!(CUmemAllocationHandleType::Win32 as u32, 2);
        assert_eq!(CUmemAllocationHandleType::Win32Kmt as u32, 4);
        assert_eq!(CUmemAllocationHandleType::Fabric as u32, 8);
    }

    #[test]
    fn test_mem_access_flags_repr() {
        assert_eq!(CUmemAccessFlags::None as u32, 0);
        assert_eq!(CUmemAccessFlags::Read as u32, 1);
        assert_eq!(CUmemAccessFlags::ReadWrite as u32, 3);
        assert_eq!(CUmemAccessFlags::Max as u32, 0x7fff_ffff);
    }

    #[test]
    fn test_mem_location_layout() {
        // Two consecutive 4-byte fields → 8 bytes, alignment 4.
        assert_eq!(std::mem::size_of::<CUmemLocation>(), 8);
        assert_eq!(std::mem::align_of::<CUmemLocation>(), 4);
        let loc = CUmemLocation::default();
        assert_eq!(loc.loc_type, 0);
        assert_eq!(loc.id, 0);
    }

    #[test]
    fn test_mem_access_desc_layout() {
        // CUmemLocation (8) + flags (u32 = 4) → 12 bytes, alignment 4.
        assert_eq!(std::mem::size_of::<CUmemAccessDesc>(), 12);
        assert_eq!(std::mem::align_of::<CUmemAccessDesc>(), 4);
        let desc = CUmemAccessDesc::default();
        assert_eq!(desc.flags, 0);
    }

    #[test]
    fn test_mem_allocation_prop_default_zeroed() {
        let prop = CUmemAllocationProp::default();
        assert_eq!(prop.alloc_type, 0);
        assert_eq!(prop.requested_handle_types, 0);
        assert_eq!(prop.location.loc_type, 0);
        assert_eq!(prop.location.id, 0);
        assert!(prop.win32_handle_meta_data.is_null());
        assert_eq!(prop.alloc_flags, 0);
    }

    #[test]
    fn test_mem_pool_props_default_zeroed_and_padded() {
        let props = CUmemPoolProps::default();
        assert_eq!(props.alloc_type, 0);
        assert_eq!(props.handle_types, 0);
        assert_eq!(props.location.loc_type, 0);
        assert_eq!(props.location.id, 0);
        assert!(props.win32_security_attributes.is_null());
        assert_eq!(props.max_size, 0);
        assert!(props.reserved.iter().all(|&b| b == 0));
        // The CUDA ABI mandates 56 reserved bytes.
        assert_eq!(props.reserved.len(), 56);
    }

    #[test]
    fn test_memcpy2d_default_zeroed() {
        let m = CUDA_MEMCPY2D::default();
        assert_eq!(m.src_x_in_bytes, 0);
        assert_eq!(m.src_y, 0);
        assert_eq!(m.src_memory_type, 0);
        assert!(m.src_host.is_null());
        assert_eq!(m.src_device, 0);
        assert!(m.src_array.is_null());
        assert_eq!(m.src_pitch, 0);
        assert_eq!(m.dst_x_in_bytes, 0);
        assert_eq!(m.dst_y, 0);
        assert_eq!(m.dst_memory_type, 0);
        assert!(m.dst_host.is_null());
        assert_eq!(m.dst_device, 0);
        assert!(m.dst_array.is_null());
        assert_eq!(m.dst_pitch, 0);
        assert_eq!(m.width_in_bytes, 0);
        assert_eq!(m.height, 0);
    }
}