Skip to main content

oxicuda_driver/
ffi.rs

1//! Raw CUDA Driver API FFI types, constants, and enums.
2//!
3//! This module provides the low-level type definitions that mirror the CUDA Driver API
4//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
5//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
6//! entry points.
7//!
8//! # Safety
9//!
10//! All pointer types in this module are raw pointers intended for FFI use.
11//! They must only be used through the safe wrappers provided by higher-level
12//! modules in `oxicuda-driver`.
13
14use std::ffi::c_void;
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// Core scalar type aliases
19// ---------------------------------------------------------------------------
20
21/// Return code from every CUDA Driver API call.
22///
23/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
24/// error code. See the `CUDA_*` constants below for the full catalogue.
25pub type CUresult = u32;
26
27/// Ordinal identifier for a CUDA-capable device (0-based).
28pub type CUdevice = i32;
29
30/// Device-side pointer (64-bit address in GPU virtual memory).
31pub type CUdeviceptr = u64;
32
33// ---------------------------------------------------------------------------
34// Opaque handle helpers
35// ---------------------------------------------------------------------------
36
37macro_rules! define_handle {
38    ($(#[$meta:meta])* $name:ident) => {
39        $(#[$meta])*
40        #[repr(transparent)]
41        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
42        pub struct $name(pub *mut c_void);
43
44        // SAFETY: CUDA handles are thread-safe when used with proper
45        // synchronisation via the driver API.
46        unsafe impl Send for $name {}
47        unsafe impl Sync for $name {}
48
49        impl fmt::Debug for $name {
50            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
51                write!(f, "{}({:p})", stringify!($name), self.0)
52            }
53        }
54
55        impl Default for $name {
56            fn default() -> Self {
57                Self(std::ptr::null_mut())
58            }
59        }
60
61        impl $name {
62            /// Returns `true` if the handle is null (uninitialised).
63            #[inline]
64            pub fn is_null(self) -> bool {
65                self.0.is_null()
66            }
67        }
68    };
69}
70
71// ---------------------------------------------------------------------------
72// Handle types
73// ---------------------------------------------------------------------------
74
75define_handle! {
76    /// Opaque handle to a CUDA context.
77    CUcontext
78}
79
80define_handle! {
81    /// Opaque handle to a loaded CUDA module (PTX / cubin).
82    CUmodule
83}
84
85define_handle! {
86    /// Opaque handle to a CUDA kernel function within a module.
87    CUfunction
88}
89
90define_handle! {
91    /// Opaque handle to a CUDA stream (command queue).
92    CUstream
93}
94
95define_handle! {
96    /// Opaque handle to a CUDA event (used for timing and synchronisation).
97    CUevent
98}
99
100define_handle! {
101    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
102    CUmemoryPool
103}
104
105define_handle! {
106    /// Opaque handle to a CUDA texture reference (legacy API).
107    CUtexref
108}
109
110define_handle! {
111    /// Opaque handle to a CUDA surface reference (legacy API).
112    CUsurfref
113}
114
115define_handle! {
116    /// Opaque handle to a CUDA texture object (modern bindless API).
117    CUtexObject
118}
119
120define_handle! {
121    /// Opaque handle to a CUDA surface object (modern bindless API).
122    CUsurfObject
123}
124
125define_handle! {
126    /// Opaque handle to a CUDA kernel (CUDA 12.8+ library-based kernels).
127    ///
128    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
129    /// belongs to.
130    CUkernel
131}
132
133define_handle! {
134    /// Opaque handle to a CUDA library (CUDA 12.8+ JIT library API).
135    ///
136    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
137    /// library that contains a given kernel.
138    CUlibrary
139}
140
141define_handle! {
142    /// Opaque handle to an NVLink multicast object (CUDA 12.8+).
143    ///
144    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
145    /// functions to manage NVLink multicast memory regions across devices.
146    CUmulticastObject
147}
148
149// =========================================================================
150// CUmemorytype — memory type identifiers
151// =========================================================================
152
153/// Memory type identifiers returned by pointer attribute queries.
154#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
155#[repr(u32)]
156#[non_exhaustive]
157pub enum CUmemorytype {
158    /// Host (system) memory.
159    Host = 1,
160    /// Device (GPU) memory.
161    Device = 2,
162    /// Array memory.
163    Array = 3,
164    /// Unified (managed) memory.
165    Unified = 4,
166}
167
168// =========================================================================
169// CUpointer_attribute — pointer attribute query keys
170// =========================================================================
171
172/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
173#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
174#[repr(u32)]
175#[non_exhaustive]
176#[allow(non_camel_case_types)]
177pub enum CUpointer_attribute {
178    /// Query the CUDA context associated with a pointer.
179    Context = 1,
180    /// Query the memory type (host / device / unified) of a pointer.
181    MemoryType = 2,
182    /// Query the device pointer corresponding to a host pointer.
183    DevicePointer = 3,
184    /// Query the host pointer corresponding to a device pointer.
185    HostPointer = 4,
186    /// Query whether the memory is managed (unified).
187    IsManaged = 9,
188    /// Query the device ordinal for the pointer.
189    DeviceOrdinal = 10,
190}
191
192// =========================================================================
193// CUlimit — context limit identifiers
194// =========================================================================
195
196/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
197#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
198#[repr(u32)]
199#[non_exhaustive]
200pub enum CUlimit {
201    /// Stack size for each GPU thread.
202    StackSize = 0,
203    /// Size of the printf FIFO.
204    PrintfFifoSize = 1,
205    /// Size of the heap used by `malloc()` on the device.
206    MallocHeapSize = 2,
207    /// Maximum nesting depth of a device runtime launch.
208    DevRuntimeSyncDepth = 3,
209    /// Maximum number of outstanding device runtime launches.
210    DevRuntimePendingLaunchCount = 4,
211    /// L2 cache fetch granularity.
212    MaxL2FetchGranularity = 5,
213    /// Maximum persisting L2 cache size.
214    PersistingL2CacheSize = 6,
215}
216
217// =========================================================================
218// CUfunction_attribute — function attribute query keys
219// =========================================================================
220
221/// Function attribute identifiers passed to `cuFuncGetAttribute`.
222#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
223#[repr(i32)]
224#[non_exhaustive]
225#[allow(non_camel_case_types)]
226pub enum CUfunction_attribute {
227    /// Maximum threads per block for this function.
228    MaxThreadsPerBlock = 0,
229    /// Shared memory used by this function (bytes).
230    SharedSizeBytes = 1,
231    /// Size of user-allocated constant memory (bytes).
232    ConstSizeBytes = 2,
233    /// Size of local memory used by each thread (bytes).
234    LocalSizeBytes = 3,
235    /// Number of registers used by each thread.
236    NumRegs = 4,
237    /// PTX virtual architecture version.
238    PtxVersion = 5,
239    /// Binary architecture version.
240    BinaryVersion = 6,
241    /// Whether this function has been cached.
242    CacheModeCa = 7,
243    /// Maximum dynamic shared memory size (bytes).
244    MaxDynamicSharedSizeBytes = 8,
245    /// Preferred shared memory carve-out.
246    PreferredSharedMemoryCarveout = 9,
247}
248
249// =========================================================================
250// CUresult constants — every documented CUDA Driver API error code
251// =========================================================================
252
253/// The API call returned with no errors.
254pub const CUDA_SUCCESS: CUresult = 0;
255
256/// One or more parameters passed to the API call are not acceptable.
257pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;
258
259/// The API call failed because it was unable to allocate enough memory.
260pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;
261
262/// The CUDA driver has not been initialised via `cuInit`.
263pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;
264
265/// The CUDA driver is shutting down.
266pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;
267
268/// Profiler is not initialised for this run.
269pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;
270
271/// (Deprecated) Profiler not started.
272pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;
273
274/// (Deprecated) Profiler already started.
275pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;
276
277/// (Deprecated) Profiler already stopped.
278pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;
279
280/// Stub library loaded instead of the real driver.
281pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;
282
283/// Device-side assert triggered.
284pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;
285
286/// No CUDA-capable device is detected.
287pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;
288
289/// The device ordinal supplied is out of range.
290pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;
291
292/// The device does not have a valid licence.
293pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;
294
295/// The PTX or cubin image is invalid.
296pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;
297
298/// The supplied context is not valid.
299pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;
300
301/// (Deprecated) Context already current.
302pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;
303
304/// A map or register operation has failed.
305pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;
306
307/// An unmap or unregister operation has failed.
308pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;
309
310/// The specified array is currently mapped.
311pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;
312
313/// The resource is already mapped.
314pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;
315
316/// There is no kernel image available for execution on the device.
317pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;
318
319/// A resource has already been acquired.
320pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;
321
322/// The resource is not mapped.
323pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;
324
325/// A mapped resource is not available for access as an array.
326pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;
327
328/// A mapped resource is not available for access as a pointer.
329pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;
330
331/// An uncorrectable ECC error was detected.
332pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;
333
334/// A PTX JIT limit has been reached.
335pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;
336
337/// The context already has work from another thread bound to it.
338pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;
339
340/// Peer access is not supported across the given devices.
341pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;
342
343/// The PTX JIT compilation was disabled or the PTX is invalid.
344pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;
345
346/// Invalid graphics context.
347pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;
348
349/// NVLINK is uncorrectable.
350pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;
351
352/// JIT compiler not found.
353pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;
354
355/// Unsupported PTX version.
356pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;
357
358/// JIT compilation disabled.
359pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;
360
361/// Unsupported exec-affinity type.
362pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;
363
364/// Unsupported device-side synchronisation on this device.
365pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;
366
367/// The requested source is invalid.
368pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;
369
370/// The named file was not found.
371pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;
372
373/// A shared-object symbol lookup failed.
374pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;
375
376/// The shared-object init function failed.
377pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;
378
379/// An OS call failed.
380pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;
381
382/// The supplied handle is invalid.
383pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;
384
385/// The requested resource is in an illegal state.
386pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;
387
388/// A loss-less compression buffer was detected while doing uncompressed access.
389pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;
390
391/// A named symbol was not found.
392pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;
393
394/// The operation is not ready (asynchronous).
395pub const CUDA_ERROR_NOT_READY: CUresult = 600;
396
397/// An illegal memory address was encountered.
398pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;
399
400/// The kernel launch uses too many resources (registers / shared memory).
401pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;
402
403/// The kernel launch exceeded the time-out enforced by the driver.
404pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;
405
406/// A launch did not occur on a compatible texturing mode.
407pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;
408
409/// Peer access already enabled.
410pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;
411
412/// Peer access has not been enabled.
413pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;
414
415/// The primary context has already been initialised.
416pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;
417
418/// The context is being destroyed.
419pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;
420
421/// A 64-bit device assertion triggered.
422pub const CUDA_ERROR_ASSERT: CUresult = 710;
423
424/// Hardware resources to enable peer access are exhausted.
425pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;
426
427/// The host-side memory region is already registered.
428pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;
429
430/// The host-side memory region is not registered.
431pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;
432
433/// Hardware stack overflow on the device.
434pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;
435
436/// Illegal instruction encountered on the device.
437pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;
438
439/// Misaligned address on the device.
440pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;
441
442/// Invalid address space.
443pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;
444
445/// Invalid program counter on the device.
446pub const CUDA_ERROR_INVALID_PC: CUresult = 718;
447
448/// The kernel launch failed.
449pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;
450
451/// Cooperative launch is too large for the device/kernel.
452pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;
453
454/// The API call is not permitted in the active context.
455pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;
456
457/// The API call is not supported by the current driver/device combination.
458pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;
459
460/// System not ready for CUDA operations.
461pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;
462
463/// System driver mismatch.
464pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;
465
466/// Old-style context incompatible with CUDA 3.2+ API.
467pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;
468
469/// MPS connection failed.
470pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;
471
472/// MPS RPC failure.
473pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;
474
475/// MPS server not ready.
476pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;
477
478/// MPS maximum clients reached.
479pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;
480
481/// MPS maximum connections reached.
482pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;
483
484/// MPS client terminated.
485pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;
486
487/// CDP not supported.
488pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;
489
490/// CDP version mismatch.
491pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;
492
493/// Stream capture unsupported.
494pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;
495
496/// Stream capture invalidated.
497pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;
498
499/// Stream capture merge not permitted.
500pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;
501
502/// Stream capture unmatched.
503pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;
504
505/// Stream capture unjoined.
506pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;
507
508/// Stream capture isolation violation.
509pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;
510
511/// Implicit stream in graph capture.
512pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;
513
514/// Captured event error.
515pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;
516
517/// Stream capture wrong thread.
518pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;
519
520/// The async operation timed out.
521pub const CUDA_ERROR_TIMEOUT: CUresult = 909;
522
523/// The graph update failed.
524pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;
525
526/// External device error.
527pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;
528
529/// Invalid cluster size.
530pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;
531
532/// Function not loaded.
533pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;
534
535/// Invalid resource type.
536pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;
537
538/// Invalid resource configuration.
539pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;
540
541/// An unknown internal error occurred.
542pub const CUDA_ERROR_UNKNOWN: CUresult = 999;
543
544// =========================================================================
545// CUdevice_attribute — device property query keys
546// =========================================================================
547
548/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
549#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
550#[repr(i32)]
551#[non_exhaustive]
552#[allow(non_camel_case_types)]
553pub enum CUdevice_attribute {
554    /// Maximum number of threads per block.
555    MaxThreadsPerBlock = 1,
556    /// Maximum x-dimension of a block.
557    MaxBlockDimX = 2,
558    /// Maximum y-dimension of a block.
559    MaxBlockDimY = 3,
560    /// Maximum z-dimension of a block.
561    MaxBlockDimZ = 4,
562    /// Maximum x-dimension of a grid.
563    MaxGridDimX = 5,
564    /// Maximum y-dimension of a grid.
565    MaxGridDimY = 6,
566    /// Maximum z-dimension of a grid.
567    MaxGridDimZ = 7,
568    /// Maximum shared memory available per block (bytes).
569    MaxSharedMemoryPerBlock = 8,
570    /// Total amount of constant memory on the device (bytes).
571    TotalConstantMemory = 9,
572    /// Warp size in threads.
573    WarpSize = 10,
574    /// Maximum pitch allowed by memory copies (bytes).
575    MaxPitch = 11,
576    /// Maximum number of 32-bit registers per block.
577    MaxRegistersPerBlock = 12,
578    /// Peak clock frequency in kHz.
579    ClockRate = 13,
580    /// Alignment requirement for textures.
581    TextureAlignment = 14,
582    /// Device can possibly copy memory and execute a kernel concurrently.
583    GpuOverlap = 15,
584    /// Number of multiprocessors on the device.
585    MultiprocessorCount = 16,
586    /// Whether there is a run-time limit on kernels.
587    KernelExecTimeout = 17,
588    /// Device is integrated (shares host memory).
589    Integrated = 18,
590    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
591    CanMapHostMemory = 19,
592    /// Compute mode: default, exclusive, prohibited, etc.
593    ComputeMode = 20,
594    /// Maximum 1D texture width.
595    MaxTexture1DWidth = 21,
596    /// Maximum 2D texture width.
597    MaxTexture2DWidth = 22,
598    /// Maximum 2D texture height.
599    MaxTexture2DHeight = 23,
600    /// Maximum 3D texture width.
601    MaxTexture3DWidth = 24,
602    /// Maximum 3D texture height.
603    MaxTexture3DHeight = 25,
604    /// Maximum 3D texture depth.
605    MaxTexture3DDepth = 26,
606    /// Maximum 2D layered texture width.
607    MaxTexture2DLayeredWidth = 27,
608    /// Maximum 2D layered texture height.
609    MaxTexture2DLayeredHeight = 28,
610    /// Maximum layers in a 2D layered texture.
611    MaxTexture2DLayeredLayers = 29,
612    /// Alignment requirement for surfaces.
613    SurfaceAlignment = 30,
614    /// Device can execute multiple kernels concurrently.
615    ConcurrentKernels = 31,
616    /// Device supports ECC memory.
617    EccEnabled = 32,
618    /// PCI bus ID of the device.
619    PciBusId = 33,
620    /// PCI device ID of the device.
621    PciDeviceId = 34,
622    /// Device is using TCC (Tesla Compute Cluster) driver model.
623    TccDriver = 35,
624    /// Peak memory clock frequency in kHz.
625    MemoryClockRate = 36,
626    /// Global memory bus width in bits.
627    GlobalMemoryBusWidth = 37,
628    /// Size of L2 cache in bytes.
629    L2CacheSize = 38,
630    /// Maximum resident threads per multiprocessor.
631    MaxThreadsPerMultiprocessor = 39,
632    /// Number of asynchronous engines.
633    AsyncEngineCount = 40,
634    /// Device shares a unified address space with the host.
635    UnifiedAddressing = 41,
636    /// Maximum 1D layered texture width.
637    MaxTexture1DLayeredWidth = 42,
638    /// Maximum layers in a 1D layered texture.
639    MaxTexture1DLayeredLayers = 43,
640    /// Maximum 2D texture width if CUDA 2D memory allocation is bound.
641    MaxTexture2DGatherWidth = 44,
642    /// Maximum 2D texture height if CUDA 2D memory allocation is bound.
643    MaxTexture2DGatherHeight = 45,
644    /// Alternate maximum 3D texture width.
645    MaxTexture3DWidthAlt = 47,
646    /// Alternate maximum 3D texture height.
647    MaxTexture3DHeightAlt = 48,
648    /// Alternate maximum 3D texture depth.
649    MaxTexture3DDepthAlt = 49,
650    /// PCI domain ID.
651    PciDomainId = 50,
652    /// Texture pitch alignment.
653    TexturePitchAlignment = 51,
654    /// Maximum 1D mipmapped texture width.
655    MaxTexture1DMipmappedWidth2 = 52,
656    /// Maximum width for a cubemap texture.
657    MaxTextureCubemapWidth = 54,
658    /// Maximum width for a cubemap layered texture.
659    MaxTextureCubemapLayeredWidth = 55,
660    /// Maximum layers in a cubemap layered texture.
661    MaxTextureCubemapLayeredLayers = 56,
662    /// Maximum 1D surface width.
663    MaxSurface1DWidth = 57,
664    /// Maximum 2D surface width.
665    MaxSurface2DWidth = 58,
666    /// Maximum 2D surface height.
667    MaxSurface2DHeight = 59,
668    /// Maximum 3D surface width.
669    MaxSurface3DWidth = 60,
670    /// Maximum 3D surface height.
671    MaxSurface3DHeight = 61,
672    /// Maximum 3D surface depth.
673    MaxSurface3DDepth = 62,
674    /// Maximum cubemap surface width.
675    MaxSurfaceCubemapWidth = 63,
676    /// Maximum 1D layered surface width.
677    MaxSurface1DLayeredWidth = 64,
678    /// Maximum layers in a 1D layered surface.
679    MaxSurface1DLayeredLayers = 65,
680    /// Maximum 2D layered surface width.
681    MaxSurface2DLayeredWidth = 66,
682    /// Maximum 2D layered surface height.
683    MaxSurface2DLayeredHeight = 67,
684    /// Maximum layers in a 2D layered surface.
685    MaxSurface2DLayeredLayers = 68,
686    /// Maximum cubemap layered surface width.
687    MaxSurfaceCubemapLayeredWidth = 69,
688    /// Maximum layers in a cubemap layered surface.
689    MaxSurfaceCubemapLayeredLayers = 70,
690    /// Maximum 1D linear texture width (deprecated).
691    MaxTexture1DLinearWidth = 71,
692    /// Maximum 2D linear texture width.
693    MaxTexture2DLinearWidth = 72,
694    /// Maximum 2D linear texture height.
695    MaxTexture2DLinearHeight = 73,
696    /// Maximum 2D linear texture pitch (bytes).
697    MaxTexture2DLinearPitch = 74,
698    /// Major compute capability version number.
699    ComputeCapabilityMajor = 75,
700    /// Minor compute capability version number.
701    ComputeCapabilityMinor = 76,
702    /// Maximum mipmapped 2D texture width.
703    MaxTexture2DMipmappedWidth = 77,
704    /// Maximum mipmapped 2D texture height.
705    MaxTexture2DMipmappedHeight = 78,
706    /// Maximum mipmapped 1D texture width.
707    MaxTexture1DMipmappedWidth = 79,
708    /// Device supports stream priorities.
709    StreamPrioritiesSupported = 80,
710    /// Maximum shared memory per multiprocessor (bytes).
711    MaxSharedMemoryPerMultiprocessor = 81,
712    /// Maximum registers per multiprocessor.
713    MaxRegistersPerMultiprocessor = 82,
714    /// Device supports managed memory.
715    ManagedMemory = 83,
716    /// Device is on a multi-GPU board.
717    IsMultiGpuBoard = 84,
718    /// Unique identifier for the multi-GPU board group.
719    MultiGpuBoardGroupId = 85,
720    /// Host-visible native-atomic support for float operations.
721    HostNativeAtomicSupported = 86,
722    /// Ratio of single-to-double precision performance.
723    SingleToDoublePrecisionPerfRatio = 87,
724    /// Device supports pageable memory access.
725    PageableMemoryAccess = 88,
726    /// Device can access host registered memory at the same virtual address.
727    ConcurrentManagedAccess = 89,
728    /// Device supports compute preemption.
729    ComputePreemptionSupported = 90,
730    /// Device can access host memory via pageable accesses.
731    CanUseHostPointerForRegisteredMem = 91,
732    /// Reserved attribute (CUDA internal, value 92).
733    Reserved92 = 92,
734    /// Reserved attribute (CUDA internal, value 93).
735    Reserved93 = 93,
736    /// Reserved attribute (CUDA internal, value 94).
737    Reserved94 = 94,
738    /// Device supports cooperative kernel launches.
739    CooperativeLaunch = 95,
740    /// Device supports cooperative kernel launches across multiple GPUs.
741    CooperativeMultiDeviceLaunch = 96,
742    /// Maximum optin shared memory per block.
743    MaxSharedMemoryPerBlockOptin = 97,
744    /// Device supports flushing of outstanding remote writes.
745    CanFlushRemoteWrites = 98,
746    /// Device supports host-side memory-register functions.
747    HostRegisterSupported = 99,
748    /// Device supports pageable memory access using host page tables.
749    PageableMemoryAccessUsesHostPageTables = 100,
750    /// Device supports direct access to managed memory on the host.
751    DirectManagedMemAccessFromHost = 101,
752    /// Device supports virtual memory management APIs.
753    VirtualMemoryManagementSupported = 102,
754    /// Device supports handle-type POSIX file descriptors for IPC.
755    HandleTypePosixFileDescriptorSupported = 103,
756    /// Device supports handle-type Win32 handles for IPC.
757    HandleTypeWin32HandleSupported = 104,
758    /// Device supports handle-type Win32 KMT handles for IPC.
759    HandleTypeWin32KmtHandleSupported = 105,
760    /// Maximum blocks per multiprocessor.
761    MaxBlocksPerMultiprocessor = 106,
762    /// Device supports generic compression for memory.
763    GenericCompressionSupported = 107,
764    /// Maximum persisting L2 cache size (bytes).
765    MaxPersistingL2CacheSize = 108,
766    /// Maximum access-policy window size for L2 cache.
767    MaxAccessPolicyWindowSize = 109,
768    /// Device supports RDMA APIs via `cuMemRangeGetAttribute`.
769    GpuDirectRdmaWithCudaVmmSupported = 110,
770    /// Free memory / total memory on the device accessible via `cuMemGetInfo`.
771    AccessPolicyMaxWindowSize = 111,
772    /// Reserved range of shared memory per SM (bytes).
773    ReservedSharedMemoryPerBlock = 112,
774    /// Device supports timeline semaphore interop.
775    TimelineSemaphoreInteropSupported = 113,
776    /// Device supports memory pools (`cudaMallocAsync`).
777    MemoryPoolsSupported = 115,
778    /// GPU direct RDMA is supported.
779    GpuDirectRdmaSupported = 116,
780    /// GPU direct RDMA flush-writes order.
781    GpuDirectRdmaFlushWritesOptions = 117,
782    /// GPU direct RDMA writes ordering.
783    GpuDirectRdmaWritesOrdering = 118,
784    /// Memory pool supported handle types.
785    MemoryPoolSupportedHandleTypes = 119,
786    /// Device supports cluster launch.
787    ClusterLaunch = 120,
788    /// Deferred mapping CUDA array supported.
789    DeferredMappingCudaArraySupported = 121,
790    /// Device supports IPC event handles.
791    IpcEventSupported = 122,
792    /// Device supports mem-sync domain count.
793    MemSyncDomainCount = 123,
794    /// Device supports tensor-map access to data.
795    TensorMapAccessSupported = 124,
796    /// Unified function pointers supported.
797    UnifiedFunctionPointers = 125,
798    /// NUMA config.
799    NumaConfig = 127,
800    /// NUMA id.
801    NumaId = 128,
802    /// Multicast supported.
803    /// Device supports getting the minimum required per-block shared memory
804    /// for a cooperative launch via the extended attributes.
805    MaxTimelineSemaphoreInteropSupported = 129,
806    /// Device supports memory sync domain operations.
807    MemSyncDomainSupported = 130,
808    /// Device supports GPU-Direct Fabric.
809    GpuDirectRdmaFabricSupported = 131,
810    /// Device supports multicast.
811    MulticastSupported = 132,
812    /// Device supports MPS features.
813    MpsEnabled = 133,
814    /// Host-NUMA identifier.
815    HostNumaId = 134,
816}
817
818// =========================================================================
819// CUjit_option — options for the JIT compiler
820// =========================================================================
821
822/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
823#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
824#[repr(u32)]
825#[non_exhaustive]
826#[allow(non_camel_case_types)]
827pub enum CUjit_option {
828    /// Maximum number of registers that a thread may use.
829    MaxRegisters = 0,
830    /// Number of threads per block for the JIT target.
831    ThreadsPerBlock = 1,
832    /// Wall-clock time (ms) for compilation.
833    WallTime = 2,
834    /// Pointer to a buffer for info log output.
835    InfoLogBuffer = 3,
836    /// Size (bytes) of the info-log buffer.
837    InfoLogBufferSizeBytes = 4,
838    /// Pointer to a buffer for error log output.
839    ErrorLogBuffer = 5,
840    /// Size (bytes) of the error-log buffer.
841    ErrorLogBufferSizeBytes = 6,
842    /// Optimisation level (0-4).
843    OptimizationLevel = 7,
844    /// Determines the target based on the current attached context.
845    TargetFromCuContext = 8,
846    /// Specific compute target (sm_XX).
847    Target = 9,
848    /// Fallback strategy when exact match is not found.
849    FallbackStrategy = 10,
850    /// Specifies whether to generate debug info.
851    GenerateDebugInfo = 11,
852    /// Generate verbose log messages.
853    LogVerbose = 12,
854    /// Generate line-number information.
855    GenerateLineInfo = 13,
856    /// Cache mode (on / off).
857    CacheMode = 14,
858    /// (Internal) New SM3X option.
859    Sm3xOpt = 15,
860    /// Fast compile flag.
861    FastCompile = 16,
862    /// Global symbol names.
863    GlobalSymbolNames = 17,
864    /// Global symbol addresses.
865    GlobalSymbolAddresses = 18,
866    /// Number of global symbols.
867    GlobalSymbolCount = 19,
868    /// LTO flag.
869    Lto = 20,
870    /// FTZ (flush-to-zero) flag.
871    Ftz = 21,
872    /// Prec-div flag.
873    PrecDiv = 22,
874    /// Prec-sqrt flag.
875    PrecSqrt = 23,
876    /// FMA flag.
877    Fma = 24,
878    /// Referenced kernel names.
879    ReferencedKernelNames = 25,
880    /// Referenced kernel count.
881    ReferencedKernelCount = 26,
882    /// Referenced variable names.
883    ReferencedVariableNames = 27,
884    /// Referenced variable count.
885    ReferencedVariableCount = 28,
886    /// Optimise unused device variables.
887    OptimizeUnusedDeviceVariables = 29,
888    /// Position-independent code.
889    PositionIndependentCode = 30,
890}
891
892// =========================================================================
893// CUjitInputType — input types for the linker
894// =========================================================================
895
896/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
897#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
898#[repr(u32)]
899#[non_exhaustive]
900pub enum CUjitInputType {
901    /// PTX source code.
902    Ptx = 1,
903    /// Compiled device code (cubin).
904    Cubin = 2,
905    /// Fat binary bundle.
906    Fatbin = 3,
907    /// Relocatable device object.
908    Object = 4,
909    /// Device code library.
910    Library = 5,
911}
912
913// =========================================================================
914// Submodules — extracted per refactoring policy (<2000 lines per file)
915// =========================================================================
916
917#[path = "ffi_constants.rs"]
918mod ffi_constants;
919pub use ffi_constants::*;
920
921#[path = "ffi_launch.rs"]
922mod ffi_launch;
923pub use ffi_launch::*;
924
925#[path = "ffi_descriptors.rs"]
926mod ffi_descriptors;
927pub use ffi_descriptors::*;
928
929// =========================================================================
930// Tests
931// =========================================================================
932
933#[cfg(test)]
934mod tests {
935    use super::*;
936
937    #[test]
938    fn test_cuda_success_is_zero() {
939        assert_eq!(CUDA_SUCCESS, 0);
940    }
941
942    #[test]
943    fn test_opaque_types_are_pointer_sized() {
944        assert_eq!(
945            std::mem::size_of::<CUcontext>(),
946            std::mem::size_of::<*mut c_void>()
947        );
948        assert_eq!(
949            std::mem::size_of::<CUmodule>(),
950            std::mem::size_of::<*mut c_void>()
951        );
952        assert_eq!(
953            std::mem::size_of::<CUstream>(),
954            std::mem::size_of::<*mut c_void>()
955        );
956        assert_eq!(
957            std::mem::size_of::<CUevent>(),
958            std::mem::size_of::<*mut c_void>()
959        );
960        assert_eq!(
961            std::mem::size_of::<CUfunction>(),
962            std::mem::size_of::<*mut c_void>()
963        );
964        assert_eq!(
965            std::mem::size_of::<CUmemoryPool>(),
966            std::mem::size_of::<*mut c_void>()
967        );
968    }
969
970    #[test]
971    fn test_handle_default_is_null() {
972        assert!(CUcontext::default().is_null());
973        assert!(CUmodule::default().is_null());
974        assert!(CUfunction::default().is_null());
975        assert!(CUstream::default().is_null());
976        assert!(CUevent::default().is_null());
977        assert!(CUmemoryPool::default().is_null());
978    }
979
980    #[test]
981    fn test_device_attribute_repr() {
982        // Original variants
983        assert_eq!(CUdevice_attribute::MaxThreadsPerBlock as i32, 1);
984        assert_eq!(CUdevice_attribute::WarpSize as i32, 10);
985        assert_eq!(CUdevice_attribute::MultiprocessorCount as i32, 16);
986        assert_eq!(CUdevice_attribute::ComputeCapabilityMajor as i32, 75);
987        assert_eq!(CUdevice_attribute::ComputeCapabilityMinor as i32, 76);
988        assert_eq!(CUdevice_attribute::MaxBlocksPerMultiprocessor as i32, 106);
989        assert_eq!(CUdevice_attribute::L2CacheSize as i32, 38);
990        assert_eq!(
991            CUdevice_attribute::MaxSharedMemoryPerMultiprocessor as i32,
992            81
993        );
994        assert_eq!(CUdevice_attribute::ManagedMemory as i32, 83);
995
996        // New variants
997        assert_eq!(CUdevice_attribute::MaxTexture2DGatherWidth as i32, 44);
998        assert_eq!(CUdevice_attribute::MaxTexture2DGatherHeight as i32, 45);
999        assert_eq!(CUdevice_attribute::MaxTexture3DWidthAlt as i32, 47);
1000        assert_eq!(CUdevice_attribute::MaxTexture3DHeightAlt as i32, 48);
1001        assert_eq!(CUdevice_attribute::MaxTexture3DDepthAlt as i32, 49);
1002        assert_eq!(CUdevice_attribute::MaxTexture1DMipmappedWidth2 as i32, 52);
1003        assert_eq!(CUdevice_attribute::Reserved92 as i32, 92);
1004        assert_eq!(CUdevice_attribute::Reserved93 as i32, 93);
1005        assert_eq!(CUdevice_attribute::Reserved94 as i32, 94);
1006        assert_eq!(
1007            CUdevice_attribute::VirtualMemoryManagementSupported as i32,
1008            102
1009        );
1010        assert_eq!(
1011            CUdevice_attribute::HandleTypePosixFileDescriptorSupported as i32,
1012            103
1013        );
1014        assert_eq!(
1015            CUdevice_attribute::HandleTypeWin32HandleSupported as i32,
1016            104
1017        );
1018        assert_eq!(
1019            CUdevice_attribute::HandleTypeWin32KmtHandleSupported as i32,
1020            105
1021        );
1022        assert_eq!(CUdevice_attribute::AccessPolicyMaxWindowSize as i32, 111);
1023        assert_eq!(CUdevice_attribute::ReservedSharedMemoryPerBlock as i32, 112);
1024        assert_eq!(
1025            CUdevice_attribute::TimelineSemaphoreInteropSupported as i32,
1026            113
1027        );
1028        assert_eq!(CUdevice_attribute::MemoryPoolsSupported as i32, 115);
1029        assert_eq!(CUdevice_attribute::ClusterLaunch as i32, 120);
1030        assert_eq!(CUdevice_attribute::UnifiedFunctionPointers as i32, 125);
1031        assert_eq!(
1032            CUdevice_attribute::MaxTimelineSemaphoreInteropSupported as i32,
1033            129
1034        );
1035        assert_eq!(CUdevice_attribute::MemSyncDomainSupported as i32, 130);
1036        assert_eq!(CUdevice_attribute::GpuDirectRdmaFabricSupported as i32, 131);
1037    }
1038
1039    #[test]
1040    fn test_jit_option_repr() {
1041        assert_eq!(CUjit_option::MaxRegisters as u32, 0);
1042        assert_eq!(CUjit_option::ThreadsPerBlock as u32, 1);
1043        assert_eq!(CUjit_option::WallTime as u32, 2);
1044        assert_eq!(CUjit_option::InfoLogBuffer as u32, 3);
1045        assert_eq!(CUjit_option::InfoLogBufferSizeBytes as u32, 4);
1046        assert_eq!(CUjit_option::ErrorLogBuffer as u32, 5);
1047        assert_eq!(CUjit_option::ErrorLogBufferSizeBytes as u32, 6);
1048        assert_eq!(CUjit_option::OptimizationLevel as u32, 7);
1049        assert_eq!(CUjit_option::Target as u32, 9);
1050        assert_eq!(CUjit_option::FallbackStrategy as u32, 10);
1051    }
1052
1053    #[test]
1054    #[allow(clippy::assertions_on_constants)]
1055    fn test_error_code_ranges() {
1056        // Basic errors: 1-8
1057        assert!(CUDA_ERROR_INVALID_VALUE < 10);
1058        // Device errors: 100-102
1059        assert!((100..=102).contains(&CUDA_ERROR_NO_DEVICE));
1060        assert!((100..=102).contains(&CUDA_ERROR_INVALID_DEVICE));
1061        assert!((100..=102).contains(&CUDA_ERROR_DEVICE_NOT_LICENSED));
1062        // Image/context errors: 200+
1063        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
1064        // Launch errors: 700+
1065        assert!(CUDA_ERROR_LAUNCH_FAILED >= 700);
1066        assert!(CUDA_ERROR_ILLEGAL_ADDRESS >= 700);
1067        assert!(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES >= 700);
1068        // Stream capture errors: 900+
1069        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
1070        // Unknown is 999
1071        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
1072    }
1073
1074    #[test]
1075    fn test_handle_debug_format() {
1076        let ctx = CUcontext::default();
1077        let debug_str = format!("{ctx:?}");
1078        assert!(debug_str.starts_with("CUcontext("));
1079    }
1080
1081    #[test]
1082    fn test_handle_equality() {
1083        let a = CUcontext::default();
1084        let b = CUcontext::default();
1085        assert_eq!(a, b);
1086    }
1087
1088    #[test]
1089    fn test_new_handle_types_are_pointer_sized() {
1090        assert_eq!(
1091            std::mem::size_of::<CUtexref>(),
1092            std::mem::size_of::<*mut c_void>()
1093        );
1094        assert_eq!(
1095            std::mem::size_of::<CUsurfref>(),
1096            std::mem::size_of::<*mut c_void>()
1097        );
1098        assert_eq!(
1099            std::mem::size_of::<CUtexObject>(),
1100            std::mem::size_of::<*mut c_void>()
1101        );
1102        assert_eq!(
1103            std::mem::size_of::<CUsurfObject>(),
1104            std::mem::size_of::<*mut c_void>()
1105        );
1106    }
1107
1108    #[test]
1109    fn test_new_handle_defaults_are_null() {
1110        assert!(CUtexref::default().is_null());
1111        assert!(CUsurfref::default().is_null());
1112        assert!(CUtexObject::default().is_null());
1113        assert!(CUsurfObject::default().is_null());
1114    }
1115
1116    #[test]
1117    fn test_memory_type_enum() {
1118        assert_eq!(CUmemorytype::Host as u32, 1);
1119        assert_eq!(CUmemorytype::Device as u32, 2);
1120        assert_eq!(CUmemorytype::Array as u32, 3);
1121        assert_eq!(CUmemorytype::Unified as u32, 4);
1122    }
1123
1124    #[test]
1125    fn test_pointer_attribute_enum() {
1126        assert_eq!(CUpointer_attribute::Context as u32, 1);
1127        assert_eq!(CUpointer_attribute::MemoryType as u32, 2);
1128        assert_eq!(CUpointer_attribute::DevicePointer as u32, 3);
1129        assert_eq!(CUpointer_attribute::HostPointer as u32, 4);
1130        assert_eq!(CUpointer_attribute::IsManaged as u32, 9);
1131        assert_eq!(CUpointer_attribute::DeviceOrdinal as u32, 10);
1132    }
1133
1134    #[test]
1135    fn test_limit_enum() {
1136        assert_eq!(CUlimit::StackSize as u32, 0);
1137        assert_eq!(CUlimit::PrintfFifoSize as u32, 1);
1138        assert_eq!(CUlimit::MallocHeapSize as u32, 2);
1139        assert_eq!(CUlimit::DevRuntimeSyncDepth as u32, 3);
1140        assert_eq!(CUlimit::DevRuntimePendingLaunchCount as u32, 4);
1141        assert_eq!(CUlimit::MaxL2FetchGranularity as u32, 5);
1142        assert_eq!(CUlimit::PersistingL2CacheSize as u32, 6);
1143    }
1144
1145    #[test]
1146    fn test_function_attribute_enum() {
1147        assert_eq!(CUfunction_attribute::MaxThreadsPerBlock as i32, 0);
1148        assert_eq!(CUfunction_attribute::SharedSizeBytes as i32, 1);
1149        assert_eq!(CUfunction_attribute::NumRegs as i32, 4);
1150        assert_eq!(CUfunction_attribute::PtxVersion as i32, 5);
1151        assert_eq!(CUfunction_attribute::BinaryVersion as i32, 6);
1152        assert_eq!(CUfunction_attribute::MaxDynamicSharedSizeBytes as i32, 8);
1153        assert_eq!(
1154            CUfunction_attribute::PreferredSharedMemoryCarveout as i32,
1155            9
1156        );
1157    }
1158}