oxicuda_driver/
ffi.rs

1//! Raw CUDA Driver API FFI types, constants, and enums.
2//!
3//! This module provides the low-level type definitions that mirror the CUDA Driver API
4//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
5//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
6//! entry points.
7//!
8//! # Safety
9//!
10//! All pointer types in this module are raw pointers intended for FFI use.
11//! They must only be used through the safe wrappers provided by higher-level
12//! modules in `oxicuda-driver`.
13
14use std::ffi::c_void;
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// Core scalar type aliases
19// ---------------------------------------------------------------------------
20
21/// Return code from every CUDA Driver API call.
22///
23/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
24/// error code. See the `CUDA_*` constants below for the full catalogue.
25pub type CUresult = u32;
26
27/// Ordinal identifier for a CUDA-capable device (0-based).
28pub type CUdevice = i32;
29
30/// Device-side pointer (64-bit address in GPU virtual memory).
31pub type CUdeviceptr = u64;
32
33// ---------------------------------------------------------------------------
34// Opaque handle helpers
35// ---------------------------------------------------------------------------
36
37macro_rules! define_handle {
38    ($(#[$meta:meta])* $name:ident) => {
39        $(#[$meta])*
40        #[repr(transparent)]
41        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
42        pub struct $name(pub *mut c_void);
43
44        // SAFETY: an opaque CUDA handle is a plain value (a driver-side
45        // identifier), not something this crate ever dereferences. Every
46        // operation on it is performed by the driver through an
47        // `unsafe extern "C"` entry point, and the CUDA Driver API is
48        // documented as thread-safe (a context may even be current on several
49        // threads at once, CUDA >= 4.0). It is therefore sound to send and
50        // share these handles across threads. This is a deliberate part of the
51        // crate's threading model — `DevicePool` shares `Arc<Context>` across
52        // threads and so requires `Context: Sync`.
53        unsafe impl Send for $name {}
54        unsafe impl Sync for $name {}
55
56        impl fmt::Debug for $name {
57            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
58                write!(f, "{}({:p})", stringify!($name), self.0)
59            }
60        }
61
62        impl Default for $name {
63            fn default() -> Self {
64                Self(std::ptr::null_mut())
65            }
66        }
67
68        impl $name {
69            /// Returns `true` if the handle is null (uninitialised).
70            #[inline]
71            pub fn is_null(self) -> bool {
72                self.0.is_null()
73            }
74        }
75    };
76}
77
78/// Like [`define_handle!`] but for CUDA handles that the header types as a
79/// plain `unsigned long long` value rather than an opaque pointer (e.g.
80/// `CUtexObject`, `CUsurfObject`). Using a `u64` payload keeps the ABI correct
81/// on 32-bit targets (where a pointer is only 4 bytes) and renders the numeric
82/// value in `Debug`.
83macro_rules! define_value_handle {
84    ($(#[$meta:meta])* $name:ident) => {
85        $(#[$meta])*
86        #[repr(transparent)]
87        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
88        pub struct $name(pub u64);
89
90        impl fmt::Debug for $name {
91            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
92                write!(f, "{}(0x{:x})", stringify!($name), self.0)
93            }
94        }
95
96        impl Default for $name {
97            fn default() -> Self {
98                Self(0)
99            }
100        }
101
102        impl $name {
103            /// Returns `true` if the handle is null (uninitialised, i.e. `0`).
104            #[inline]
105            pub fn is_null(self) -> bool {
106                self.0 == 0
107            }
108        }
109    };
110}
111
112// ---------------------------------------------------------------------------
113// Handle types
114// ---------------------------------------------------------------------------
115
116define_handle! {
117    /// Opaque handle to a CUDA context.
118    CUcontext
119}
120
121define_handle! {
122    /// Opaque handle to a loaded CUDA module (PTX / cubin).
123    CUmodule
124}
125
126define_handle! {
127    /// Opaque handle to a CUDA kernel function within a module.
128    CUfunction
129}
130
131define_handle! {
132    /// Opaque handle to a CUDA stream (command queue).
133    CUstream
134}
135
136define_handle! {
137    /// Opaque handle to a CUDA event (used for timing and synchronisation).
138    CUevent
139}
140
141define_handle! {
142    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
143    CUmemoryPool
144}
145
146define_handle! {
147    /// Opaque handle to a CUDA texture reference (legacy API).
148    CUtexref
149}
150
151define_handle! {
152    /// Opaque handle to a CUDA surface reference (legacy API).
153    CUsurfref
154}
155
156define_value_handle! {
157    /// Handle to a CUDA texture object (modern bindless API).
158    ///
159    /// `cuda.h` types this as `unsigned long long`, so it is modelled as a
160    /// `u64` value rather than an opaque pointer.
161    CUtexObject
162}
163
164define_value_handle! {
165    /// Handle to a CUDA surface object (modern bindless API).
166    ///
167    /// `cuda.h` types this as `unsigned long long`, so it is modelled as a
168    /// `u64` value rather than an opaque pointer.
169    CUsurfObject
170}
171
172define_handle! {
173    /// Opaque handle to a CUDA kernel (CUDA 12.8+ library-based kernels).
174    ///
175    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
176    /// belongs to.
177    CUkernel
178}
179
180define_handle! {
181    /// Opaque handle to a CUDA library (CUDA 12.8+ JIT library API).
182    ///
183    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
184    /// library that contains a given kernel.
185    CUlibrary
186}
187
188define_handle! {
189    /// Opaque handle to an NVLink multicast object (CUDA 12.8+).
190    ///
191    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
192    /// functions to manage NVLink multicast memory regions across devices.
193    CUmulticastObject
194}
195
196define_handle! {
197    /// Opaque handle to a CUDA JIT linker state (`CUlinkState`).
198    ///
199    /// Created by `cuLinkCreate_v2`, populated by repeated calls to
200    /// `cuLinkAddData_v2`, finalised by `cuLinkComplete`, and freed by
201    /// `cuLinkDestroy`.
202    CUlinkState
203}
204
205// =========================================================================
206// CUmemGenericAllocationHandle — VMM allocation handle (CUDA 11.2+)
207// =========================================================================
208
209/// Opaque handle to a generic memory allocation managed by the CUDA virtual
210/// memory management (VMM) APIs (`cuMemCreate`, `cuMemRelease`, `cuMemMap`).
211///
212/// Although the CUDA header types this as `unsigned long long`, it is an opaque
213/// driver-side identifier and must not be interpreted as a numeric address.
214pub type CUmemGenericAllocationHandle = u64;
215
216// =========================================================================
217// CUmemorytype — memory type identifiers
218// =========================================================================
219
220/// Memory type identifiers returned by pointer attribute queries.
221#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
222#[repr(u32)]
223#[non_exhaustive]
224pub enum CUmemorytype {
225    /// Host (system) memory.
226    Host = 1,
227    /// Device (GPU) memory.
228    Device = 2,
229    /// Array memory.
230    Array = 3,
231    /// Unified (managed) memory.
232    Unified = 4,
233}
234
235// =========================================================================
236// CUpointer_attribute — pointer attribute query keys
237// =========================================================================
238
239/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
240#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
241#[repr(u32)]
242#[non_exhaustive]
243#[allow(non_camel_case_types)]
244pub enum CUpointer_attribute {
245    /// Query the CUDA context associated with a pointer.
246    Context = 1,
247    /// Query the memory type (host / device / unified) of a pointer.
248    MemoryType = 2,
249    /// Query the device pointer corresponding to a host pointer.
250    DevicePointer = 3,
251    /// Query the host pointer corresponding to a device pointer.
252    HostPointer = 4,
253    /// Query whether the memory is managed (unified).
254    IsManaged = 8,
255    /// Query the device ordinal for the pointer.
256    DeviceOrdinal = 9,
257}
258
259// =========================================================================
260// CUlimit — context limit identifiers
261// =========================================================================
262
263/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
264#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
265#[repr(u32)]
266#[non_exhaustive]
267pub enum CUlimit {
268    /// Stack size for each GPU thread.
269    StackSize = 0,
270    /// Size of the printf FIFO.
271    PrintfFifoSize = 1,
272    /// Size of the heap used by `malloc()` on the device.
273    MallocHeapSize = 2,
274    /// Maximum nesting depth of a device runtime launch.
275    DevRuntimeSyncDepth = 3,
276    /// Maximum number of outstanding device runtime launches.
277    DevRuntimePendingLaunchCount = 4,
278    /// L2 cache fetch granularity.
279    MaxL2FetchGranularity = 5,
280    /// Maximum persisting L2 cache size.
281    PersistingL2CacheSize = 6,
282}
283
284// =========================================================================
285// CUfunction_attribute — function attribute query keys
286// =========================================================================
287
288/// Function attribute identifiers passed to `cuFuncGetAttribute`.
289#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
290#[repr(i32)]
291#[non_exhaustive]
292#[allow(non_camel_case_types)]
293pub enum CUfunction_attribute {
294    /// Maximum threads per block for this function.
295    MaxThreadsPerBlock = 0,
296    /// Shared memory used by this function (bytes).
297    SharedSizeBytes = 1,
298    /// Size of user-allocated constant memory (bytes).
299    ConstSizeBytes = 2,
300    /// Size of local memory used by each thread (bytes).
301    LocalSizeBytes = 3,
302    /// Number of registers used by each thread.
303    NumRegs = 4,
304    /// PTX virtual architecture version.
305    PtxVersion = 5,
306    /// Binary architecture version.
307    BinaryVersion = 6,
308    /// Whether this function has been cached.
309    CacheModeCa = 7,
310    /// Maximum dynamic shared memory size (bytes).
311    MaxDynamicSharedSizeBytes = 8,
312    /// Preferred shared memory carve-out.
313    PreferredSharedMemoryCarveout = 9,
314}
315
316// =========================================================================
317// CUresult constants — every documented CUDA Driver API error code
318// =========================================================================
319
320/// The API call returned with no errors.
321pub const CUDA_SUCCESS: CUresult = 0;
322
323/// One or more parameters passed to the API call are not acceptable.
324pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;
325
326/// The API call failed because it was unable to allocate enough memory.
327pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;
328
329/// The CUDA driver has not been initialised via `cuInit`.
330pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;
331
332/// The CUDA driver is shutting down.
333pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;
334
335/// Profiler is not initialised for this run.
336pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;
337
338/// (Deprecated) Profiler not started.
339pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;
340
341/// (Deprecated) Profiler already started.
342pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;
343
344/// (Deprecated) Profiler already stopped.
345pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;
346
347/// Stub library loaded instead of the real driver.
348pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;
349
350/// Device-side assert triggered.
351pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;
352
353/// No CUDA-capable device is detected.
354pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;
355
356/// The device ordinal supplied is out of range.
357pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;
358
359/// The device does not have a valid licence.
360pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;
361
362/// The PTX or cubin image is invalid.
363pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;
364
365/// The supplied context is not valid.
366pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;
367
368/// (Deprecated) Context already current.
369pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;
370
371/// A map or register operation has failed.
372pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;
373
374/// An unmap or unregister operation has failed.
375pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;
376
377/// The specified array is currently mapped.
378pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;
379
380/// The resource is already mapped.
381pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;
382
383/// There is no kernel image available for execution on the device.
384pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;
385
386/// A resource has already been acquired.
387pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;
388
389/// The resource is not mapped.
390pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;
391
392/// A mapped resource is not available for access as an array.
393pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;
394
395/// A mapped resource is not available for access as a pointer.
396pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;
397
398/// An uncorrectable ECC error was detected.
399pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;
400
401/// A PTX JIT limit has been reached.
402pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;
403
404/// The context already has work from another thread bound to it.
405pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;
406
407/// Peer access is not supported across the given devices.
408pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;
409
410/// The PTX JIT compilation was disabled or the PTX is invalid.
411pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;
412
413/// Invalid graphics context.
414pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;
415
416/// NVLINK is uncorrectable.
417pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;
418
419/// JIT compiler not found.
420pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;
421
422/// Unsupported PTX version.
423pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;
424
425/// JIT compilation disabled.
426pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;
427
428/// Unsupported exec-affinity type.
429pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;
430
431/// Unsupported device-side synchronisation on this device.
432pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;
433
434/// The requested source is invalid.
435pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;
436
437/// The named file was not found.
438pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;
439
440/// A shared-object symbol lookup failed.
441pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;
442
443/// The shared-object init function failed.
444pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;
445
446/// An OS call failed.
447pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;
448
449/// The supplied handle is invalid.
450pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;
451
452/// The requested resource is in an illegal state.
453pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;
454
455/// A loss-less compression buffer was detected while doing uncompressed access.
456pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;
457
458/// A named symbol was not found.
459pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;
460
461/// The operation is not ready (asynchronous).
462pub const CUDA_ERROR_NOT_READY: CUresult = 600;
463
464/// An illegal memory address was encountered.
465pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;
466
467/// The kernel launch uses too many resources (registers / shared memory).
468pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;
469
470/// The kernel launch exceeded the time-out enforced by the driver.
471pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;
472
473/// A launch did not occur on a compatible texturing mode.
474pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;
475
476/// Peer access already enabled.
477pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;
478
479/// Peer access has not been enabled.
480pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;
481
482/// The primary context has already been initialised.
483pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;
484
485/// The context is being destroyed.
486pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;
487
488/// A 64-bit device assertion triggered.
489pub const CUDA_ERROR_ASSERT: CUresult = 710;
490
491/// Hardware resources to enable peer access are exhausted.
492pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;
493
494/// The host-side memory region is already registered.
495pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;
496
497/// The host-side memory region is not registered.
498pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;
499
500/// Hardware stack overflow on the device.
501pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;
502
503/// Illegal instruction encountered on the device.
504pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;
505
506/// Misaligned address on the device.
507pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;
508
509/// Invalid address space.
510pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;
511
512/// Invalid program counter on the device.
513pub const CUDA_ERROR_INVALID_PC: CUresult = 718;
514
515/// The kernel launch failed.
516pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;
517
518/// Cooperative launch is too large for the device/kernel.
519pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;
520
521/// The API call is not permitted in the active context.
522pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;
523
524/// The API call is not supported by the current driver/device combination.
525pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;
526
527/// System not ready for CUDA operations.
528pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;
529
530/// System driver mismatch.
531pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;
532
533/// Old-style context incompatible with CUDA 3.2+ API.
534pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;
535
536/// MPS connection failed.
537pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;
538
539/// MPS RPC failure.
540pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;
541
542/// MPS server not ready.
543pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;
544
545/// MPS maximum clients reached.
546pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;
547
548/// MPS maximum connections reached.
549pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;
550
551/// MPS client terminated.
552pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;
553
554/// CDP not supported.
555pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;
556
557/// CDP version mismatch.
558pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;
559
560/// Stream capture unsupported.
561pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;
562
563/// Stream capture invalidated.
564pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;
565
566/// Stream capture merge not permitted.
567pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;
568
569/// Stream capture unmatched.
570pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;
571
572/// Stream capture unjoined.
573pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;
574
575/// Stream capture isolation violation.
576pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;
577
578/// Implicit stream in graph capture.
579pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;
580
581/// Captured event error.
582pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;
583
584/// Stream capture wrong thread.
585pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;
586
587/// The async operation timed out.
588pub const CUDA_ERROR_TIMEOUT: CUresult = 909;
589
590/// The graph update failed.
591pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;
592
593/// External device error.
594pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;
595
596/// Invalid cluster size.
597pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;
598
599/// Function not loaded.
600pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;
601
602/// Invalid resource type.
603pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;
604
605/// Invalid resource configuration.
606pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;
607
608/// An unknown internal error occurred.
609pub const CUDA_ERROR_UNKNOWN: CUresult = 999;
610
611// =========================================================================
612// CUdevice_attribute — device property query keys
613// =========================================================================
614
615/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
616#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
617#[repr(i32)]
618#[non_exhaustive]
619#[allow(non_camel_case_types)]
620pub enum CUdevice_attribute {
621    /// Maximum number of threads per block.
622    MaxThreadsPerBlock = 1,
623    /// Maximum x-dimension of a block.
624    MaxBlockDimX = 2,
625    /// Maximum y-dimension of a block.
626    MaxBlockDimY = 3,
627    /// Maximum z-dimension of a block.
628    MaxBlockDimZ = 4,
629    /// Maximum x-dimension of a grid.
630    MaxGridDimX = 5,
631    /// Maximum y-dimension of a grid.
632    MaxGridDimY = 6,
633    /// Maximum z-dimension of a grid.
634    MaxGridDimZ = 7,
635    /// Maximum shared memory available per block (bytes).
636    MaxSharedMemoryPerBlock = 8,
637    /// Total amount of constant memory on the device (bytes).
638    TotalConstantMemory = 9,
639    /// Warp size in threads.
640    WarpSize = 10,
641    /// Maximum pitch allowed by memory copies (bytes).
642    MaxPitch = 11,
643    /// Maximum number of 32-bit registers per block.
644    MaxRegistersPerBlock = 12,
645    /// Peak clock frequency in kHz.
646    ClockRate = 13,
647    /// Alignment requirement for textures.
648    TextureAlignment = 14,
649    /// Device can possibly copy memory and execute a kernel concurrently.
650    GpuOverlap = 15,
651    /// Number of multiprocessors on the device.
652    MultiprocessorCount = 16,
653    /// Whether there is a run-time limit on kernels.
654    KernelExecTimeout = 17,
655    /// Device is integrated (shares host memory).
656    Integrated = 18,
657    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
658    CanMapHostMemory = 19,
659    /// Compute mode: default, exclusive, prohibited, etc.
660    ComputeMode = 20,
661    /// Maximum 1D texture width.
662    MaxTexture1DWidth = 21,
663    /// Maximum 2D texture width.
664    MaxTexture2DWidth = 22,
665    /// Maximum 2D texture height.
666    MaxTexture2DHeight = 23,
667    /// Maximum 3D texture width.
668    MaxTexture3DWidth = 24,
669    /// Maximum 3D texture height.
670    MaxTexture3DHeight = 25,
671    /// Maximum 3D texture depth.
672    MaxTexture3DDepth = 26,
673    /// Maximum 2D layered texture width.
674    MaxTexture2DLayeredWidth = 27,
675    /// Maximum 2D layered texture height.
676    MaxTexture2DLayeredHeight = 28,
677    /// Maximum layers in a 2D layered texture.
678    MaxTexture2DLayeredLayers = 29,
679    /// Alignment requirement for surfaces.
680    SurfaceAlignment = 30,
681    /// Device can execute multiple kernels concurrently.
682    ConcurrentKernels = 31,
683    /// Device supports ECC memory.
684    EccEnabled = 32,
685    /// PCI bus ID of the device.
686    PciBusId = 33,
687    /// PCI device ID of the device.
688    PciDeviceId = 34,
689    /// Device is using TCC (Tesla Compute Cluster) driver model.
690    TccDriver = 35,
691    /// Peak memory clock frequency in kHz.
692    MemoryClockRate = 36,
693    /// Global memory bus width in bits.
694    GlobalMemoryBusWidth = 37,
695    /// Size of L2 cache in bytes.
696    L2CacheSize = 38,
697    /// Maximum resident threads per multiprocessor.
698    MaxThreadsPerMultiprocessor = 39,
699    /// Number of asynchronous engines.
700    AsyncEngineCount = 40,
701    /// Device shares a unified address space with the host.
702    UnifiedAddressing = 41,
703    /// Maximum 1D layered texture width.
704    MaxTexture1DLayeredWidth = 42,
705    /// Maximum layers in a 1D layered texture.
706    MaxTexture1DLayeredLayers = 43,
707    /// Device can gather 2D textures (`CAN_TEX2D_GATHER`, deprecated).
708    CanTex2dGather = 44,
709    /// Maximum 2D texture gather width.
710    MaxTexture2DGatherWidth = 45,
711    /// Maximum 2D texture gather height.
712    MaxTexture2DGatherHeight = 46,
713    /// Alternate maximum 3D texture width.
714    MaxTexture3DWidthAlt = 47,
715    /// Alternate maximum 3D texture height.
716    MaxTexture3DHeightAlt = 48,
717    /// Alternate maximum 3D texture depth.
718    MaxTexture3DDepthAlt = 49,
719    /// PCI domain ID.
720    PciDomainId = 50,
721    /// Texture pitch alignment.
722    TexturePitchAlignment = 51,
723    /// Maximum width for a cubemap texture.
724    MaxTextureCubemapWidth = 52,
725    /// Maximum width for a cubemap layered texture.
726    MaxTextureCubemapLayeredWidth = 53,
727    /// Maximum layers in a cubemap layered texture.
728    MaxTextureCubemapLayeredLayers = 54,
729    /// Maximum 1D surface width.
730    MaxSurface1DWidth = 55,
731    /// Maximum 2D surface width.
732    MaxSurface2DWidth = 56,
733    /// Maximum 2D surface height.
734    MaxSurface2DHeight = 57,
735    /// Maximum 3D surface width.
736    MaxSurface3DWidth = 58,
737    /// Maximum 3D surface height.
738    MaxSurface3DHeight = 59,
739    /// Maximum 3D surface depth.
740    MaxSurface3DDepth = 60,
741    /// Maximum 1D layered surface width.
742    MaxSurface1DLayeredWidth = 61,
743    /// Maximum layers in a 1D layered surface.
744    MaxSurface1DLayeredLayers = 62,
745    /// Maximum 2D layered surface width.
746    MaxSurface2DLayeredWidth = 63,
747    /// Maximum 2D layered surface height.
748    MaxSurface2DLayeredHeight = 64,
749    /// Maximum layers in a 2D layered surface.
750    MaxSurface2DLayeredLayers = 65,
751    /// Maximum cubemap surface width.
752    MaxSurfaceCubemapWidth = 66,
753    /// Maximum cubemap layered surface width.
754    MaxSurfaceCubemapLayeredWidth = 67,
755    /// Maximum layers in a cubemap layered surface.
756    MaxSurfaceCubemapLayeredLayers = 68,
757    /// Maximum 1D linear texture width (deprecated).
758    MaxTexture1DLinearWidth = 69,
759    /// Maximum 2D linear texture width.
760    MaxTexture2DLinearWidth = 70,
761    /// Maximum 2D linear texture height.
762    MaxTexture2DLinearHeight = 71,
763    /// Maximum 2D linear texture pitch (bytes).
764    MaxTexture2DLinearPitch = 72,
765    /// Maximum mipmapped 2D texture width.
766    MaxTexture2DMipmappedWidth = 73,
767    /// Maximum mipmapped 2D texture height.
768    MaxTexture2DMipmappedHeight = 74,
769    /// Major compute capability version number.
770    ComputeCapabilityMajor = 75,
771    /// Minor compute capability version number.
772    ComputeCapabilityMinor = 76,
773    /// Maximum mipmapped 1D texture width.
774    MaxTexture1DMipmappedWidth = 77,
775    /// Device supports stream priorities.
776    StreamPrioritiesSupported = 78,
777    /// Device supports caching globals in L1 cache.
778    GlobalL1CacheSupported = 79,
779    /// Device supports caching locals in L1 cache.
780    LocalL1CacheSupported = 80,
781    /// Maximum shared memory per multiprocessor (bytes).
782    MaxSharedMemoryPerMultiprocessor = 81,
783    /// Maximum registers per multiprocessor.
784    MaxRegistersPerMultiprocessor = 82,
785    /// Device supports managed memory.
786    ManagedMemory = 83,
787    /// Device is on a multi-GPU board.
788    IsMultiGpuBoard = 84,
789    /// Unique identifier for the multi-GPU board group.
790    MultiGpuBoardGroupId = 85,
791    /// Host-visible native-atomic support for float operations.
792    HostNativeAtomicSupported = 86,
793    /// Ratio of single-to-double precision performance.
794    SingleToDoublePrecisionPerfRatio = 87,
795    /// Device supports pageable memory access.
796    PageableMemoryAccess = 88,
797    /// Device can access host registered memory at the same virtual address.
798    ConcurrentManagedAccess = 89,
799    /// Device supports compute preemption.
800    ComputePreemptionSupported = 90,
801    /// Device can access host memory via pageable accesses.
802    CanUseHostPointerForRegisteredMem = 91,
803    /// Deprecated: use of stream memory operations v1 (CUDA internal).
804    CanUseStreamMemOpsV1 = 92,
805    /// Deprecated: use of 64-bit stream memory operations v1 (CUDA internal).
806    CanUse64BitStreamMemOpsV1 = 93,
807    /// Deprecated: use of stream wait-value NOR v1 (CUDA internal).
808    CanUseStreamWaitValueNorV1 = 94,
809    /// Device supports cooperative kernel launches.
810    CooperativeLaunch = 95,
811    /// Device supports cooperative kernel launches across multiple GPUs.
812    CooperativeMultiDeviceLaunch = 96,
813    /// Maximum optin shared memory per block.
814    MaxSharedMemoryPerBlockOptin = 97,
815    /// Device supports flushing of outstanding remote writes.
816    CanFlushRemoteWrites = 98,
817    /// Device supports host-side memory-register functions.
818    HostRegisterSupported = 99,
819    /// Device supports pageable memory access using host page tables.
820    PageableMemoryAccessUsesHostPageTables = 100,
821    /// Device supports direct access to managed memory on the host.
822    DirectManagedMemAccessFromHost = 101,
823    /// Device supports virtual memory management APIs.
824    VirtualMemoryManagementSupported = 102,
825    /// Device supports handle-type POSIX file descriptors for IPC.
826    HandleTypePosixFileDescriptorSupported = 103,
827    /// Device supports handle-type Win32 handles for IPC.
828    HandleTypeWin32HandleSupported = 104,
829    /// Device supports handle-type Win32 KMT handles for IPC.
830    HandleTypeWin32KmtHandleSupported = 105,
831    /// Maximum blocks per multiprocessor.
832    MaxBlocksPerMultiprocessor = 106,
833    /// Device supports generic compression for memory.
834    GenericCompressionSupported = 107,
835    /// Maximum persisting L2 cache size (bytes).
836    MaxPersistingL2CacheSize = 108,
837    /// Maximum access-policy window size for L2 cache.
838    MaxAccessPolicyWindowSize = 109,
839    /// Device supports RDMA APIs via `cuMemRangeGetAttribute`.
840    GpuDirectRdmaWithCudaVmmSupported = 110,
841    /// Reserved range of shared memory per SM (bytes).
842    ReservedSharedMemoryPerBlock = 111,
843    /// Device supports sparse CUDA arrays.
844    SparseCudaArraySupported = 112,
845    /// Device supports read-only registration of host memory.
846    ReadOnlyHostRegisterSupported = 113,
847    /// Device supports timeline semaphore interop.
848    TimelineSemaphoreInteropSupported = 114,
849    /// Device supports memory pools (`cudaMallocAsync`).
850    MemoryPoolsSupported = 115,
851    /// GPU direct RDMA is supported.
852    GpuDirectRdmaSupported = 116,
853    /// GPU direct RDMA flush-writes order.
854    GpuDirectRdmaFlushWritesOptions = 117,
855    /// GPU direct RDMA writes ordering.
856    GpuDirectRdmaWritesOrdering = 118,
857    /// Memory pool supported handle types.
858    MemoryPoolSupportedHandleTypes = 119,
859    /// Device supports cluster launch.
860    ClusterLaunch = 120,
861    /// Deferred mapping CUDA array supported.
862    DeferredMappingCudaArraySupported = 121,
863    /// Device supports 64-bit stream memory operations.
864    CanUse64BitStreamMemOps = 122,
865    /// Device supports stream wait-value NOR.
866    CanUseStreamWaitValueNor = 123,
867    /// Device supports importing memory from a Linux `dma_buf`.
868    DmaBufSupported = 124,
869    /// Device supports IPC event handles.
870    IpcEventSupported = 125,
871    /// Device supports mem-sync domain count.
872    MemSyncDomainCount = 126,
873    /// Device supports tensor-map access to data.
874    TensorMapAccessSupported = 127,
875    /// Device supports exporting memory as a fabric handle
876    /// (`HANDLE_TYPE_FABRIC_SUPPORTED`).
877    GpuDirectRdmaFabricSupported = 128,
878    /// Unified function pointers supported.
879    UnifiedFunctionPointers = 129,
880    /// NUMA config.
881    NumaConfig = 130,
882    /// NUMA id.
883    NumaId = 131,
884    /// Device supports multicast.
885    MulticastSupported = 132,
886    /// Device supports MPS features.
887    MpsEnabled = 133,
888    /// Host-NUMA identifier.
889    HostNumaId = 134,
890}
891
892// =========================================================================
893// CUjit_option — options for the JIT compiler
894// =========================================================================
895
896/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
897#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
898#[repr(u32)]
899#[non_exhaustive]
900#[allow(non_camel_case_types)]
901pub enum CUjit_option {
902    /// Maximum number of registers that a thread may use.
903    MaxRegisters = 0,
904    /// Number of threads per block for the JIT target.
905    ThreadsPerBlock = 1,
906    /// Wall-clock time (ms) for compilation.
907    WallTime = 2,
908    /// Pointer to a buffer for info log output.
909    InfoLogBuffer = 3,
910    /// Size (bytes) of the info-log buffer.
911    InfoLogBufferSizeBytes = 4,
912    /// Pointer to a buffer for error log output.
913    ErrorLogBuffer = 5,
914    /// Size (bytes) of the error-log buffer.
915    ErrorLogBufferSizeBytes = 6,
916    /// Optimisation level (0-4).
917    OptimizationLevel = 7,
918    /// Determines the target based on the current attached context.
919    TargetFromCuContext = 8,
920    /// Specific compute target (sm_XX).
921    Target = 9,
922    /// Fallback strategy when exact match is not found.
923    FallbackStrategy = 10,
924    /// Specifies whether to generate debug info.
925    GenerateDebugInfo = 11,
926    /// Generate verbose log messages.
927    LogVerbose = 12,
928    /// Generate line-number information.
929    GenerateLineInfo = 13,
930    /// Cache mode (on / off).
931    CacheMode = 14,
932    /// (Internal) New SM3X option.
933    Sm3xOpt = 15,
934    /// Fast compile flag.
935    FastCompile = 16,
936    /// Global symbol names.
937    GlobalSymbolNames = 17,
938    /// Global symbol addresses.
939    GlobalSymbolAddresses = 18,
940    /// Number of global symbols.
941    GlobalSymbolCount = 19,
942    /// LTO flag.
943    Lto = 20,
944    /// FTZ (flush-to-zero) flag.
945    Ftz = 21,
946    /// Prec-div flag.
947    PrecDiv = 22,
948    /// Prec-sqrt flag.
949    PrecSqrt = 23,
950    /// FMA flag.
951    Fma = 24,
952    /// Referenced kernel names.
953    ReferencedKernelNames = 25,
954    /// Referenced kernel count.
955    ReferencedKernelCount = 26,
956    /// Referenced variable names.
957    ReferencedVariableNames = 27,
958    /// Referenced variable count.
959    ReferencedVariableCount = 28,
960    /// Optimise unused device variables.
961    OptimizeUnusedDeviceVariables = 29,
962    /// Position-independent code.
963    PositionIndependentCode = 30,
964}
965
966// =========================================================================
967// CUjitInputType — input types for the linker
968// =========================================================================
969
970/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
971#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
972#[repr(u32)]
973#[non_exhaustive]
974pub enum CUjitInputType {
975    /// PTX source code.
976    Ptx = 1,
977    /// Compiled device code (cubin).
978    Cubin = 2,
979    /// Fat binary bundle.
980    Fatbin = 3,
981    /// Relocatable device object.
982    Object = 4,
983    /// Device code library.
984    Library = 5,
985}
986
987// =========================================================================
988// CUmemLocationType — location-type discriminant (CUDA 11.2+ VMM)
989// =========================================================================
990
991/// Specifies the kind of location described by a [`CUmemLocation`].
992///
993/// Mirrors `CUmemLocationType` in `cuda.h`.  Used by the virtual-memory
994/// management APIs to identify where a memory allocation resides or which
995/// device should be granted access.
996#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
997#[repr(u32)]
998#[non_exhaustive]
999pub enum CUmemLocationType {
1000    /// Invalid / uninitialised location type.
1001    Invalid = 0,
1002    /// Location is a CUDA device (the `id` field is a device ordinal).
1003    Device = 1,
1004    /// Location is the host (CPU) memory.
1005    Host = 2,
1006    /// Location is a specific NUMA node on the host.
1007    HostNuma = 3,
1008    /// Location is the NUMA node currently bound to the calling thread.
1009    HostNumaCurrent = 4,
1010}
1011
1012// =========================================================================
1013// CUmemAllocationType — allocation-kind discriminant (CUDA 11.2+ VMM)
1014// =========================================================================
1015
1016/// Type of memory allocation requested via the VMM APIs.
1017///
1018/// Mirrors `CUmemAllocationType` in `cuda.h`.
1019#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1020#[repr(u32)]
1021#[non_exhaustive]
1022pub enum CUmemAllocationType {
1023    /// Invalid / uninitialised allocation type.
1024    Invalid = 0,
1025    /// Pinned (page-locked) GPU memory backed by physical device frames.
1026    Pinned = 1,
1027    /// Sentinel value used by the CUDA driver to mark forward-compatible
1028    /// extensions; always equal to the maximum 32-bit signed integer.
1029    Max = 0x7fff_ffff,
1030}
1031
1032// =========================================================================
1033// CUmemAllocationHandleType — exportable handle bitfield (CUDA 11.2+ VMM)
1034// =========================================================================
1035
1036/// Set of operating-system handle types that the driver may export for a
1037/// VMM allocation.  Treated as a bitfield in the CUDA C API.
1038///
1039/// Mirrors `CUmemAllocationHandleType` in `cuda.h`.
1040#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1041#[repr(u32)]
1042#[non_exhaustive]
1043pub enum CUmemAllocationHandleType {
1044    /// No exportable handle is requested.
1045    None = 0,
1046    /// POSIX file descriptor (Linux).
1047    PosixFileDescriptor = 1,
1048    /// Win32 NT handle.
1049    Win32 = 2,
1050    /// Win32 KMT handle (legacy kernel-mode-thunk).
1051    Win32Kmt = 4,
1052    /// Fabric handle for multi-host shared memory (CUDA 12.0+).
1053    Fabric = 8,
1054}
1055
1056// =========================================================================
1057// CUmemAccessFlags — peer-access permissions for VMM allocations
1058// =========================================================================
1059
1060/// Access flags applied via `cuMemSetAccess` to a VMM allocation, controlling
1061/// whether a particular [`CUmemLocation`] may read or write the mapping.
1062///
1063/// Mirrors `CUmemAccess_flags` in `cuda.h`.  Renamed to follow Rust naming
1064/// conventions; the discriminant values are unchanged.
1065#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1066#[repr(u32)]
1067#[non_exhaustive]
1068pub enum CUmemAccessFlags {
1069    /// No access permitted from the location.
1070    None = 0,
1071    /// Read-only access permitted.
1072    Read = 1,
1073    /// Read-write access permitted.
1074    ReadWrite = 3,
1075    /// Sentinel value used by the CUDA driver for forward compatibility.
1076    Max = 0x7fff_ffff,
1077}
1078
1079// =========================================================================
1080// CUmemLocation — memory-location descriptor (CUDA 11.2+ VMM)
1081// =========================================================================
1082
1083/// Describes a physical memory location for the VMM and pool APIs.
1084///
1085/// Mirrors `CUmemLocation` in `cuda.h`.  The interpretation of `id` depends on
1086/// `loc_type`: for [`CUmemLocationType::Device`] it is a device ordinal, for
1087/// [`CUmemLocationType::HostNuma`] it is a NUMA node identifier, and for the
1088/// other variants it must be set to `0`.
1089///
1090/// The `loc_type` field is stored as a raw `u32` so that any forward-compatible
1091/// value emitted by a future driver can be round-tripped without UB; convert
1092/// to / from [`CUmemLocationType`] manually when interpreting it.
1093#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
1094#[repr(C)]
1095pub struct CUmemLocation {
1096    /// Location type; see [`CUmemLocationType`].
1097    pub loc_type: u32,
1098    /// Identifier whose meaning depends on `loc_type`.
1099    pub id: i32,
1100}
1101
1102// =========================================================================
1103// CUmemcpyAttributes — per-copy attributes for cuMemcpyBatchAsync (CUDA 12.8+)
1104// =========================================================================
1105
1106/// Attributes describing one entry (or group of entries) in a
1107/// `cuMemcpyBatchAsync` batch.
1108///
1109/// Mirrors `CUmemcpyAttributes` in `cuda.h` (CUDA 12.8+). `src_access_order`
1110/// is stored as a raw `u32` (the `CUmemcpySrcAccessOrder` enum) so that a
1111/// forward-compatible value emitted by a future driver can be round-tripped
1112/// without UB.
1113#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
1114#[repr(C)]
1115pub struct CUmemcpyAttributes {
1116    /// Source access order; see `CUmemcpySrcAccessOrder` in `cuda.h`.
1117    pub src_access_order: u32,
1118    /// Hint for the source memory location.
1119    pub src_loc_hint: CUmemLocation,
1120    /// Hint for the destination memory location.
1121    pub dst_loc_hint: CUmemLocation,
1122    /// Additional copy flags (reserved; `0` on current drivers).
1123    pub flags: u32,
1124}
1125
1126// =========================================================================
1127// CUmemAllocationProp — properties of a VMM allocation request
1128// =========================================================================
1129
1130/// Properties passed to `cuMemCreate` to describe a new VMM allocation.
1131///
1132/// Mirrors `CUmemAllocationProp` in `cuda.h`.
1133///
1134/// The `alloc_type`, `requested_handle_types` and `alloc_flags` fields are
1135/// stored as raw integers so that future driver extensions cannot trigger UB
1136/// via unknown discriminants; convert them to / from
1137/// [`CUmemAllocationType`] / [`CUmemAllocationHandleType`] when interpreting.
1138#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1139#[repr(C)]
1140pub struct CUmemAllocationProp {
1141    /// Allocation type; see [`CUmemAllocationType`].
1142    pub alloc_type: u32,
1143    /// Bitfield of OS handle types to export; see
1144    /// [`CUmemAllocationHandleType`].
1145    pub requested_handle_types: u32,
1146    /// Physical location of the allocation.
1147    pub location: CUmemLocation,
1148    /// Win32 security attributes pointer; null on non-Windows platforms or
1149    /// when no specific security descriptor is required.
1150    pub win32_handle_meta_data: *mut c_void,
1151    /// Reserved for future allocation flags; must be `0` on current drivers.
1152    pub alloc_flags: u64,
1153}
1154
1155// SAFETY: The struct contains a raw pointer (`win32_handle_meta_data`) that
1156// callers are responsible for managing.  The CUDA driver treats the pointer
1157// as opaque, so the struct itself is logically Send+Sync.
1158unsafe impl Send for CUmemAllocationProp {}
1159unsafe impl Sync for CUmemAllocationProp {}
1160
1161impl Default for CUmemAllocationProp {
1162    fn default() -> Self {
1163        Self {
1164            alloc_type: 0,
1165            requested_handle_types: 0,
1166            location: CUmemLocation::default(),
1167            win32_handle_meta_data: std::ptr::null_mut(),
1168            alloc_flags: 0,
1169        }
1170    }
1171}
1172
1173// =========================================================================
1174// CUmemAccessDesc — per-location access permissions for `cuMemSetAccess`
1175// =========================================================================
1176
1177/// Per-location access descriptor for `cuMemSetAccess`.
1178///
1179/// Mirrors `CUmemAccessDesc` in `cuda.h`.  The `flags` field stores a
1180/// [`CUmemAccessFlags`] value as a raw `u32` for FFI safety.
1181#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
1182#[repr(C)]
1183pub struct CUmemAccessDesc {
1184    /// Memory location whose access permission is being changed.
1185    pub location: CUmemLocation,
1186    /// Access flags; see [`CUmemAccessFlags`].
1187    pub flags: u32,
1188}
1189
1190// =========================================================================
1191// CUmemPoolProps — properties of a stream-ordered memory pool
1192// =========================================================================
1193
1194/// Properties passed to `cuMemPoolCreate`.
1195///
1196/// Mirrors `CUmemPoolProps` in `cuda.h`.  The trailing `reserved` field is
1197/// part of the public ABI: the CUDA driver expects 56 zero bytes there to
1198/// preserve forward compatibility.
1199#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1200#[repr(C)]
1201pub struct CUmemPoolProps {
1202    /// Allocation type to use when servicing pool requests; see
1203    /// [`CUmemAllocationType`].
1204    pub alloc_type: u32,
1205    /// Bitfield of OS handle types to export; see
1206    /// [`CUmemAllocationHandleType`].
1207    pub handle_types: u32,
1208    /// Physical location backing the pool.
1209    pub location: CUmemLocation,
1210    /// Win32 security-attributes pointer; null on non-Windows platforms or
1211    /// when no specific security descriptor is required.
1212    pub win32_security_attributes: *mut c_void,
1213    /// Maximum aggregate size (bytes) the pool may hold.  `0` means
1214    /// unlimited.
1215    pub max_size: usize,
1216    /// Reserved padding required by the CUDA ABI; must remain zeroed.
1217    pub reserved: [u8; 56],
1218}
1219
1220// SAFETY: The struct contains a raw pointer (`win32_security_attributes`) that
1221// callers are responsible for managing.  The CUDA driver treats the pointer
1222// as opaque, so the struct itself is logically Send+Sync.
1223unsafe impl Send for CUmemPoolProps {}
1224unsafe impl Sync for CUmemPoolProps {}
1225
1226impl Default for CUmemPoolProps {
1227    fn default() -> Self {
1228        Self {
1229            alloc_type: 0,
1230            handle_types: 0,
1231            location: CUmemLocation::default(),
1232            win32_security_attributes: std::ptr::null_mut(),
1233            max_size: 0,
1234            reserved: [0u8; 56],
1235        }
1236    }
1237}
1238
1239// =========================================================================
1240// CUDA_MEMCPY2D — descriptor for `cuMemcpy2D_v2`
1241// =========================================================================
1242
1243/// Descriptor for a 2-D memory copy executed via `cuMemcpy2D_v2`.
1244///
1245/// Mirrors `CUDA_MEMCPY2D` in `cuda.h`.  The CUDA driver inspects only the
1246/// fields appropriate for the source / destination memory types; the
1247/// remaining fields **must** be zeroed.  Use [`CUDA_MEMCPY2D::default`] to
1248/// obtain a zero-initialised descriptor and only set the fields you need.
1249///
1250/// `src_memory_type` and `dst_memory_type` are stored as raw `u32` for FFI
1251/// safety; convert to / from [`CUmemorytype`] manually.
1252#[derive(Debug, Clone, Copy)]
1253#[repr(C)]
1254pub struct CUDA_MEMCPY2D {
1255    /// Source X offset in bytes.
1256    pub src_x_in_bytes: usize,
1257    /// Source Y offset in rows.
1258    pub src_y: usize,
1259    /// Source memory type; see [`CUmemorytype`].
1260    pub src_memory_type: u32,
1261    /// Source host pointer (only valid when `src_memory_type == Host`).
1262    pub src_host: *const c_void,
1263    /// Source device pointer (only valid when `src_memory_type == Device`).
1264    pub src_device: CUdeviceptr,
1265    /// Source CUDA array (only valid when `src_memory_type == Array`).
1266    pub src_array: crate::ffi::CUarray,
1267    /// Source pitch in bytes (`0` selects a tightly-packed layout).
1268    pub src_pitch: usize,
1269    /// Destination X offset in bytes.
1270    pub dst_x_in_bytes: usize,
1271    /// Destination Y offset in rows.
1272    pub dst_y: usize,
1273    /// Destination memory type; see [`CUmemorytype`].
1274    pub dst_memory_type: u32,
1275    /// Destination host pointer (only valid when `dst_memory_type == Host`).
1276    pub dst_host: *mut c_void,
1277    /// Destination device pointer (only valid when `dst_memory_type == Device`).
1278    pub dst_device: CUdeviceptr,
1279    /// Destination CUDA array (only valid when `dst_memory_type == Array`).
1280    pub dst_array: crate::ffi::CUarray,
1281    /// Destination pitch in bytes (`0` selects a tightly-packed layout).
1282    pub dst_pitch: usize,
1283    /// Width of the copied region in bytes.
1284    pub width_in_bytes: usize,
1285    /// Height of the copied region in rows.
1286    pub height: usize,
1287}
1288
1289// SAFETY: The struct contains raw pointers and a CUDA array handle; callers
1290// are responsible for managing the underlying memory and handles.  Treating
1291// the descriptor itself as Send+Sync mirrors the C-side struct, which the
1292// driver may inspect from any thread.
1293unsafe impl Send for CUDA_MEMCPY2D {}
1294unsafe impl Sync for CUDA_MEMCPY2D {}
1295
1296impl Default for CUDA_MEMCPY2D {
1297    fn default() -> Self {
1298        Self {
1299            src_x_in_bytes: 0,
1300            src_y: 0,
1301            src_memory_type: 0,
1302            src_host: std::ptr::null(),
1303            src_device: 0,
1304            src_array: crate::ffi::CUarray::default(),
1305            src_pitch: 0,
1306            dst_x_in_bytes: 0,
1307            dst_y: 0,
1308            dst_memory_type: 0,
1309            dst_host: std::ptr::null_mut(),
1310            dst_device: 0,
1311            dst_array: crate::ffi::CUarray::default(),
1312            dst_pitch: 0,
1313            width_in_bytes: 0,
1314            height: 0,
1315        }
1316    }
1317}
1318
1319// =========================================================================
1320// Submodules — extracted per refactoring policy (<2000 lines per file)
1321// =========================================================================
1322
1323#[path = "ffi_constants.rs"]
1324mod ffi_constants;
1325pub use ffi_constants::*;
1326
1327#[path = "ffi_launch.rs"]
1328mod ffi_launch;
1329pub use ffi_launch::*;
1330
1331#[path = "ffi_descriptors.rs"]
1332mod ffi_descriptors;
1333pub use ffi_descriptors::*;
1334
1335#[path = "ffi_graph.rs"]
1336mod ffi_graph;
1337pub use ffi_graph::*;
1338
1339// =========================================================================
1340// Tests
1341// =========================================================================
1342
1343#[cfg(test)]
1344mod tests {
1345    use super::*;
1346
1347    #[test]
1348    fn test_cuda_success_is_zero() {
1349        assert_eq!(CUDA_SUCCESS, 0);
1350    }
1351
1352    #[test]
1353    fn test_opaque_types_are_pointer_sized() {
1354        assert_eq!(
1355            std::mem::size_of::<CUcontext>(),
1356            std::mem::size_of::<*mut c_void>()
1357        );
1358        assert_eq!(
1359            std::mem::size_of::<CUmodule>(),
1360            std::mem::size_of::<*mut c_void>()
1361        );
1362        assert_eq!(
1363            std::mem::size_of::<CUstream>(),
1364            std::mem::size_of::<*mut c_void>()
1365        );
1366        assert_eq!(
1367            std::mem::size_of::<CUevent>(),
1368            std::mem::size_of::<*mut c_void>()
1369        );
1370        assert_eq!(
1371            std::mem::size_of::<CUfunction>(),
1372            std::mem::size_of::<*mut c_void>()
1373        );
1374        assert_eq!(
1375            std::mem::size_of::<CUmemoryPool>(),
1376            std::mem::size_of::<*mut c_void>()
1377        );
1378    }
1379
1380    #[test]
1381    fn test_handle_default_is_null() {
1382        assert!(CUcontext::default().is_null());
1383        assert!(CUmodule::default().is_null());
1384        assert!(CUfunction::default().is_null());
1385        assert!(CUstream::default().is_null());
1386        assert!(CUevent::default().is_null());
1387        assert!(CUmemoryPool::default().is_null());
1388    }
1389
1390    #[test]
1391    fn test_device_attribute_repr() {
1392        // Original variants
1393        assert_eq!(CUdevice_attribute::MaxThreadsPerBlock as i32, 1);
1394        assert_eq!(CUdevice_attribute::WarpSize as i32, 10);
1395        assert_eq!(CUdevice_attribute::MultiprocessorCount as i32, 16);
1396        assert_eq!(CUdevice_attribute::ComputeCapabilityMajor as i32, 75);
1397        assert_eq!(CUdevice_attribute::ComputeCapabilityMinor as i32, 76);
1398        assert_eq!(CUdevice_attribute::MaxBlocksPerMultiprocessor as i32, 106);
1399        assert_eq!(CUdevice_attribute::L2CacheSize as i32, 38);
1400        assert_eq!(
1401            CUdevice_attribute::MaxSharedMemoryPerMultiprocessor as i32,
1402            81
1403        );
1404        assert_eq!(CUdevice_attribute::ManagedMemory as i32, 83);
1405
1406        // Corrected discriminants (cuda.h `CUdevice_attribute` values)
1407        assert_eq!(CUdevice_attribute::CanTex2dGather as i32, 44);
1408        assert_eq!(CUdevice_attribute::MaxTexture2DGatherWidth as i32, 45);
1409        assert_eq!(CUdevice_attribute::MaxTexture2DGatherHeight as i32, 46);
1410        assert_eq!(CUdevice_attribute::MaxTexture3DWidthAlt as i32, 47);
1411        assert_eq!(CUdevice_attribute::MaxTexture3DHeightAlt as i32, 48);
1412        assert_eq!(CUdevice_attribute::MaxTexture3DDepthAlt as i32, 49);
1413        assert_eq!(CUdevice_attribute::MaxTextureCubemapWidth as i32, 52);
1414        assert_eq!(CUdevice_attribute::CanUseStreamMemOpsV1 as i32, 92);
1415        assert_eq!(CUdevice_attribute::CanUse64BitStreamMemOpsV1 as i32, 93);
1416        assert_eq!(CUdevice_attribute::CanUseStreamWaitValueNorV1 as i32, 94);
1417        assert_eq!(CUdevice_attribute::GlobalL1CacheSupported as i32, 79);
1418        assert_eq!(CUdevice_attribute::LocalL1CacheSupported as i32, 80);
1419        assert_eq!(
1420            CUdevice_attribute::VirtualMemoryManagementSupported as i32,
1421            102
1422        );
1423        assert_eq!(
1424            CUdevice_attribute::HandleTypePosixFileDescriptorSupported as i32,
1425            103
1426        );
1427        assert_eq!(
1428            CUdevice_attribute::HandleTypeWin32HandleSupported as i32,
1429            104
1430        );
1431        assert_eq!(
1432            CUdevice_attribute::HandleTypeWin32KmtHandleSupported as i32,
1433            105
1434        );
1435        assert_eq!(CUdevice_attribute::MaxAccessPolicyWindowSize as i32, 109);
1436        assert_eq!(CUdevice_attribute::ReservedSharedMemoryPerBlock as i32, 111);
1437        assert_eq!(CUdevice_attribute::SparseCudaArraySupported as i32, 112);
1438        assert_eq!(
1439            CUdevice_attribute::ReadOnlyHostRegisterSupported as i32,
1440            113
1441        );
1442        assert_eq!(
1443            CUdevice_attribute::TimelineSemaphoreInteropSupported as i32,
1444            114
1445        );
1446        assert_eq!(CUdevice_attribute::MemoryPoolsSupported as i32, 115);
1447        assert_eq!(CUdevice_attribute::DmaBufSupported as i32, 124);
1448        assert_eq!(CUdevice_attribute::IpcEventSupported as i32, 125);
1449        assert_eq!(CUdevice_attribute::MemSyncDomainCount as i32, 126);
1450        assert_eq!(CUdevice_attribute::TensorMapAccessSupported as i32, 127);
1451        assert_eq!(CUdevice_attribute::ClusterLaunch as i32, 120);
1452        assert_eq!(CUdevice_attribute::UnifiedFunctionPointers as i32, 129);
1453        assert_eq!(CUdevice_attribute::NumaConfig as i32, 130);
1454        assert_eq!(CUdevice_attribute::NumaId as i32, 131);
1455        assert_eq!(CUdevice_attribute::GpuDirectRdmaFabricSupported as i32, 128);
1456        assert_eq!(CUdevice_attribute::MulticastSupported as i32, 132);
1457        assert_eq!(CUdevice_attribute::HostNumaId as i32, 134);
1458    }
1459
1460    #[test]
1461    fn test_jit_option_repr() {
1462        assert_eq!(CUjit_option::MaxRegisters as u32, 0);
1463        assert_eq!(CUjit_option::ThreadsPerBlock as u32, 1);
1464        assert_eq!(CUjit_option::WallTime as u32, 2);
1465        assert_eq!(CUjit_option::InfoLogBuffer as u32, 3);
1466        assert_eq!(CUjit_option::InfoLogBufferSizeBytes as u32, 4);
1467        assert_eq!(CUjit_option::ErrorLogBuffer as u32, 5);
1468        assert_eq!(CUjit_option::ErrorLogBufferSizeBytes as u32, 6);
1469        assert_eq!(CUjit_option::OptimizationLevel as u32, 7);
1470        assert_eq!(CUjit_option::Target as u32, 9);
1471        assert_eq!(CUjit_option::FallbackStrategy as u32, 10);
1472    }
1473
1474    #[test]
1475    #[allow(clippy::assertions_on_constants)]
1476    fn test_error_code_ranges() {
1477        // Basic errors: 1-8
1478        assert!(CUDA_ERROR_INVALID_VALUE < 10);
1479        // Device errors: 100-102
1480        assert!((100..=102).contains(&CUDA_ERROR_NO_DEVICE));
1481        assert!((100..=102).contains(&CUDA_ERROR_INVALID_DEVICE));
1482        assert!((100..=102).contains(&CUDA_ERROR_DEVICE_NOT_LICENSED));
1483        // Image/context errors: 200+
1484        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
1485        // Launch errors: 700+
1486        assert!(CUDA_ERROR_LAUNCH_FAILED >= 700);
1487        assert!(CUDA_ERROR_ILLEGAL_ADDRESS >= 700);
1488        assert!(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES >= 700);
1489        // Stream capture errors: 900+
1490        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
1491        // Unknown is 999
1492        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
1493    }
1494
1495    #[test]
1496    fn test_handle_debug_format() {
1497        let ctx = CUcontext::default();
1498        let debug_str = format!("{ctx:?}");
1499        assert!(debug_str.starts_with("CUcontext("));
1500    }
1501
1502    #[test]
1503    fn test_handle_equality() {
1504        let a = CUcontext::default();
1505        let b = CUcontext::default();
1506        assert_eq!(a, b);
1507    }
1508
1509    #[test]
1510    fn test_new_handle_types_are_pointer_sized() {
1511        assert_eq!(
1512            std::mem::size_of::<CUtexref>(),
1513            std::mem::size_of::<*mut c_void>()
1514        );
1515        assert_eq!(
1516            std::mem::size_of::<CUsurfref>(),
1517            std::mem::size_of::<*mut c_void>()
1518        );
1519        // Bindless texture / surface objects are `unsigned long long` values,
1520        // not opaque pointers (correct on 32-bit targets too).
1521        assert_eq!(
1522            std::mem::size_of::<CUtexObject>(),
1523            std::mem::size_of::<u64>()
1524        );
1525        assert_eq!(
1526            std::mem::size_of::<CUsurfObject>(),
1527            std::mem::size_of::<u64>()
1528        );
1529    }
1530
1531    #[test]
1532    fn test_new_handle_defaults_are_null() {
1533        assert!(CUtexref::default().is_null());
1534        assert!(CUsurfref::default().is_null());
1535        assert!(CUtexObject::default().is_null());
1536        assert!(CUsurfObject::default().is_null());
1537    }
1538
1539    #[test]
1540    fn test_memory_type_enum() {
1541        assert_eq!(CUmemorytype::Host as u32, 1);
1542        assert_eq!(CUmemorytype::Device as u32, 2);
1543        assert_eq!(CUmemorytype::Array as u32, 3);
1544        assert_eq!(CUmemorytype::Unified as u32, 4);
1545    }
1546
1547    #[test]
1548    fn test_pointer_attribute_enum() {
1549        assert_eq!(CUpointer_attribute::Context as u32, 1);
1550        assert_eq!(CUpointer_attribute::MemoryType as u32, 2);
1551        assert_eq!(CUpointer_attribute::DevicePointer as u32, 3);
1552        assert_eq!(CUpointer_attribute::HostPointer as u32, 4);
1553        assert_eq!(CUpointer_attribute::IsManaged as u32, 8);
1554        assert_eq!(CUpointer_attribute::DeviceOrdinal as u32, 9);
1555    }
1556
1557    #[test]
1558    fn test_limit_enum() {
1559        assert_eq!(CUlimit::StackSize as u32, 0);
1560        assert_eq!(CUlimit::PrintfFifoSize as u32, 1);
1561        assert_eq!(CUlimit::MallocHeapSize as u32, 2);
1562        assert_eq!(CUlimit::DevRuntimeSyncDepth as u32, 3);
1563        assert_eq!(CUlimit::DevRuntimePendingLaunchCount as u32, 4);
1564        assert_eq!(CUlimit::MaxL2FetchGranularity as u32, 5);
1565        assert_eq!(CUlimit::PersistingL2CacheSize as u32, 6);
1566    }
1567
1568    #[test]
1569    fn test_function_attribute_enum() {
1570        assert_eq!(CUfunction_attribute::MaxThreadsPerBlock as i32, 0);
1571        assert_eq!(CUfunction_attribute::SharedSizeBytes as i32, 1);
1572        assert_eq!(CUfunction_attribute::NumRegs as i32, 4);
1573        assert_eq!(CUfunction_attribute::PtxVersion as i32, 5);
1574        assert_eq!(CUfunction_attribute::BinaryVersion as i32, 6);
1575        assert_eq!(CUfunction_attribute::MaxDynamicSharedSizeBytes as i32, 8);
1576        assert_eq!(
1577            CUfunction_attribute::PreferredSharedMemoryCarveout as i32,
1578            9
1579        );
1580    }
1581
1582    // ---------------------------------------------------------------------
1583    // VMM / Pool / Linker FFI types — added by Wave 1
1584    // ---------------------------------------------------------------------
1585
1586    #[test]
1587    fn test_link_state_handle_is_pointer_sized_and_default_null() {
1588        assert_eq!(
1589            std::mem::size_of::<CUlinkState>(),
1590            std::mem::size_of::<*mut c_void>()
1591        );
1592        assert!(CUlinkState::default().is_null());
1593    }
1594
1595    #[test]
1596    fn test_mem_generic_allocation_handle_is_u64() {
1597        assert_eq!(
1598            std::mem::size_of::<CUmemGenericAllocationHandle>(),
1599            std::mem::size_of::<u64>()
1600        );
1601        let _: CUmemGenericAllocationHandle = 0u64;
1602    }
1603
1604    #[test]
1605    fn test_mem_location_type_repr() {
1606        assert_eq!(CUmemLocationType::Invalid as u32, 0);
1607        assert_eq!(CUmemLocationType::Device as u32, 1);
1608        assert_eq!(CUmemLocationType::Host as u32, 2);
1609        assert_eq!(CUmemLocationType::HostNuma as u32, 3);
1610        assert_eq!(CUmemLocationType::HostNumaCurrent as u32, 4);
1611    }
1612
1613    #[test]
1614    fn test_mem_allocation_type_repr() {
1615        assert_eq!(CUmemAllocationType::Invalid as u32, 0);
1616        assert_eq!(CUmemAllocationType::Pinned as u32, 1);
1617        assert_eq!(CUmemAllocationType::Max as u32, 0x7fff_ffff);
1618    }
1619
1620    #[test]
1621    fn test_mem_allocation_handle_type_repr() {
1622        assert_eq!(CUmemAllocationHandleType::None as u32, 0);
1623        assert_eq!(CUmemAllocationHandleType::PosixFileDescriptor as u32, 1);
1624        assert_eq!(CUmemAllocationHandleType::Win32 as u32, 2);
1625        assert_eq!(CUmemAllocationHandleType::Win32Kmt as u32, 4);
1626        assert_eq!(CUmemAllocationHandleType::Fabric as u32, 8);
1627    }
1628
1629    #[test]
1630    fn test_mem_access_flags_repr() {
1631        assert_eq!(CUmemAccessFlags::None as u32, 0);
1632        assert_eq!(CUmemAccessFlags::Read as u32, 1);
1633        assert_eq!(CUmemAccessFlags::ReadWrite as u32, 3);
1634        assert_eq!(CUmemAccessFlags::Max as u32, 0x7fff_ffff);
1635    }
1636
1637    #[test]
1638    fn test_mem_location_layout() {
1639        // Two consecutive 4-byte fields → 8 bytes, alignment 4.
1640        assert_eq!(std::mem::size_of::<CUmemLocation>(), 8);
1641        assert_eq!(std::mem::align_of::<CUmemLocation>(), 4);
1642        let loc = CUmemLocation::default();
1643        assert_eq!(loc.loc_type, 0);
1644        assert_eq!(loc.id, 0);
1645    }
1646
1647    #[test]
1648    fn test_mem_access_desc_layout() {
1649        // CUmemLocation (8) + flags (u32 = 4) → 12 bytes, alignment 4.
1650        assert_eq!(std::mem::size_of::<CUmemAccessDesc>(), 12);
1651        assert_eq!(std::mem::align_of::<CUmemAccessDesc>(), 4);
1652        let desc = CUmemAccessDesc::default();
1653        assert_eq!(desc.flags, 0);
1654    }
1655
1656    #[test]
1657    fn test_mem_allocation_prop_default_zeroed() {
1658        let prop = CUmemAllocationProp::default();
1659        assert_eq!(prop.alloc_type, 0);
1660        assert_eq!(prop.requested_handle_types, 0);
1661        assert_eq!(prop.location.loc_type, 0);
1662        assert_eq!(prop.location.id, 0);
1663        assert!(prop.win32_handle_meta_data.is_null());
1664        assert_eq!(prop.alloc_flags, 0);
1665    }
1666
1667    #[test]
1668    fn test_mem_pool_props_default_zeroed_and_padded() {
1669        let props = CUmemPoolProps::default();
1670        assert_eq!(props.alloc_type, 0);
1671        assert_eq!(props.handle_types, 0);
1672        assert_eq!(props.location.loc_type, 0);
1673        assert_eq!(props.location.id, 0);
1674        assert!(props.win32_security_attributes.is_null());
1675        assert_eq!(props.max_size, 0);
1676        assert!(props.reserved.iter().all(|&b| b == 0));
1677        // The CUDA ABI mandates 56 reserved bytes.
1678        assert_eq!(props.reserved.len(), 56);
1679    }
1680
1681    #[test]
1682    fn test_memcpy2d_default_zeroed() {
1683        let m = CUDA_MEMCPY2D::default();
1684        assert_eq!(m.src_x_in_bytes, 0);
1685        assert_eq!(m.src_y, 0);
1686        assert_eq!(m.src_memory_type, 0);
1687        assert!(m.src_host.is_null());
1688        assert_eq!(m.src_device, 0);
1689        assert!(m.src_array.is_null());
1690        assert_eq!(m.src_pitch, 0);
1691        assert_eq!(m.dst_x_in_bytes, 0);
1692        assert_eq!(m.dst_y, 0);
1693        assert_eq!(m.dst_memory_type, 0);
1694        assert!(m.dst_host.is_null());
1695        assert_eq!(m.dst_device, 0);
1696        assert!(m.dst_array.is_null());
1697        assert_eq!(m.dst_pitch, 0);
1698        assert_eq!(m.width_in_bytes, 0);
1699        assert_eq!(m.height, 0);
1700    }
1701}
oxicuda_driver/ffi.rs

oxicuda_driver/
ffi.rs