Skip to main content

oxicuda_driver/
ffi.rs

1//! Raw CUDA Driver API FFI types, constants, and enums.
2//!
3//! This module provides the low-level type definitions that mirror the CUDA Driver API
4//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
5//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
6//! entry points.
7//!
8//! # Safety
9//!
10//! All pointer types in this module are raw pointers intended for FFI use.
11//! They must only be used through the safe wrappers provided by higher-level
12//! modules in `oxicuda-driver`.
13
14use std::ffi::c_void;
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// Core scalar type aliases
19// ---------------------------------------------------------------------------
20
/// Return code from every CUDA Driver API call.
///
/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
/// error code. See the `CUDA_*` constants below for the full catalogue.
///
/// NOTE(review): `cuda.h` declares `CUresult` as a C `enum` (a signed `int`);
/// `u32` is layout-compatible here because every defined code is non-negative
/// and fits in 32 bits.
pub type CUresult = u32;

/// Ordinal identifier for a CUDA-capable device (0-based).
///
/// Matches `CUdevice` (`int`) from `cuda.h`; obtained from `cuDeviceGet`.
pub type CUdevice = i32;

/// Device-side pointer (64-bit address in GPU virtual memory).
///
/// Matches `CUdeviceptr` (`unsigned long long`) from `cuda.h`. This is an
/// address in the device's virtual address space — never dereference it on
/// the host.
pub type CUdeviceptr = u64;
32
33// ---------------------------------------------------------------------------
34// Opaque handle helpers
35// ---------------------------------------------------------------------------
36
/// Declares a `#[repr(transparent)]` newtype over `*mut c_void` that models
/// one opaque CUDA driver handle, together with the trait impls every handle
/// needs: `Send`/`Sync`, a `Debug` that names the handle type, a null
/// `Default`, and an `is_null` convenience check.
macro_rules! define_handle {
    ($(#[$meta:meta])* $name:ident) => {
        $(#[$meta])*
        #[repr(transparent)]
        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
        pub struct $name(pub *mut c_void);

        // SAFETY: CUDA handles are thread-safe when used with proper
        // synchronisation via the driver API.
        unsafe impl Send for $name {}
        unsafe impl Sync for $name {}

        impl $name {
            /// Returns `true` if the handle is null (uninitialised).
            #[inline]
            pub fn is_null(self) -> bool {
                self.0.is_null()
            }
        }

        impl Default for $name {
            /// The null handle, i.e. "no object".
            fn default() -> Self {
                Self(std::ptr::null_mut())
            }
        }

        impl fmt::Debug for $name {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Render as `TypeName(0x…)` so log lines identify which
                // kind of handle they refer to.
                f.write_str(stringify!($name))?;
                write!(f, "({:p})", self.0)
            }
        }
    };
}
70
71// ---------------------------------------------------------------------------
72// Handle types
73// ---------------------------------------------------------------------------
74
define_handle! {
    /// Opaque handle to a CUDA context.
    CUcontext
}

define_handle! {
    /// Opaque handle to a loaded CUDA module (PTX / cubin).
    CUmodule
}

define_handle! {
    /// Opaque handle to a CUDA kernel function within a module.
    CUfunction
}

define_handle! {
    /// Opaque handle to a CUDA stream (command queue).
    CUstream
}

define_handle! {
    /// Opaque handle to a CUDA event (used for timing and synchronisation).
    CUevent
}

define_handle! {
    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
    CUmemoryPool
}

define_handle! {
    /// Opaque handle to a CUDA texture reference (legacy API).
    CUtexref
}

define_handle! {
    /// Opaque handle to a CUDA surface reference (legacy API).
    CUsurfref
}

define_handle! {
    /// Opaque handle to a CUDA texture object (modern bindless API).
    ///
    /// NOTE(review): `cuda.h` declares `CUtexObject` as a 64-bit integer
    /// (`cuuint64_t`), not a pointer. A pointer-sized wrapper is
    /// layout-compatible on 64-bit targets, but the value is an opaque id,
    /// not an address — confirm this representation is intentional.
    CUtexObject
}

define_handle! {
    /// Opaque handle to a CUDA surface object (modern bindless API).
    ///
    /// NOTE(review): like `CUtexObject`, `cuda.h` declares `CUsurfObject`
    /// as `cuuint64_t` rather than a pointer — confirm the representation.
    CUsurfObject
}

define_handle! {
    /// Opaque handle to a CUDA kernel (library-based kernel API).
    ///
    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
    /// belongs to.
    ///
    /// NOTE(review): the library/kernel API (`cuLibraryLoad*`, `CUkernel`)
    /// exists since CUDA 12.0; verify whether the "12.8+" claim elsewhere
    /// in this file reflects a real requirement of this crate.
    CUkernel
}

define_handle! {
    /// Opaque handle to a CUDA library (JIT library API).
    ///
    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
    /// library that contains a given kernel.
    CUlibrary
}

define_handle! {
    /// Opaque handle to an NVLink multicast object.
    ///
    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
    /// functions to manage NVLink multicast memory regions across devices.
    ///
    /// NOTE(review): the multicast API was introduced in CUDA 12.1, not
    /// 12.8 — confirm the intended minimum driver version.
    CUmulticastObject
}
148
149// =========================================================================
150// CUmemorytype — memory type identifiers
151// =========================================================================
152
/// Memory type identifiers returned by pointer attribute queries.
///
/// Discriminants mirror `CUmemorytype` from `cuda.h`
/// (`CU_MEMORYTYPE_HOST` = 1 … `CU_MEMORYTYPE_UNIFIED` = 4).
///
/// NOTE(review): these values arrive from the driver as raw `u32`s; convert
/// with an explicit `match` on the integer rather than `transmute`, since the
/// enum is `#[non_exhaustive]` and future drivers may return new values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemorytype {
    /// Host (system) memory.
    Host = 1,
    /// Device (GPU) memory.
    Device = 2,
    /// Array memory.
    Array = 3,
    /// Unified (managed) memory.
    Unified = 4,
}
167
168// =========================================================================
169// CUpointer_attribute — pointer attribute query keys
170// =========================================================================
171
/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
///
/// Discriminants mirror the `CU_POINTER_ATTRIBUTE_*` values from `cuda.h`.
/// The sequence is deliberately sparse: attributes 5-7 (`P2P_TOKENS`,
/// `SYNC_MEMOPS`, `BUFFER_ID`) are not exposed here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUpointer_attribute {
    /// Query the CUDA context associated with a pointer.
    Context = 1,
    /// Query the memory type (host / device / unified) of a pointer.
    MemoryType = 2,
    /// Query the device pointer corresponding to a host pointer.
    DevicePointer = 3,
    /// Query the host pointer corresponding to a device pointer.
    HostPointer = 4,
    /// Query whether the memory is managed (unified).
    ///
    /// Fixed: `cuda.h` defines `CU_POINTER_ATTRIBUTE_IS_MANAGED` as 8; the
    /// previous value (9) would have queried the device ordinal instead.
    IsManaged = 8,
    /// Query the device ordinal for the pointer.
    ///
    /// Fixed: `cuda.h` defines `CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL` as 9,
    /// not 10 (10 is `IS_LEGACY_CUDA_IPC_CAPABLE`).
    DeviceOrdinal = 9,
}
191
192// =========================================================================
193// CUlimit — context limit identifiers
194// =========================================================================
195
/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
///
/// Discriminants mirror the `CU_LIMIT_*` values from `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUlimit {
    /// Stack size (bytes) for each GPU thread.
    StackSize = 0,
    /// Size (bytes) of the device-side `printf` FIFO.
    PrintfFifoSize = 1,
    /// Size (bytes) of the heap used by `malloc()` on the device.
    MallocHeapSize = 2,
    /// Maximum nesting depth of a device runtime launch.
    ///
    /// NOTE(review): deprecated alongside the CDP1 device-side
    /// synchronisation removal in CUDA 12 — confirm it should remain exposed.
    DevRuntimeSyncDepth = 3,
    /// Maximum number of outstanding device runtime launches.
    DevRuntimePendingLaunchCount = 4,
    /// L2 cache fetch granularity (bytes); a hint the driver may ignore.
    MaxL2FetchGranularity = 5,
    /// Maximum persisting L2 cache size (bytes).
    PersistingL2CacheSize = 6,
}
216
217// =========================================================================
218// CUfunction_attribute — function attribute query keys
219// =========================================================================
220
/// Function attribute identifiers passed to `cuFuncGetAttribute`.
///
/// Discriminants mirror the `CU_FUNC_ATTRIBUTE_*` values from `cuda.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUfunction_attribute {
    /// Maximum threads per block for this function.
    MaxThreadsPerBlock = 0,
    /// Statically allocated shared memory used by this function (bytes).
    SharedSizeBytes = 1,
    /// Size of user-allocated constant memory (bytes).
    ConstSizeBytes = 2,
    /// Size of local memory used by each thread (bytes).
    LocalSizeBytes = 3,
    /// Number of registers used by each thread.
    NumRegs = 4,
    /// PTX virtual architecture version.
    PtxVersion = 5,
    /// Binary architecture version.
    BinaryVersion = 6,
    /// Whether the function was compiled with the `-Xptxas --dlcm=ca`
    /// cache-mode option. (The previous doc — "has been cached" — was wrong.)
    CacheModeCa = 7,
    /// Maximum dynamic shared memory size (bytes); settable via
    /// `cuFuncSetAttribute`.
    MaxDynamicSharedSizeBytes = 8,
    /// Preferred shared memory carve-out, as a percentage of the maximum.
    PreferredSharedMemoryCarveout = 9,
}
248
249// =========================================================================
250// CUresult constants — every documented CUDA Driver API error code
251// =========================================================================
252
/// The API call returned with no errors.
pub const CUDA_SUCCESS: CUresult = 0;

/// One or more parameters passed to the API call are not acceptable.
pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;

/// The API call failed because it was unable to allocate enough memory.
pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;

/// The CUDA driver has not been initialised via `cuInit`.
pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;

/// The CUDA driver is shutting down.
pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;

/// Profiler is not initialised for this run.
pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;

/// (Deprecated) Profiler not started.
pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;

/// (Deprecated) Profiler already started.
pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;

/// (Deprecated) Profiler already stopped.
pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;

/// Stub library loaded instead of the real driver.
pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;

/// The requested device is currently unavailable (e.g. held by another
/// client in `EXCLUSIVE_PROCESS` or `PROHIBITED` compute mode).
/// (Previous doc wrongly described a device-side assert — that is
/// `CUDA_ERROR_ASSERT`, 710.)
pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;

/// No CUDA-capable device is detected.
pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;

/// The device ordinal supplied is out of range.
pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;

/// The device does not have a valid licence.
pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;

/// The PTX or cubin image is invalid.
pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;

/// The supplied context is not valid.
pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;

/// (Deprecated) Context already current.
pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;

/// A map or register operation has failed.
pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;

/// An unmap or unregister operation has failed.
pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;

/// The specified array is currently mapped.
pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;

/// The resource is already mapped.
pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;

/// There is no kernel image available for execution on the device.
pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;

/// A resource has already been acquired.
pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;

/// The resource is not mapped.
pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;

/// A mapped resource is not available for access as an array.
pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;

/// A mapped resource is not available for access as a pointer.
pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;

/// An uncorrectable ECC error was detected.
pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;

/// The `CUlimit` passed to the API call is not supported by the active
/// device. (Previous doc — "a PTX JIT limit has been reached" — was wrong.)
pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;

/// The context already has work from another thread bound to it.
pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;

/// Peer access is not supported across the given devices.
pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;

/// A PTX JIT compilation failed, or the PTX is otherwise unusable.
pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;

/// Invalid graphics context.
pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;

/// An uncorrectable NVLink error was detected.
pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;

/// JIT compiler not found.
pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;

/// Unsupported PTX version.
pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;

/// JIT compilation disabled.
pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;

/// Unsupported exec-affinity type.
pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;

/// Unsupported device-side synchronisation on this device.
pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;

/// The requested source is invalid.
pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;

/// The named file was not found.
pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;

/// A shared-object symbol lookup failed.
pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;

/// The shared-object init function failed.
pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;

/// An OS call failed.
pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;

/// The supplied handle is invalid.
pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;

/// The requested resource is in an illegal state.
pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;

/// An attempt was made to introspect an object in a way that would discard
/// semantically important information. (Previous doc about "loss-less
/// compression buffers" did not match the driver documentation.)
pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;

/// A named symbol was not found.
pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;

/// The operation is not ready (asynchronous).
pub const CUDA_ERROR_NOT_READY: CUresult = 600;

/// An illegal memory address was encountered.
pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;

/// The kernel launch uses too many resources (registers / shared memory).
pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;

/// The kernel launch exceeded the time-out enforced by the driver.
pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;

/// A launch did not occur on a compatible texturing mode.
pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;

/// Peer access already enabled.
pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;

/// Peer access has not been enabled.
pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;

/// The primary context has already been initialised.
pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;

/// The context is being destroyed.
pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;

/// A device-side assert was triggered during kernel execution.
/// (Previous doc said "64-bit device assertion"; the width is irrelevant.)
pub const CUDA_ERROR_ASSERT: CUresult = 710;

/// Hardware resources to enable peer access are exhausted.
pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;

/// The host-side memory region is already registered.
pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;

/// The host-side memory region is not registered.
pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;

/// Hardware stack overflow on the device.
pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;

/// Illegal instruction encountered on the device.
pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;

/// Misaligned address on the device.
pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;

/// Invalid address space.
pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;

/// Invalid program counter on the device.
pub const CUDA_ERROR_INVALID_PC: CUresult = 718;

/// The kernel launch failed.
pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;

/// Cooperative launch is too large for the device/kernel.
pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;

/// The API call is not permitted in the active context.
pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;

/// The API call is not supported by the current driver/device combination.
pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;

/// System not ready for CUDA operations.
pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;

/// System driver mismatch.
pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;

/// The system was upgraded to run with forward compatibility, but the
/// visible hardware does not support that configuration. (Previous doc
/// about "old-style contexts" described a different condition.)
pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;

/// MPS connection failed.
pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;

/// MPS RPC failure.
pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;

/// MPS server not ready.
pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;

/// MPS maximum clients reached.
pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;

/// MPS maximum connections reached.
pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;

/// MPS client terminated.
pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;

/// CDP not supported.
pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;

/// CDP version mismatch.
pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;

/// Stream capture unsupported.
pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;

/// Stream capture invalidated.
pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;

/// Stream capture merge not permitted.
pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;

/// Stream capture unmatched.
pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;

/// Stream capture unjoined.
pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;

/// Stream capture isolation violation.
pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;

/// Implicit stream in graph capture.
pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;

/// Captured event error.
pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;

/// Stream capture wrong thread.
pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;

/// The async operation timed out.
pub const CUDA_ERROR_TIMEOUT: CUresult = 909;

/// The graph update failed.
pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;

/// External device error.
pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;

/// Invalid cluster size.
pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;

/// Function not loaded.
pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;

/// Invalid resource type.
pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;

/// Invalid resource configuration.
pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;

/// An unknown internal error occurred.
pub const CUDA_ERROR_UNKNOWN: CUresult = 999;
543
544// =========================================================================
545// CUdevice_attribute — device property query keys
546// =========================================================================
547
/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
///
/// Discriminants are intended to mirror the `CU_DEVICE_ATTRIBUTE_*` values
/// from `cuda.h`.
///
/// NOTE(review): several discriminants in the 44-80 and 110-131 ranges do
/// NOT match recent `cuda.h` (e.g. `MAXIMUM_TEXTURE2D_GATHER_WIDTH` is 45,
/// `MAXIMUM_SURFACE1D_WIDTH` is 55, `STREAM_PRIORITIES_SUPPORTED` is 78,
/// `IPC_EVENT_SUPPORTED` is 125, `UNIFIED_FUNCTION_POINTERS` is 129).
/// Because these are raw FFI query keys, a wrong value silently queries a
/// different property — audit every flagged value against the header.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUdevice_attribute {
    /// Maximum number of threads per block.
    MaxThreadsPerBlock = 1,
    /// Maximum x-dimension of a block.
    MaxBlockDimX = 2,
    /// Maximum y-dimension of a block.
    MaxBlockDimY = 3,
    /// Maximum z-dimension of a block.
    MaxBlockDimZ = 4,
    /// Maximum x-dimension of a grid.
    MaxGridDimX = 5,
    /// Maximum y-dimension of a grid.
    MaxGridDimY = 6,
    /// Maximum z-dimension of a grid.
    MaxGridDimZ = 7,
    /// Maximum shared memory available per block (bytes).
    MaxSharedMemoryPerBlock = 8,
    /// Total amount of constant memory on the device (bytes).
    TotalConstantMemory = 9,
    /// Warp size in threads.
    WarpSize = 10,
    /// Maximum pitch allowed by memory copies (bytes).
    MaxPitch = 11,
    /// Maximum number of 32-bit registers per block.
    MaxRegistersPerBlock = 12,
    /// Peak clock frequency in kHz.
    ClockRate = 13,
    /// Alignment requirement for textures.
    TextureAlignment = 14,
    /// Device can possibly copy memory and execute a kernel concurrently.
    GpuOverlap = 15,
    /// Number of multiprocessors on the device.
    MultiprocessorCount = 16,
    /// Whether there is a run-time limit on kernels.
    KernelExecTimeout = 17,
    /// Device is integrated (shares host memory).
    Integrated = 18,
    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
    CanMapHostMemory = 19,
    /// Compute mode: default, exclusive, prohibited, etc.
    ComputeMode = 20,
    /// Maximum 1D texture width.
    MaxTexture1DWidth = 21,
    /// Maximum 2D texture width.
    MaxTexture2DWidth = 22,
    /// Maximum 2D texture height.
    MaxTexture2DHeight = 23,
    /// Maximum 3D texture width.
    MaxTexture3DWidth = 24,
    /// Maximum 3D texture height.
    MaxTexture3DHeight = 25,
    /// Maximum 3D texture depth.
    MaxTexture3DDepth = 26,
    /// Maximum 2D layered texture width.
    MaxTexture2DLayeredWidth = 27,
    /// Maximum 2D layered texture height.
    MaxTexture2DLayeredHeight = 28,
    /// Maximum layers in a 2D layered texture.
    MaxTexture2DLayeredLayers = 29,
    /// Alignment requirement for surfaces.
    SurfaceAlignment = 30,
    /// Device can execute multiple kernels concurrently.
    ConcurrentKernels = 31,
    /// Device supports ECC memory.
    EccEnabled = 32,
    /// PCI bus ID of the device.
    PciBusId = 33,
    /// PCI device ID of the device.
    PciDeviceId = 34,
    /// Device is using TCC (Tesla Compute Cluster) driver model.
    TccDriver = 35,
    /// Peak memory clock frequency in kHz.
    MemoryClockRate = 36,
    /// Global memory bus width in bits.
    GlobalMemoryBusWidth = 37,
    /// Size of L2 cache in bytes.
    L2CacheSize = 38,
    /// Maximum resident threads per multiprocessor.
    MaxThreadsPerMultiprocessor = 39,
    /// Number of asynchronous engines.
    AsyncEngineCount = 40,
    /// Device shares a unified address space with the host.
    UnifiedAddressing = 41,
    /// Maximum 1D layered texture width.
    MaxTexture1DLayeredWidth = 42,
    /// Maximum layers in a 1D layered texture.
    MaxTexture1DLayeredLayers = 43,
    /// Maximum 2D gather-texture width.
    ///
    /// NOTE(review): `cuda.h` has `MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45`
    /// (44 is the deprecated `CAN_TEX2D_GATHER`) — verify this value.
    MaxTexture2DGatherWidth = 44,
    /// Maximum 2D gather-texture height.
    ///
    /// NOTE(review): `cuda.h` has `MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46`.
    MaxTexture2DGatherHeight = 45,
    /// Alternate maximum 3D texture width.
    MaxTexture3DWidthAlt = 47,
    /// Alternate maximum 3D texture height.
    MaxTexture3DHeightAlt = 48,
    /// Alternate maximum 3D texture depth.
    MaxTexture3DDepthAlt = 49,
    /// PCI domain ID.
    PciDomainId = 50,
    /// Texture pitch alignment.
    TexturePitchAlignment = 51,
    /// NOTE(review): in `cuda.h` value 52 is
    /// `MAXIMUM_TEXTURECUBEMAP_WIDTH`, not a 1D-mipmapped attribute; this
    /// name/value pairing (and the stray `2` suffix) looks wrong — verify.
    MaxTexture1DMipmappedWidth2 = 52,
    /// Maximum width for a cubemap texture.
    ///
    /// NOTE(review): the cubemap/surface/linear-texture values from here
    /// through 74 appear shifted by +2 relative to `cuda.h`
    /// (e.g. `MAXIMUM_TEXTURECUBEMAP_WIDTH = 52`,
    /// `MAXIMUM_SURFACE1D_WIDTH = 55`) — audit this whole range.
    MaxTextureCubemapWidth = 54,
    /// Maximum width for a cubemap layered texture.
    MaxTextureCubemapLayeredWidth = 55,
    /// Maximum layers in a cubemap layered texture.
    MaxTextureCubemapLayeredLayers = 56,
    /// Maximum 1D surface width.
    MaxSurface1DWidth = 57,
    /// Maximum 2D surface width.
    MaxSurface2DWidth = 58,
    /// Maximum 2D surface height.
    MaxSurface2DHeight = 59,
    /// Maximum 3D surface width.
    MaxSurface3DWidth = 60,
    /// Maximum 3D surface height.
    MaxSurface3DHeight = 61,
    /// Maximum 3D surface depth.
    MaxSurface3DDepth = 62,
    /// Maximum cubemap surface width.
    MaxSurfaceCubemapWidth = 63,
    /// Maximum 1D layered surface width.
    MaxSurface1DLayeredWidth = 64,
    /// Maximum layers in a 1D layered surface.
    MaxSurface1DLayeredLayers = 65,
    /// Maximum 2D layered surface width.
    MaxSurface2DLayeredWidth = 66,
    /// Maximum 2D layered surface height.
    MaxSurface2DLayeredHeight = 67,
    /// Maximum layers in a 2D layered surface.
    MaxSurface2DLayeredLayers = 68,
    /// Maximum cubemap layered surface width.
    MaxSurfaceCubemapLayeredWidth = 69,
    /// Maximum layers in a cubemap layered surface.
    MaxSurfaceCubemapLayeredLayers = 70,
    /// Maximum 1D linear texture width (deprecated).
    MaxTexture1DLinearWidth = 71,
    /// Maximum 2D linear texture width.
    MaxTexture2DLinearWidth = 72,
    /// Maximum 2D linear texture height.
    MaxTexture2DLinearHeight = 73,
    /// Maximum 2D linear texture pitch (bytes).
    MaxTexture2DLinearPitch = 74,
    /// Major compute capability version number.
    ComputeCapabilityMajor = 75,
    /// Minor compute capability version number.
    ComputeCapabilityMinor = 76,
    /// Maximum mipmapped 2D texture width.
    ///
    /// NOTE(review): `cuda.h` has `MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73`
    /// and `HEIGHT = 74` — verify 77/78 here.
    MaxTexture2DMipmappedWidth = 77,
    /// Maximum mipmapped 2D texture height.
    MaxTexture2DMipmappedHeight = 78,
    /// Maximum mipmapped 1D texture width.
    ///
    /// NOTE(review): `cuda.h` has `MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77`.
    MaxTexture1DMipmappedWidth = 79,
    /// Device supports stream priorities.
    ///
    /// NOTE(review): `cuda.h` has `STREAM_PRIORITIES_SUPPORTED = 78`;
    /// values 81+ below do line up with the header again.
    StreamPrioritiesSupported = 80,
    /// Maximum shared memory per multiprocessor (bytes).
    MaxSharedMemoryPerMultiprocessor = 81,
    /// Maximum registers per multiprocessor.
    MaxRegistersPerMultiprocessor = 82,
    /// Device supports managed memory.
    ManagedMemory = 83,
    /// Device is on a multi-GPU board.
    IsMultiGpuBoard = 84,
    /// Unique identifier for the multi-GPU board group.
    MultiGpuBoardGroupId = 85,
    /// The link between the device and the host supports native atomic
    /// operations. (Previous doc about "float operations" was wrong.)
    HostNativeAtomicSupported = 86,
    /// Ratio of single-to-double precision performance.
    SingleToDoublePrecisionPerfRatio = 87,
    /// Device supports pageable memory access.
    PageableMemoryAccess = 88,
    /// Device can coherently access managed memory concurrently with the
    /// CPU. (Previous doc described attribute 91 instead.)
    ConcurrentManagedAccess = 89,
    /// Device supports compute preemption.
    ComputePreemptionSupported = 90,
    /// Device can access host registered memory at the same virtual address
    /// as the CPU. (Previous doc described pageable access, attribute 88.)
    CanUseHostPointerForRegisteredMem = 91,
    /// Reserved attribute (CUDA internal, value 92).
    Reserved92 = 92,
    /// Reserved attribute (CUDA internal, value 93).
    Reserved93 = 93,
    /// Reserved attribute (CUDA internal, value 94).
    Reserved94 = 94,
    /// Device supports cooperative kernel launches.
    CooperativeLaunch = 95,
    /// Device supports cooperative kernel launches across multiple GPUs.
    CooperativeMultiDeviceLaunch = 96,
    /// Maximum optin shared memory per block.
    MaxSharedMemoryPerBlockOptin = 97,
    /// Device supports flushing of outstanding remote writes.
    CanFlushRemoteWrites = 98,
    /// Device supports host-side memory-register functions.
    HostRegisterSupported = 99,
    /// Device supports pageable memory access using host page tables.
    PageableMemoryAccessUsesHostPageTables = 100,
    /// Device supports direct access to managed memory on the host.
    DirectManagedMemAccessFromHost = 101,
    /// Device supports virtual memory management APIs.
    VirtualMemoryManagementSupported = 102,
    /// Device supports handle-type POSIX file descriptors for IPC.
    HandleTypePosixFileDescriptorSupported = 103,
    /// Device supports handle-type Win32 handles for IPC.
    HandleTypeWin32HandleSupported = 104,
    /// Device supports handle-type Win32 KMT handles for IPC.
    HandleTypeWin32KmtHandleSupported = 105,
    /// Maximum blocks per multiprocessor.
    MaxBlocksPerMultiprocessor = 106,
    /// Device supports generic compression for memory.
    GenericCompressionSupported = 107,
    /// Maximum persisting L2 cache size (bytes).
    MaxPersistingL2CacheSize = 108,
    /// Maximum access-policy window size for L2 cache.
    MaxAccessPolicyWindowSize = 109,
    /// Device supports GPUDirect RDMA on memory allocated through the CUDA
    /// VMM APIs. (Previous doc about `cuMemRangeGetAttribute` was wrong.)
    GpuDirectRdmaWithCudaVmmSupported = 110,
    /// NOTE(review): this name duplicates attribute 109 and the old doc
    /// described `cuMemGetInfo`; in `cuda.h` value 111 is
    /// `RESERVED_SHARED_MEMORY_PER_BLOCK` — verify name and value.
    AccessPolicyMaxWindowSize = 111,
    /// Reserved range of shared memory per SM (bytes).
    ///
    /// NOTE(review): `cuda.h` has `RESERVED_SHARED_MEMORY_PER_BLOCK = 111`
    /// (112 is `SPARSE_CUDA_ARRAY_SUPPORTED`) — verify.
    ReservedSharedMemoryPerBlock = 112,
    /// Device supports timeline semaphore interop.
    ///
    /// NOTE(review): `cuda.h` has
    /// `TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114` (113 is
    /// `READ_ONLY_HOST_REGISTER_SUPPORTED`) — verify.
    TimelineSemaphoreInteropSupported = 113,
    /// Device supports memory pools (`cudaMallocAsync`).
    MemoryPoolsSupported = 115,
    /// GPU direct RDMA is supported.
    GpuDirectRdmaSupported = 116,
    /// GPU direct RDMA flush-writes order.
    GpuDirectRdmaFlushWritesOptions = 117,
    /// GPU direct RDMA writes ordering.
    GpuDirectRdmaWritesOrdering = 118,
    /// Memory pool supported handle types.
    MemoryPoolSupportedHandleTypes = 119,
    /// Device supports cluster launch.
    ClusterLaunch = 120,
    /// Deferred mapping CUDA array supported.
    DeferredMappingCudaArraySupported = 121,
    /// Device supports IPC event handles.
    ///
    /// NOTE(review): `cuda.h` has `IPC_EVENT_SUPPORTED = 125`; the values
    /// from here through 131 appear shifted relative to the header — audit
    /// this whole range.
    IpcEventSupported = 122,
    /// Device supports mem-sync domain count.
    MemSyncDomainCount = 123,
    /// Device supports tensor-map access to data.
    TensorMapAccessSupported = 124,
    /// Unified function pointers supported.
    UnifiedFunctionPointers = 125,
    /// NUMA config.
    NumaConfig = 127,
    /// NUMA id.
    NumaId = 128,
    /// NOTE(review): the previous doc comment here was garbled (fragments
    /// about multicast and cooperative launch), and this name/value pair
    /// does not correspond to any `CU_DEVICE_ATTRIBUTE_*` entry (`cuda.h`
    /// value 129 is `UNIFIED_FUNCTION_POINTERS`) — verify before use.
    MaxTimelineSemaphoreInteropSupported = 129,
    /// Device supports memory sync domain operations.
    ///
    /// NOTE(review): no `MEM_SYNC_DOMAIN_SUPPORTED` attribute exists in
    /// `cuda.h` (130 is `NUMA_CONFIG`) — verify.
    MemSyncDomainSupported = 130,
    /// Device supports GPU-Direct Fabric.
    ///
    /// NOTE(review): `cuda.h` has `HANDLE_TYPE_FABRIC_SUPPORTED = 128`
    /// (131 is `NUMA_ID`) — verify.
    GpuDirectRdmaFabricSupported = 131,
    /// Device supports multicast.
    MulticastSupported = 132,
    /// Device supports MPS features.
    MpsEnabled = 133,
    /// Host-NUMA identifier.
    HostNumaId = 134,
}
817
818// =========================================================================
819// CUjit_option — options for the JIT compiler
820// =========================================================================
821
/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
///
/// Discriminants mirror the `CU_JIT_*` values from `cuda.h`. Option values
/// are passed alongside these keys as an array of untyped pointers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUjit_option {
    /// Maximum number of registers that a thread may use.
    MaxRegisters = 0,
    /// Number of threads per block for the JIT target.
    ThreadsPerBlock = 1,
    /// Output option: overwritten with the wall-clock time (ms) spent in
    /// the compiler and linker.
    WallTime = 2,
    /// Pointer to a buffer for info log output (pair with
    /// `InfoLogBufferSizeBytes`).
    InfoLogBuffer = 3,
    /// Size (bytes) of the info-log buffer.
    InfoLogBufferSizeBytes = 4,
    /// Pointer to a buffer for error log output (pair with
    /// `ErrorLogBufferSizeBytes`).
    ErrorLogBuffer = 5,
    /// Size (bytes) of the error-log buffer.
    ErrorLogBufferSizeBytes = 6,
    /// Optimisation level (0-4).
    OptimizationLevel = 7,
    /// Determines the target based on the current attached context.
    TargetFromCuContext = 8,
    /// Specific compute target (sm_XX).
    Target = 9,
    /// Fallback strategy when exact match is not found.
    FallbackStrategy = 10,
    /// Specifies whether to generate debug info.
    GenerateDebugInfo = 11,
    /// Generate verbose log messages.
    LogVerbose = 12,
    /// Generate line-number information.
    GenerateLineInfo = 13,
    /// Cache mode (on / off).
    CacheMode = 14,
    /// (Internal) New SM3X option.
    Sm3xOpt = 15,
    /// Fast compile flag.
    FastCompile = 16,
    /// Global symbol names.
    GlobalSymbolNames = 17,
    /// Global symbol addresses.
    GlobalSymbolAddresses = 18,
    /// Number of global symbols.
    GlobalSymbolCount = 19,
    /// LTO flag.
    ///
    /// NOTE(review): deprecated in CUDA 12 (use nvJitLink instead) —
    /// confirm it should stay exposed.
    Lto = 20,
    /// FTZ (flush-to-zero) flag.
    Ftz = 21,
    /// Prec-div flag.
    PrecDiv = 22,
    /// Prec-sqrt flag.
    PrecSqrt = 23,
    /// FMA flag.
    Fma = 24,
    /// Referenced kernel names.
    ReferencedKernelNames = 25,
    /// Referenced kernel count.
    ReferencedKernelCount = 26,
    /// Referenced variable names.
    ReferencedVariableNames = 27,
    /// Referenced variable count.
    ReferencedVariableCount = 28,
    /// Optimise unused device variables.
    OptimizeUnusedDeviceVariables = 29,
    /// Position-independent code.
    PositionIndependentCode = 30,
}
891
892// =========================================================================
893// CUjitInputType — input types for the linker
894// =========================================================================
895
/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
///
/// Mirrors `CUjitInputType_enum` in `cuda.h`. The discriminants must match
/// the C enum exactly — they are passed straight through to the driver's
/// linker API. In `cuda.h` the enum starts at `CU_JIT_INPUT_CUBIN = 0` and
/// the remaining values follow in declaration order; the previous values
/// here were shifted by one (e.g. `Cubin = 2` would have been interpreted
/// by the driver as `CU_JIT_INPUT_FATBINARY`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUjitInputType {
    /// Compiled device code (cubin) — `CU_JIT_INPUT_CUBIN`.
    Cubin = 0,
    /// PTX source code — `CU_JIT_INPUT_PTX`.
    Ptx = 1,
    /// Fat binary bundle — `CU_JIT_INPUT_FATBINARY`.
    Fatbin = 2,
    /// Relocatable device object — `CU_JIT_INPUT_OBJECT`.
    Object = 3,
    /// Device code library — `CU_JIT_INPUT_LIBRARY`.
    Library = 4,
}
912
913// =========================================================================
914// Stream creation flags
915// =========================================================================
916
/// Default stream creation flag (implicit synchronisation with the NULL stream).
///
/// Mirrors `CU_STREAM_DEFAULT` in `CUstream_flags_enum`.
pub const CU_STREAM_DEFAULT: u32 = 0;

/// Stream does not synchronise with the NULL stream.
///
/// Mirrors `CU_STREAM_NON_BLOCKING` in `CUstream_flags_enum`.
pub const CU_STREAM_NON_BLOCKING: u32 = 1;

// =========================================================================
// Stream-ordered memory pool attributes (CUDA 11.2+)
// =========================================================================
//
// These values mirror `CUmemPool_attribute_enum` and are passed to
// `cuMemPoolSetAttribute` / `cuMemPoolGetAttribute`.

/// Pool reuse policy: follow event dependencies before reusing a freed block.
pub const CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: u32 = 1;

/// Pool reuse policy: allow opportunistic reuse without ordering guarantees.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: u32 = 2;

/// Pool reuse policy: allow the driver to insert internal dependencies for reuse.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: u32 = 3;

/// Release threshold (bytes): memory returned to OS when usage drops below this.
pub const CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: u32 = 4;

/// Current reserved memory in bytes (read-only).
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: u32 = 5;

/// High-water mark of reserved memory in bytes (resettable).
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: u32 = 6;

/// Current used memory in bytes (read-only).
pub const CU_MEMPOOL_ATTR_USED_MEM_CURRENT: u32 = 7;

/// High-water mark of used memory in bytes (resettable).
pub const CU_MEMPOOL_ATTR_USED_MEM_HIGH: u32 = 8;

// =========================================================================
// Event creation flags
// =========================================================================
//
// OR-able bit flags mirroring `CUevent_flags_enum`, passed to
// `cuEventCreate`.

/// Default event creation flag.
pub const CU_EVENT_DEFAULT: u32 = 0;

/// Event uses blocking synchronisation.
pub const CU_EVENT_BLOCKING_SYNC: u32 = 1;

/// Event does not record timing data (faster).
pub const CU_EVENT_DISABLE_TIMING: u32 = 2;

/// Event may be used as an interprocess event.
/// Per the driver API, this must be combined with `CU_EVENT_DISABLE_TIMING`.
pub const CU_EVENT_INTERPROCESS: u32 = 4;

// =========================================================================
// Memory-attach flags (for managed / mapped memory)
// =========================================================================
//
// These values mirror `CUmemAttach_flags_enum`, used with
// `cuMemAllocManaged` and `cuStreamAttachMemAsync`.

/// Memory is accessible from any stream on any device.
pub const CU_MEM_ATTACH_GLOBAL: u32 = 1;

/// Memory is initially accessible only from the allocating stream/host.
pub const CU_MEM_ATTACH_HOST: u32 = 2;

/// Memory is initially accessible only from a single stream.
pub const CU_MEM_ATTACH_SINGLE: u32 = 4;

// =========================================================================
// cuMemHostRegister flags
// =========================================================================
//
// OR-able bit flags for `cuMemHostRegister`.

/// Registered memory is portable across CUDA contexts.
pub const CU_MEMHOSTREGISTER_PORTABLE: u32 = 0x01;

/// Registered memory is mapped into the device address space.
pub const CU_MEMHOSTREGISTER_DEVICEMAP: u32 = 0x02;

/// Pointer is to I/O memory (not system RAM).
pub const CU_MEMHOSTREGISTER_IOMEMORY: u32 = 0x04;

/// Registered memory will not be written by the GPU (read-only).
pub const CU_MEMHOSTREGISTER_READ_ONLY: u32 = 0x08;
995
996// =========================================================================
997// cuPointerGetAttribute attribute codes
998// =========================================================================
999
/// Query the CUDA context associated with a pointer.
///
/// Mirrors `CU_POINTER_ATTRIBUTE_CONTEXT` in `CUpointer_attribute_enum`.
pub const CU_POINTER_ATTRIBUTE_CONTEXT: u32 = 1;

/// Query the memory type (host / device / unified) of a pointer.
pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: u32 = 2;

/// Query the device pointer corresponding to a host pointer.
pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: u32 = 3;

/// Query the host pointer corresponding to a device pointer.
pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: u32 = 4;

/// Query whether the memory is managed (unified).
///
/// In `cuda.h`, `CU_POINTER_ATTRIBUTE_IS_MANAGED` is 8 — value 7 is
/// `CU_POINTER_ATTRIBUTE_BUFFER_ID`, so the previous value of 7 would have
/// queried the buffer id instead of the managed flag.
pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: u32 = 8;
1014
1015// =========================================================================
1016// CU_MEMORYTYPE values (returned by pointer attribute queries)
1017// =========================================================================
1018
/// Host (system) memory.
///
/// Mirrors `CU_MEMORYTYPE_HOST` in `CUmemorytype_enum`.
pub const CU_MEMORYTYPE_HOST: u32 = 1;

/// Device (GPU) memory.
pub const CU_MEMORYTYPE_DEVICE: u32 = 2;

/// Array memory.
pub const CU_MEMORYTYPE_ARRAY: u32 = 3;

/// Unified (managed) memory.
pub const CU_MEMORYTYPE_UNIFIED: u32 = 4;

// =========================================================================
// Context scheduling flags
// =========================================================================
//
// These values mirror `CUctx_flags_enum`, passed to `cuCtxCreate` and
// related calls. The scheduling modes are mutually exclusive; the rest are
// OR-able bit flags.

/// The driver picks the most appropriate scheduling mode.
pub const CU_CTX_SCHED_AUTO: u32 = 0;

/// Actively spin when waiting for results from the GPU.
pub const CU_CTX_SCHED_SPIN: u32 = 1;

/// Yield the CPU when waiting for results from the GPU.
pub const CU_CTX_SCHED_YIELD: u32 = 2;

/// Block the calling thread when waiting for results.
pub const CU_CTX_SCHED_BLOCKING_SYNC: u32 = 4;

/// Mask for the scheduling flags.
pub const CU_CTX_SCHED_MASK: u32 = 0x07;

/// Support mapped pinned allocations.
pub const CU_CTX_MAP_HOST: u32 = 0x08;

/// Keep local memory allocation after launch.
pub const CU_CTX_LMEM_RESIZE_TO_MAX: u32 = 0x10;

/// Coredump enable.
pub const CU_CTX_COREDUMP_ENABLE: u32 = 0x20;

/// User coredump enable.
pub const CU_CTX_USER_COREDUMP_ENABLE: u32 = 0x40;

/// Sync-memops flag.
pub const CU_CTX_SYNC_MEMOPS: u32 = 0x80;

/// Mask for all context flags.
pub const CU_CTX_FLAGS_MASK: u32 = 0xFF;

// =========================================================================
// Function attribute values (used with cuFuncGetAttribute)
// =========================================================================
//
// These values mirror `CUfunction_attribute_enum`. They are `i32` because
// `cuFuncGetAttribute` takes an `int` attribute parameter in the C API.

/// Maximum threads per block for this function.
pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: i32 = 0;

/// Shared memory used by this function (bytes).
pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: i32 = 1;

/// Size of user-allocated constant memory (bytes).
pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: i32 = 2;

/// Size of local memory used by each thread (bytes).
pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: i32 = 3;

/// Number of registers used by each thread.
pub const CU_FUNC_ATTRIBUTE_NUM_REGS: i32 = 4;

/// PTX virtual architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: i32 = 5;

/// Binary architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: i32 = 6;

/// Whether the function was compiled with the `-Xptxas --dlcm=ca`
/// cache-mode option (cache all, including globals, in L1).
pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: i32 = 7;

/// Maximum dynamic shared memory size (bytes).
pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: i32 = 8;

/// Preferred shared memory carve-out.
pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: i32 = 9;

/// Cluster size setting.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: i32 = 10;

/// Required cluster width.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: i32 = 11;

/// Required cluster height.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: i32 = 12;

/// Required cluster depth.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: i32 = 13;

/// Non-portable cluster size allowed.
pub const CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: i32 = 14;

/// Required cluster scheduling policy preference.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: i32 = 15;

// =========================================================================
// Memory advise values
// =========================================================================
//
// These values mirror `CUmem_advise_enum`, passed to `cuMemAdvise` for
// managed-memory usage hints.

/// Hint that the data will be read mostly.
pub const CU_MEM_ADVISE_SET_READ_MOSTLY: u32 = 1;

/// Unset read-mostly hint.
pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: u32 = 2;

/// Set the preferred location to the specified device.
pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: u32 = 3;

/// Unset the preferred location.
pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: u32 = 4;

/// Set access from the specified device.
pub const CU_MEM_ADVISE_SET_ACCESSED_BY: u32 = 5;

/// Unset access from the specified device.
pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: u32 = 6;

// =========================================================================
// Limit values (cuCtxSetLimit / cuCtxGetLimit)
// =========================================================================
//
// These values mirror `CUlimit_enum`.

/// Stack size for each GPU thread.
pub const CU_LIMIT_STACK_SIZE: u32 = 0;

/// Size of the printf FIFO.
pub const CU_LIMIT_PRINTF_FIFO_SIZE: u32 = 1;

/// Size of the heap used by `malloc()` on the device.
pub const CU_LIMIT_MALLOC_HEAP_SIZE: u32 = 2;

/// Maximum nesting depth of a device runtime launch.
pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: u32 = 3;

/// Maximum number of outstanding device runtime launches.
pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: u32 = 4;

/// L2 cache fetch granularity.
pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: u32 = 5;

/// Maximum persisting L2 cache size.
pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: u32 = 6;

// =========================================================================
// Occupancy flags
// =========================================================================
//
// Flags for the `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
// family of queries.

/// Default occupancy calculation.
pub const CU_OCCUPANCY_DEFAULT: u32 = 0;

/// Disable caching override.
pub const CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE: u32 = 1;
1176
1177// =========================================================================
1178// cuLaunchKernelEx cluster launch types (CUDA 12.x)
1179// =========================================================================
1180
/// Attribute identifier for `CuLaunchAttribute`.
///
/// Controls which extended kernel launch feature is configured.
/// Used with `cuLaunchKernelEx` (CUDA 12.0+).
///
/// Mirrors `CUlaunchAttributeID_enum` in `cuda.h`; the discriminants must
/// match the C enum exactly because they are passed to the driver unchanged
/// (e.g. `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` is 4, not 2 — the previous
/// values here would have configured the wrong attribute). Values 1-3
/// (access-policy window, cooperative launch, synchronization policy) are
/// not currently exposed by this wrapper.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum CuLaunchAttributeId {
    /// Ignored attribute entry (`CU_LAUNCH_ATTRIBUTE_IGNORE`) — a
    /// placeholder that the driver skips when scanning attribute arrays.
    IgnoreSharedMemoryReuse = 0,
    /// Specifies thread block cluster dimensions (sm_90+)
    /// (`CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`).
    ClusterDimension = 4,
    /// Controls cluster scheduling policy preference
    /// (`CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`).
    ClusterSchedulingPolicyPreference = 5,
    /// Enables programmatic stream serialization
    /// (`CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`).
    ProgrammaticStreamSerialization = 6,
    /// Specifies a programmatic completion event
    /// (`CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT`).
    ProgrammaticEvent = 7,
    /// Specifies kernel launch priority (`CU_LAUNCH_ATTRIBUTE_PRIORITY`).
    Priority = 8,
    /// Maps memory synchronization domains
    /// (`CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`).
    MemSyncDomainMap = 9,
    /// Sets memory synchronization domain
    /// (`CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`).
    MemSyncDomain = 10,
    /// Specifies a launch completion event
    /// (`CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT`, CUDA 12.3+).
    LaunchCompletionEvent = 12,
    /// Configures device-updatable kernel node
    /// (`CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE`, CUDA 12.3+).
    DeviceUpdatableKernelNode = 13,
}
1209
/// Cluster dimension for thread block clusters (sm_90+).
///
/// Specifies how many thread blocks form one cluster in each dimension.
/// Used inside [`CuLaunchAttributeValue`] when the attribute id is
/// [`CuLaunchAttributeId::ClusterDimension`]. Mirrors the `clusterDim`
/// member of `CUlaunchAttributeValue` in `cuda.h` (three consecutive
/// `unsigned int`s).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(C)]
pub struct CuLaunchAttributeClusterDim {
    /// Cluster extent in X dimension.
    pub x: u32,
    /// Cluster extent in Y dimension.
    pub y: u32,
    /// Cluster extent in Z dimension.
    pub z: u32,
}
1225
/// Value union for `CuLaunchAttribute`.
///
/// # Safety
///
/// This is a C union — callers must only read the field that matches
/// the accompanying [`CuLaunchAttributeId`] discriminant.
/// Padding ensures the union is always 64 bytes, matching the CUDA ABI.
#[repr(C)]
pub union CuLaunchAttributeValue {
    /// Cluster dimension configuration (when id == `ClusterDimension`).
    pub cluster_dim: CuLaunchAttributeClusterDim,
    /// Scalar u32 value (for single-word attributes).
    pub value_u32: u32,
    /// Raw padding to maintain 64-byte ABI alignment.
    pub pad: [u8; 64],
}

// Manual Clone/Copy for the union (derive cannot handle unions with non-Copy
// fields, but all union fields here are effectively POD).
// `Copy` is declared first so that the `Clone` impl can delegate to it.
impl Copy for CuLaunchAttributeValue {}

impl Clone for CuLaunchAttributeValue {
    fn clone(&self) -> Self {
        // Delegate to Copy — canonical approach for Copy types.
        // A bitwise copy is sound for any union of plain-old-data fields.
        *self
    }
}
1254
/// A single extended kernel launch attribute (id + value pair).
///
/// Used in the `attrs` array of [`CuLaunchConfig`]. Mirrors
/// `CUlaunchAttribute_st` in `cuda.h`, including its 4 bytes of explicit
/// padding between the 4-byte id and the 64-byte value union.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct CuLaunchAttribute {
    /// Which feature this attribute configures.
    pub id: CuLaunchAttributeId,
    /// Alignment padding (must be zero).
    pub pad: [u8; 4],
    /// The attribute value — interpret according to `id`.
    pub value: CuLaunchAttributeValue,
}
1268
/// Extended kernel launch configuration for `cuLaunchKernelEx` (CUDA 12.0+).
///
/// Mirrors `CUlaunchConfig_st` in `cuda.h`. Supersedes the individual
/// parameters of `cuLaunchKernel` and adds support for thread block
/// clusters, launch priorities, and other CUDA 12.x features.
///
/// # Example
///
/// ```rust
/// use oxicuda_driver::ffi::{
///     CuLaunchConfig, CuLaunchAttribute, CuLaunchAttributeId,
///     CuLaunchAttributeValue, CuLaunchAttributeClusterDim, CUstream,
/// };
///
/// // Build a cluster-launch config for a 2×1×1 cluster.
/// let cluster_attr = CuLaunchAttribute {
///     id: CuLaunchAttributeId::ClusterDimension,
///     pad: [0u8; 4],
///     value: CuLaunchAttributeValue {
///         cluster_dim: CuLaunchAttributeClusterDim { x: 2, y: 1, z: 1 },
///     },
/// };
/// let _config = CuLaunchConfig {
///     grid_dim_x: 8,
///     grid_dim_y: 1,
///     grid_dim_z: 1,
///     block_dim_x: 256,
///     block_dim_y: 1,
///     block_dim_z: 1,
///     shared_mem_bytes: 0,
///     stream: CUstream::default(),
///     attrs: std::ptr::null(),
///     num_attrs: 0,
/// };
/// ```
#[repr(C)]
pub struct CuLaunchConfig {
    /// Grid dimension in X (mirrors `gridDimX`).
    pub grid_dim_x: u32,
    /// Grid dimension in Y.
    pub grid_dim_y: u32,
    /// Grid dimension in Z.
    pub grid_dim_z: u32,
    /// Block dimension in X (threads per block in X, mirrors `blockDimX`).
    pub block_dim_x: u32,
    /// Block dimension in Y.
    pub block_dim_y: u32,
    /// Block dimension in Z.
    pub block_dim_z: u32,
    /// Dynamic shared memory per block in bytes (mirrors `sharedMemBytes`).
    pub shared_mem_bytes: u32,
    /// Stream to submit the kernel on (mirrors `hStream`).
    pub stream: CUstream,
    /// Pointer to an array of `num_attrs` attributes (may be null if zero).
    pub attrs: *const CuLaunchAttribute,
    /// Number of entries in `attrs` (mirrors `numAttrs`).
    pub num_attrs: u32,
}

// SAFETY: CuLaunchConfig is a plain data structure mirroring the CUDA ABI.
// The raw pointer `attrs` must be valid for the lifetime of the config, but
// the struct itself is Send + Sync because no interior mutation occurs.
unsafe impl Send for CuLaunchConfig {}
unsafe impl Sync for CuLaunchConfig {}
1333
1334// =========================================================================
1335// CUarray / CUmipmappedArray — opaque CUDA array handles
1336// =========================================================================
1337
define_handle! {
    /// Opaque handle to a CUDA array (1-D, 2-D, or 3-D texture memory).
    ///
    /// Allocated by `cuArrayCreate_v2` / `cuArray3DCreate_v2` and freed by
    /// `cuArrayDestroy`. Arrays can be bound to texture objects via
    /// [`CUDA_RESOURCE_DESC`].
    CUarray
}

define_handle! {
    /// Opaque handle to a CUDA mipmapped array (Mip-mapped texture memory).
    ///
    /// Allocated by `cuMipmappedArrayCreate` and freed by
    /// `cuMipmappedArrayDestroy`. Individual mip levels are retrieved as
    /// [`CUarray`] handles via `cuMipmappedArrayGetLevel`.
    CUmipmappedArray
}
1354
1355// =========================================================================
1356// CUarray_format — channel element format for CUDA arrays
1357// =========================================================================
1358
/// Element format for CUDA arrays.  Mirrors `CUarray_format_enum` in the
/// CUDA driver API header (`CU_AD_FORMAT_*` values).
///
/// Note that the discriminants are deliberately non-contiguous — they match
/// the C enum byte-for-byte so the value can be passed to the driver as-is.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUarray_format {
    /// 8-bit unsigned integer channel.
    UnsignedInt8 = 0x01,
    /// 16-bit unsigned integer channel.
    UnsignedInt16 = 0x02,
    /// 32-bit unsigned integer channel.
    UnsignedInt32 = 0x03,
    /// 8-bit signed integer channel.
    SignedInt8 = 0x08,
    /// 16-bit signed integer channel.
    SignedInt16 = 0x09,
    /// 32-bit signed integer channel.
    SignedInt32 = 0x0a,
    /// 16-bit IEEE 754 half-precision float channel.
    Half = 0x10,
    /// 32-bit IEEE 754 single-precision float channel.
    Float = 0x20,
    /// NV12 planar YUV format (special 2-plane layout).
    Nv12 = 0xb0,
    /// 8-bit unsigned normalized integer (1 channel).
    UnormInt8X1 = 0xc0,
    /// 8-bit unsigned normalized integer (2 channels).
    UnormInt8X2 = 0xc1,
    /// 8-bit unsigned normalized integer (4 channels).
    UnormInt8X4 = 0xc2,
    /// 16-bit unsigned normalized integer (1 channel).
    UnormInt16X1 = 0xc3,
    /// 16-bit unsigned normalized integer (2 channels).
    UnormInt16X2 = 0xc4,
    /// 16-bit unsigned normalized integer (4 channels).
    UnormInt16X4 = 0xc5,
    /// 8-bit signed normalized integer (1 channel).
    SnormInt8X1 = 0xc6,
    /// 8-bit signed normalized integer (2 channels).
    SnormInt8X2 = 0xc7,
    /// 8-bit signed normalized integer (4 channels).
    SnormInt8X4 = 0xc8,
    /// 16-bit signed normalized integer (1 channel).
    SnormInt16X1 = 0xc9,
    /// 16-bit signed normalized integer (2 channels).
    SnormInt16X2 = 0xca,
    /// 16-bit signed normalized integer (4 channels).
    SnormInt16X4 = 0xcb,
    /// BC1 compressed (DXT1) unsigned.
    Bc1Unorm = 0x91,
    /// BC1 compressed (DXT1) unsigned, sRGB.
    Bc1UnormSrgb = 0x92,
    /// BC2 compressed (DXT3) unsigned.
    Bc2Unorm = 0x93,
    /// BC2 compressed (DXT3) unsigned, sRGB.
    Bc2UnormSrgb = 0x94,
    /// BC3 compressed (DXT5) unsigned.
    Bc3Unorm = 0x95,
    /// BC3 compressed (DXT5) unsigned, sRGB.
    Bc3UnormSrgb = 0x96,
    /// BC4 unsigned.
    Bc4Unorm = 0x97,
    /// BC4 signed.
    Bc4Snorm = 0x98,
    /// BC5 unsigned.
    Bc5Unorm = 0x99,
    /// BC5 signed.
    Bc5Snorm = 0x9a,
    /// BC6H unsigned 16-bit float.
    Bc6hUf16 = 0x9b,
    /// BC6H signed 16-bit float.
    Bc6hSf16 = 0x9c,
    /// BC7 unsigned.
    Bc7Unorm = 0x9d,
    /// BC7 unsigned, sRGB.
    Bc7UnormSrgb = 0x9e,
}
1437
1438// =========================================================================
1439// CUresourcetype — resource type for texture/surface objects
1440// =========================================================================
1441
/// Resource type discriminant for [`CUDA_RESOURCE_DESC`].
///
/// Mirrors `CUresourcetype_enum` (`CU_RESOURCE_TYPE_*`); selects which
/// field of [`CudaResourceDescRes`] is valid.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUresourcetype {
    /// CUDA array resource.
    Array = 0x00,
    /// CUDA mipmapped array resource.
    MipmappedArray = 0x01,
    /// Linear memory resource (1-D, no filtering beyond point).
    Linear = 0x02,
    /// Pitched 2-D linear memory resource.
    Pitch2d = 0x03,
}
1456
1457// =========================================================================
1458// CUaddress_mode — texture coordinate wrapping mode
1459// =========================================================================
1460
/// Texture coordinate address-wrap mode for [`CUDA_TEXTURE_DESC`].
///
/// Mirrors `CUaddress_mode_enum` (`CU_TR_ADDRESS_MODE_*`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum CUaddress_mode {
    /// Wrap (tiles) — coordinates outside [0, dim) wrap around.
    Wrap = 0,
    /// Clamp — coordinates are clamped to [0, dim-1].
    Clamp = 1,
    /// Mirror — coordinates are mirrored across array boundaries.
    Mirror = 2,
    /// Border — out-of-range coordinates return the border color.
    Border = 3,
}
1475
1476// =========================================================================
1477// CUfilter_mode — texture / mipmap filtering mode
1478// =========================================================================
1479
/// Texture / mipmap sampling filter mode for [`CUDA_TEXTURE_DESC`].
///
/// Mirrors `CUfilter_mode_enum` (`CU_TR_FILTER_MODE_*`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum CUfilter_mode {
    /// Nearest-neighbor (point) sampling.
    Point = 0,
    /// Bilinear (linear) filtering.
    Linear = 1,
}
1490
1491// =========================================================================
1492// CUresourceViewFormat — re-interpretation format for resource views
1493// =========================================================================
1494
/// Format used to re-interpret a CUDA array in a resource view.
///
/// Mirrors `CUresourceViewFormat_enum` in the CUDA driver API header
/// (`CU_RES_VIEW_FORMAT_*`); the discriminants match the C enum exactly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUresourceViewFormat {
    /// No re-interpretation (use the array's own format).
    None = 0x00,
    /// Re-interpret as 1×8-bit unsigned integer.
    Uint1x8 = 0x01,
    /// Re-interpret as 2×8-bit unsigned integer.
    Uint2x8 = 0x02,
    /// Re-interpret as 4×8-bit unsigned integer.
    Uint4x8 = 0x03,
    /// Re-interpret as 1×8-bit signed integer.
    Sint1x8 = 0x04,
    /// Re-interpret as 2×8-bit signed integer.
    Sint2x8 = 0x05,
    /// Re-interpret as 4×8-bit signed integer.
    Sint4x8 = 0x06,
    /// Re-interpret as 1×16-bit unsigned integer.
    Uint1x16 = 0x07,
    /// Re-interpret as 2×16-bit unsigned integer.
    Uint2x16 = 0x08,
    /// Re-interpret as 4×16-bit unsigned integer.
    Uint4x16 = 0x09,
    /// Re-interpret as 1×16-bit signed integer.
    Sint1x16 = 0x0a,
    /// Re-interpret as 2×16-bit signed integer.
    Sint2x16 = 0x0b,
    /// Re-interpret as 4×16-bit signed integer.
    Sint4x16 = 0x0c,
    /// Re-interpret as 1×32-bit unsigned integer.
    Uint1x32 = 0x0d,
    /// Re-interpret as 2×32-bit unsigned integer.
    Uint2x32 = 0x0e,
    /// Re-interpret as 4×32-bit unsigned integer.
    Uint4x32 = 0x0f,
    /// Re-interpret as 1×32-bit signed integer.
    Sint1x32 = 0x10,
    /// Re-interpret as 2×32-bit signed integer.
    Sint2x32 = 0x11,
    /// Re-interpret as 4×32-bit signed integer.
    Sint4x32 = 0x12,
    /// Re-interpret as 1×16-bit float.
    Float1x16 = 0x13,
    /// Re-interpret as 2×16-bit float.
    Float2x16 = 0x14,
    /// Re-interpret as 4×16-bit float.
    Float4x16 = 0x15,
    /// Re-interpret as 1×32-bit float.
    Float1x32 = 0x16,
    /// Re-interpret as 2×32-bit float.
    Float2x32 = 0x17,
    /// Re-interpret as 4×32-bit float.
    Float4x32 = 0x18,
    /// BC1 unsigned normal compressed.
    UnsignedBc1 = 0x19,
    /// BC2 unsigned normal compressed.
    UnsignedBc2 = 0x1a,
    /// BC3 unsigned normal compressed.
    UnsignedBc3 = 0x1b,
    /// BC4 unsigned normal compressed.
    UnsignedBc4 = 0x1c,
    /// BC4 signed normal compressed.
    SignedBc4 = 0x1d,
    /// BC5 unsigned normal compressed.
    UnsignedBc5 = 0x1e,
    /// BC5 signed normal compressed.
    SignedBc5 = 0x1f,
    /// BC6H unsigned half-float.
    UnsignedBc6h = 0x20,
    /// BC6H signed half-float.
    SignedBc6h = 0x21,
    /// BC7 unsigned.
    UnsignedBc7 = 0x22,
    /// NV12 planar YUV.
    Nv12 = 0x23,
}
1575
1576// =========================================================================
1577// CUDA_ARRAY_DESCRIPTOR — descriptor for 1-D and 2-D CUDA arrays
1578// =========================================================================
1579
/// Descriptor passed to `cuArrayCreate_v2` / `cuArrayGetDescriptor_v2`.
///
/// Mirrors `CUDA_ARRAY_DESCRIPTOR_v2` in the CUDA driver API.
/// Rust `usize` stands in for the C `size_t` fields, which matches on the
/// platforms the driver supports.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUDA_ARRAY_DESCRIPTOR {
    /// Width of the array in elements.
    pub width: usize,
    /// Height of the array in elements (0 for 1-D arrays).
    pub height: usize,
    /// Element format (data type of each channel).
    pub format: CUarray_format,
    /// Number of channels (1, 2, or 4).
    pub num_channels: u32,
}
1595
1596// =========================================================================
1597// CUDA_ARRAY3D_DESCRIPTOR — descriptor for 3-D CUDA arrays
1598// =========================================================================
1599
/// Descriptor passed to `cuArray3DCreate_v2` / `cuArray3DGetDescriptor_v2`.
///
/// Mirrors `CUDA_ARRAY3D_DESCRIPTOR_v2` in the CUDA driver API.  The `flags`
/// field accepts constants such as `CUDA_ARRAY3D_LAYERED` (0x01),
/// `CUDA_ARRAY3D_SURFACE_LDST` (0x02), `CUDA_ARRAY3D_CUBEMAP` (0x04), and
/// `CUDA_ARRAY3D_TEXTURE_GATHER` (0x08).  Rust `usize` stands in for the C
/// `size_t` extent fields.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUDA_ARRAY3D_DESCRIPTOR {
    /// Width of the array in elements.
    pub width: usize,
    /// Height of the array in elements (0 for 1-D arrays).
    pub height: usize,
    /// Depth of the array in elements (0 for 1-D and 2-D arrays).
    pub depth: usize,
    /// Element format.
    pub format: CUarray_format,
    /// Number of channels (1, 2, or 4).
    pub num_channels: u32,
    /// Creation flags (see [`CUDA_ARRAY3D_LAYERED`] etc.).
    pub flags: u32,
}

/// Flag: allocate a layered CUDA array (`CUDA_ARRAY3D_LAYERED`).
pub const CUDA_ARRAY3D_LAYERED: u32 = 0x01;
/// Flag: array usable as a surface load/store target (`CUDA_ARRAY3D_SURFACE_LDST`).
pub const CUDA_ARRAY3D_SURFACE_LDST: u32 = 0x02;
/// Flag: allocate a cubemap array (`CUDA_ARRAY3D_CUBEMAP`).
pub const CUDA_ARRAY3D_CUBEMAP: u32 = 0x04;
/// Flag: array usable with `cudaTextureGather` (`CUDA_ARRAY3D_TEXTURE_GATHER`).
pub const CUDA_ARRAY3D_TEXTURE_GATHER: u32 = 0x08;
1631
1632// =========================================================================
1633// CUDA_RESOURCE_DESC — resource descriptor union for tex/surf objects
1634// =========================================================================
1635
/// Inner data for an `Array` resource (variant of [`CudaResourceDescRes`]).
///
/// Mirrors the `res.array` member of the C `CUDA_RESOURCE_DESC`.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescArray {
    /// CUDA array handle.
    pub h_array: CUarray,
}

/// Inner data for a `MipmappedArray` resource.
///
/// Mirrors the `res.mipmap` member of the C `CUDA_RESOURCE_DESC`.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescMipmap {
    /// Mipmapped array handle.
    pub h_mipmapped_array: CUmipmappedArray,
}

/// Inner data for a `Linear` (1-D linear memory) resource.
///
/// Mirrors the `res.linear` member of the C `CUDA_RESOURCE_DESC`.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescLinear {
    /// Device pointer to the linear region.
    pub dev_ptr: CUdeviceptr,
    /// Channel element format.
    pub format: CUarray_format,
    /// Number of channels.
    pub num_channels: u32,
    /// Total size in bytes.
    pub size_in_bytes: usize,
}

/// Inner data for a `Pitch2D` (2-D pitched linear memory) resource.
///
/// Mirrors the `res.pitch2D` member of the C `CUDA_RESOURCE_DESC`.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescPitch2d {
    /// Device pointer to the pitched region (first row).
    pub dev_ptr: CUdeviceptr,
    /// Channel element format.
    pub format: CUarray_format,
    /// Number of channels.
    pub num_channels: u32,
    /// Width of the array in elements.
    pub width_in_elements: usize,
    /// Height of the array in elements.
    pub height: usize,
    /// Row pitch in bytes (stride between rows).
    pub pitch_in_bytes: usize,
}
1683
1684/// Union of resource descriptors for [`CUDA_RESOURCE_DESC`].
1685///
1686/// # Safety
1687///
1688/// Callers must only read the field whose discriminant matches the
1689/// `res_type` field of the enclosing [`CUDA_RESOURCE_DESC`].
1690#[repr(C)]
1691pub union CudaResourceDescRes {
1692    /// Array resource.
1693    pub array: CudaResourceDescArray,
1694    /// Mipmapped array resource.
1695    pub mipmap: CudaResourceDescMipmap,
1696    /// 1-D linear memory resource.
1697    pub linear: CudaResourceDescLinear,
1698    /// 2-D pitched linear memory resource.
1699    pub pitch2d: CudaResourceDescPitch2d,
1700    /// Padding: ensures the union is 128 bytes (32 × i32), matching the ABI.
1701    pub reserved: [i32; 32],
1702}
1703
1704/// Resource descriptor passed to `cuTexObjectCreate` / `cuSurfObjectCreate`.
1705///
1706/// Mirrors `CUDA_RESOURCE_DESC` in the CUDA driver API header.
1707#[repr(C)]
1708pub struct CUDA_RESOURCE_DESC {
1709    /// Identifies which union field inside `res` is valid.
1710    pub res_type: CUresourcetype,
1711    /// Resource payload — interpret via `res_type`.
1712    pub res: CudaResourceDescRes,
1713    /// Reserved flags (must be zero).
1714    pub flags: u32,
1715}
1716
1717// =========================================================================
1718// CUDA_TEXTURE_DESC — texture object sampling parameters
1719// =========================================================================
1720
/// Texture object descriptor passed to `cuTexObjectCreate`.
///
/// Mirrors `CUDA_TEXTURE_DESC` in the CUDA driver API.  All fields that the
/// caller does not set explicitly should be zeroed.
///
/// # Layout
///
/// The struct is `#[repr(C)]` and contains 48 bytes of reserved padding
/// (`[i32; 12]`) so that it matches the binary ABI expected by the driver.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CUDA_TEXTURE_DESC {
    /// Address mode for each coordinate dimension (`[U, V, W]`).
    pub address_mode: [CUaddress_mode; 3],
    /// Texture filter mode (point or linear).
    pub filter_mode: CUfilter_mode,
    /// Combination of `CU_TRSF_*` flags declared below:
    /// `CU_TRSF_READ_AS_INTEGER` (0x01), `CU_TRSF_NORMALIZED_COORDINATES`
    /// (0x02), `CU_TRSF_SRGB` (0x10), and
    /// `CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION` (0x20).
    pub flags: u32,
    /// Maximum anisotropy ratio (1–16; 1 disables anisotropy).
    pub max_anisotropy: u32,
    /// Mipmap filter mode.
    pub mipmap_filter_mode: CUfilter_mode,
    /// Mipmap level-of-detail bias.
    pub mipmap_level_bias: f32,
    /// Minimum mipmap LOD clamp value.
    pub min_mipmap_level_clamp: f32,
    /// Maximum mipmap LOD clamp value.
    pub max_mipmap_level_clamp: f32,
    /// Border color (RGBA, applied when address mode is `Border`).
    pub border_color: [f32; 4],
    /// Reserved: must be zero.
    pub reserved: [i32; 12],
}
1755
/// Flag: texture reads return raw integers (no conversion to float).
pub const CU_TRSF_READ_AS_INTEGER: u32 = 0x01;
/// Flag: texture coordinates are normalized to [0, 1).
pub const CU_TRSF_NORMALIZED_COORDINATES: u32 = 0x02;
/// Flag: sRGB gamma conversion is applied during sampling.
///
/// Note the jump from 0x02 to 0x10: the intermediate bit values belong to
/// other `CU_TRSF_*` flags in `cuda.h` that are not mirrored here.
pub const CU_TRSF_SRGB: u32 = 0x10;
/// Flag: disable hardware trilinear optimisation.
pub const CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION: u32 = 0x20;
1764
1765// =========================================================================
1766// CUDA_RESOURCE_VIEW_DESC — optional re-interpretation of array resources
1767// =========================================================================
1768
/// Optional resource view descriptor for `cuTexObjectCreate`.
///
/// Allows the caller to specify a sub-region, a different channel
/// interpretation format, or a mipmap range for a [`CUDA_RESOURCE_DESC`] that
/// wraps a CUDA array.  Pass a null pointer to `cuTexObjectCreate` to skip the
/// view override.
///
/// Mirrors `CUDA_RESOURCE_VIEW_DESC` in the CUDA driver API.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CUDA_RESOURCE_VIEW_DESC {
    /// Format to use for the resource view (re-interpretation of the
    /// underlying resource's channel format).
    pub format: CUresourceViewFormat,
    /// Width of the view in elements.
    pub width: usize,
    /// Height of the view in elements.
    pub height: usize,
    /// Depth of the view in elements.
    pub depth: usize,
    /// First mipmap level included in the view (inclusive).
    pub first_mipmap_level: u32,
    /// Last mipmap level included in the view (inclusive).
    pub last_mipmap_level: u32,
    /// First array layer in a layered resource (inclusive).
    pub first_layer: u32,
    /// Last array layer in a layered resource (inclusive).
    pub last_layer: u32,
    /// Reserved: must be zeroed to match the driver ABI.
    pub reserved: [u32; 16],
}
1799
1800// =========================================================================
1801// Tests
1802// =========================================================================
1803
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// True when `T` occupies exactly one machine pointer — the invariant
    /// every opaque CUDA handle type in this module must satisfy.
    fn ptr_sized<T>() -> bool {
        size_of::<T>() == size_of::<*mut c_void>()
    }

    #[test]
    fn test_cuda_success_is_zero() {
        // The driver API reserves 0 for success.
        assert_eq!(CUDA_SUCCESS, 0);
    }

    #[test]
    fn test_opaque_types_are_pointer_sized() {
        assert!(ptr_sized::<CUcontext>());
        assert!(ptr_sized::<CUmodule>());
        assert!(ptr_sized::<CUstream>());
        assert!(ptr_sized::<CUevent>());
        assert!(ptr_sized::<CUfunction>());
        assert!(ptr_sized::<CUmemoryPool>());
    }

    #[test]
    fn test_handle_default_is_null() {
        // `Default` must produce the null handle for every opaque type.
        assert!(CUcontext::default().is_null());
        assert!(CUmodule::default().is_null());
        assert!(CUfunction::default().is_null());
        assert!(CUstream::default().is_null());
        assert!(CUevent::default().is_null());
        assert!(CUmemoryPool::default().is_null());
    }

    #[test]
    fn test_device_attribute_repr() {
        type A = CUdevice_attribute;
        // (discriminant, expected value) — must match `cuda.h`.
        let cases: &[(i32, i32)] = &[
            // Original variants
            (A::MaxThreadsPerBlock as i32, 1),
            (A::WarpSize as i32, 10),
            (A::MultiprocessorCount as i32, 16),
            (A::ComputeCapabilityMajor as i32, 75),
            (A::ComputeCapabilityMinor as i32, 76),
            (A::MaxBlocksPerMultiprocessor as i32, 106),
            (A::L2CacheSize as i32, 38),
            (A::MaxSharedMemoryPerMultiprocessor as i32, 81),
            (A::ManagedMemory as i32, 83),
            // New variants
            (A::MaxTexture2DGatherWidth as i32, 44),
            (A::MaxTexture2DGatherHeight as i32, 45),
            (A::MaxTexture3DWidthAlt as i32, 47),
            (A::MaxTexture3DHeightAlt as i32, 48),
            (A::MaxTexture3DDepthAlt as i32, 49),
            (A::MaxTexture1DMipmappedWidth2 as i32, 52),
            (A::Reserved92 as i32, 92),
            (A::Reserved93 as i32, 93),
            (A::Reserved94 as i32, 94),
            (A::VirtualMemoryManagementSupported as i32, 102),
            (A::HandleTypePosixFileDescriptorSupported as i32, 103),
            (A::HandleTypeWin32HandleSupported as i32, 104),
            (A::HandleTypeWin32KmtHandleSupported as i32, 105),
            (A::AccessPolicyMaxWindowSize as i32, 111),
            (A::ReservedSharedMemoryPerBlock as i32, 112),
            (A::TimelineSemaphoreInteropSupported as i32, 113),
            (A::MemoryPoolsSupported as i32, 115),
            (A::ClusterLaunch as i32, 120),
            (A::UnifiedFunctionPointers as i32, 125),
            (A::MaxTimelineSemaphoreInteropSupported as i32, 129),
            (A::MemSyncDomainSupported as i32, 130),
            (A::GpuDirectRdmaFabricSupported as i32, 131),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_jit_option_repr() {
        type J = CUjit_option;
        let cases: &[(u32, u32)] = &[
            (J::MaxRegisters as u32, 0),
            (J::ThreadsPerBlock as u32, 1),
            (J::WallTime as u32, 2),
            (J::InfoLogBuffer as u32, 3),
            (J::InfoLogBufferSizeBytes as u32, 4),
            (J::ErrorLogBuffer as u32, 5),
            (J::ErrorLogBufferSizeBytes as u32, 6),
            (J::OptimizationLevel as u32, 7),
            (J::Target as u32, 9),
            (J::FallbackStrategy as u32, 10),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_stream_and_event_flags() {
        // Stream creation flags.
        for &(got, want) in &[(CU_STREAM_DEFAULT, 0), (CU_STREAM_NON_BLOCKING, 1)] {
            assert_eq!(got, want);
        }
        // Event creation flags.
        for &(got, want) in &[
            (CU_EVENT_DEFAULT, 0),
            (CU_EVENT_BLOCKING_SYNC, 1),
            (CU_EVENT_DISABLE_TIMING, 2),
            (CU_EVENT_INTERPROCESS, 4),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_context_scheduling_flags() {
        for &(got, want) in &[
            (CU_CTX_SCHED_AUTO, 0),
            (CU_CTX_SCHED_SPIN, 1),
            (CU_CTX_SCHED_YIELD, 2),
            (CU_CTX_SCHED_BLOCKING_SYNC, 4),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_mem_attach_flags() {
        for &(got, want) in &[
            (CU_MEM_ATTACH_GLOBAL, 1),
            (CU_MEM_ATTACH_HOST, 2),
            (CU_MEM_ATTACH_SINGLE, 4),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    #[allow(clippy::assertions_on_constants)]
    fn test_error_code_ranges() {
        // Basic errors live below 10.
        assert!(CUDA_ERROR_INVALID_VALUE < 10);
        // Device-related errors occupy 100..=102.
        for code in [
            CUDA_ERROR_NO_DEVICE,
            CUDA_ERROR_INVALID_DEVICE,
            CUDA_ERROR_DEVICE_NOT_LICENSED,
        ] {
            assert!((100..=102).contains(&code));
        }
        // Image/context errors start at 200.
        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
        // Launch errors start at 700.
        for code in [
            CUDA_ERROR_LAUNCH_FAILED,
            CUDA_ERROR_ILLEGAL_ADDRESS,
            CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
        ] {
            assert!(code >= 700);
        }
        // Stream capture errors start at 900; "unknown" is pinned at 999.
        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
    }

    #[test]
    fn test_func_attribute_constants() {
        for &(got, want) in &[
            (CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 0),
            (CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, 1),
            (CU_FUNC_ATTRIBUTE_NUM_REGS, 4),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_limit_constants() {
        for &(got, want) in &[
            (CU_LIMIT_STACK_SIZE, 0),
            (CU_LIMIT_PRINTF_FIFO_SIZE, 1),
            (CU_LIMIT_MALLOC_HEAP_SIZE, 2),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_memory_type_constants() {
        for &(got, want) in &[
            (CU_MEMORYTYPE_HOST, 1),
            (CU_MEMORYTYPE_DEVICE, 2),
            (CU_MEMORYTYPE_ARRAY, 3),
            (CU_MEMORYTYPE_UNIFIED, 4),
        ] {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_handle_debug_format() {
        // The Debug impl renders the type name followed by the pointer.
        let rendered = format!("{:?}", CUcontext::default());
        assert!(rendered.starts_with("CUcontext("));
    }

    #[test]
    fn test_handle_equality() {
        // Two default (null) handles compare equal.
        assert_eq!(CUcontext::default(), CUcontext::default());
    }

    #[test]
    fn test_new_handle_types_are_pointer_sized() {
        assert!(ptr_sized::<CUtexref>());
        assert!(ptr_sized::<CUsurfref>());
        assert!(ptr_sized::<CUtexObject>());
        assert!(ptr_sized::<CUsurfObject>());
    }

    #[test]
    fn test_new_handle_defaults_are_null() {
        assert!(CUtexref::default().is_null());
        assert!(CUsurfref::default().is_null());
        assert!(CUtexObject::default().is_null());
        assert!(CUsurfObject::default().is_null());
    }

    #[test]
    fn test_memory_type_enum() {
        type M = CUmemorytype;
        let cases: &[(u32, u32)] = &[
            (M::Host as u32, 1),
            (M::Device as u32, 2),
            (M::Array as u32, 3),
            (M::Unified as u32, 4),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_pointer_attribute_enum() {
        type P = CUpointer_attribute;
        let cases: &[(u32, u32)] = &[
            (P::Context as u32, 1),
            (P::MemoryType as u32, 2),
            (P::DevicePointer as u32, 3),
            (P::HostPointer as u32, 4),
            (P::IsManaged as u32, 9),
            (P::DeviceOrdinal as u32, 10),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_limit_enum() {
        type L = CUlimit;
        let cases: &[(u32, u32)] = &[
            (L::StackSize as u32, 0),
            (L::PrintfFifoSize as u32, 1),
            (L::MallocHeapSize as u32, 2),
            (L::DevRuntimeSyncDepth as u32, 3),
            (L::DevRuntimePendingLaunchCount as u32, 4),
            (L::MaxL2FetchGranularity as u32, 5),
            (L::PersistingL2CacheSize as u32, 6),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn test_function_attribute_enum() {
        type F = CUfunction_attribute;
        let cases: &[(i32, i32)] = &[
            (F::MaxThreadsPerBlock as i32, 0),
            (F::SharedSizeBytes as i32, 1),
            (F::NumRegs as i32, 4),
            (F::PtxVersion as i32, 5),
            (F::BinaryVersion as i32, 6),
            (F::MaxDynamicSharedSizeBytes as i32, 8),
            (F::PreferredSharedMemoryCarveout as i32, 9),
        ];
        for &(got, want) in cases {
            assert_eq!(got, want);
        }
    }
}