// oxicuda_driver/ffi.rs
1//! Raw CUDA Driver API FFI types, constants, and enums.
2//!
3//! This module provides the low-level type definitions that mirror the CUDA Driver API
4//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
5//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
6//! entry points.
7//!
8//! # Safety
9//!
10//! All pointer types in this module are raw pointers intended for FFI use.
11//! They must only be used through the safe wrappers provided by higher-level
12//! modules in `oxicuda-driver`.
13
14use std::ffi::c_void;
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// Core scalar type aliases
19// ---------------------------------------------------------------------------
20
/// Return code from every CUDA Driver API call.
///
/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
/// error code. See the `CUDA_*` constants below for the full catalogue.
///
/// `cuda.h` declares `CUresult` as a C enum; a plain `u32` alias is
/// ABI-compatible with its `unsigned int` representation.
pub type CUresult = u32;

/// Ordinal identifier for a CUDA-capable device (0-based).
///
/// Mirrors `CUdevice` in `cuda.h`, which is a plain `int`.
pub type CUdevice = i32;

/// Device-side pointer (64-bit address in GPU virtual memory).
///
/// Mirrors `CUdeviceptr` (`unsigned long long` in `cuda.h` under the
/// post-3.2 "v2" ABI).
pub type CUdeviceptr = u64;
32
33// ---------------------------------------------------------------------------
34// Opaque handle helpers
35// ---------------------------------------------------------------------------
36
/// Defines a `#[repr(transparent)]` newtype around a raw `*mut c_void`,
/// modelling an opaque CUDA driver handle.
///
/// For each handle type the macro generates:
/// * `Clone`/`Copy`/`PartialEq`/`Eq`/`Hash` derives,
/// * `Send`/`Sync` impls (see SAFETY note below),
/// * a `Debug` impl printing `Name(0x…)`,
/// * a `Default` impl producing a null handle,
/// * an `is_null()` helper for detecting the uninitialised state.
///
/// All std items are referenced by fully-qualified `::std::…` paths so the
/// expansion does not depend on which `use` items happen to be in scope at
/// the invocation site (the previous version required `use std::ffi::c_void;`
/// and `use std::fmt;` wherever the macro was invoked).
macro_rules! define_handle {
    ($(#[$meta:meta])* $name:ident) => {
        $(#[$meta])*
        #[repr(transparent)]
        #[derive(Clone, Copy, PartialEq, Eq, Hash)]
        pub struct $name(pub *mut ::std::ffi::c_void);

        // SAFETY: CUDA handles are thread-safe when used with proper
        // synchronisation via the driver API.
        unsafe impl Send for $name {}
        unsafe impl Sync for $name {}

        impl ::std::fmt::Debug for $name {
            fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
                // Render as `Name(0xADDR)` so logs identify the handle type.
                write!(f, "{}({:p})", stringify!($name), self.0)
            }
        }

        impl Default for $name {
            /// A null (uninitialised) handle.
            fn default() -> Self {
                Self(::std::ptr::null_mut())
            }
        }

        impl $name {
            /// Returns `true` if the handle is null (uninitialised).
            #[inline]
            pub fn is_null(self) -> bool {
                self.0.is_null()
            }
        }
    };
}
70
71// ---------------------------------------------------------------------------
72// Handle types
73// ---------------------------------------------------------------------------
74
define_handle! {
    /// Opaque handle to a CUDA context (`CUcontext`).
    CUcontext
}

define_handle! {
    /// Opaque handle to a loaded CUDA module (PTX / cubin).
    CUmodule
}

define_handle! {
    /// Opaque handle to a CUDA kernel function within a module.
    CUfunction
}

define_handle! {
    /// Opaque handle to a CUDA stream (command queue).
    CUstream
}

define_handle! {
    /// Opaque handle to a CUDA event (used for timing and synchronisation).
    CUevent
}

define_handle! {
    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
    CUmemoryPool
}

define_handle! {
    /// Opaque handle to a CUDA texture reference (legacy API).
    CUtexref
}

define_handle! {
    /// Opaque handle to a CUDA surface reference (legacy API).
    CUsurfref
}

define_handle! {
    /// Opaque handle to a CUDA texture object (modern bindless API).
    ///
    /// NOTE(review): `cuda.h` defines `CUtexObject` as `unsigned long long`
    /// (a 64-bit value), not a pointer; a pointer-width wrapper is only
    /// ABI-compatible on 64-bit targets — verify this representation.
    CUtexObject
}

define_handle! {
    /// Opaque handle to a CUDA surface object (modern bindless API).
    ///
    /// NOTE(review): as with `CUtexObject`, `cuda.h` declares this as
    /// `unsigned long long`, not a pointer — verify.
    CUsurfObject
}

define_handle! {
    /// Opaque handle to a CUDA kernel obtained from a loaded library.
    ///
    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
    /// belongs to.
    ///
    /// NOTE(review): the library/kernel API (`cuLibraryLoad*`,
    /// `cuKernelGetLibrary`) was introduced in CUDA 12.0; the original
    /// "CUDA 12.8+" note looks wrong — verify.
    CUkernel
}

define_handle! {
    /// Opaque handle to a CUDA library (JIT library API).
    ///
    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
    /// library that contains a given kernel.
    ///
    /// NOTE(review): introduced with the CUDA 12.0 library API; the
    /// original "CUDA 12.8+" note looks wrong — verify.
    CUlibrary
}

define_handle! {
    /// Opaque handle to an NVLink multicast object.
    ///
    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
    /// functions to manage NVLink multicast memory regions across devices.
    ///
    /// NOTE(review): `cuda.h` has no `CUmulticastObject` type —
    /// `cuMulticastCreate` yields a `CUmemGenericAllocationHandle`
    /// (`unsigned long long`). Verify this pointer-shaped representation.
    CUmulticastObject
}
148
149// =========================================================================
150// CUmemorytype — memory type identifiers
151// =========================================================================
152
/// Memory type identifiers returned by pointer attribute queries.
///
/// Discriminants mirror `CU_MEMORYTYPE_*` in `cuda.h`
/// (HOST = 1, DEVICE = 2, ARRAY = 3, UNIFIED = 4).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemorytype {
    /// Host (system) memory.
    Host = 1,
    /// Device (GPU) memory.
    Device = 2,
    /// CUDA array memory.
    Array = 3,
    /// Unified (managed) memory.
    Unified = 4,
}
167
168// =========================================================================
169// CUpointer_attribute — pointer attribute query keys
170// =========================================================================
171
/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
///
/// Discriminants mirror `CUpointer_attribute` in `cuda.h`. Note that
/// `CU_POINTER_ATTRIBUTE_IS_MANAGED` is **8** and
/// `CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL` is **9** (the previous binding
/// used 9/10, which would have queried the wrong attributes — 10 is
/// `IS_LEGACY_CUDA_IPC_CAPABLE`). Values 5–7 (P2P tokens, sync-memops,
/// buffer id) are intentionally not bound.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUpointer_attribute {
    /// Query the CUDA context associated with a pointer.
    Context = 1,
    /// Query the memory type (host / device / unified) of a pointer.
    MemoryType = 2,
    /// Query the device pointer corresponding to a host pointer.
    DevicePointer = 3,
    /// Query the host pointer corresponding to a device pointer.
    HostPointer = 4,
    /// Query whether the memory is managed (unified).
    IsManaged = 8,
    /// Query the device ordinal of the device backing the pointer.
    DeviceOrdinal = 9,
}
191
192// =========================================================================
193// CUlimit — context limit identifiers
194// =========================================================================
195
/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
///
/// Discriminants mirror `CU_LIMIT_*` in `cuda.h` (0–6).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUlimit {
    /// Stack size (bytes) for each GPU thread.
    StackSize = 0,
    /// Size (bytes) of the device-side `printf` FIFO.
    PrintfFifoSize = 1,
    /// Size (bytes) of the heap used by `malloc()` on the device.
    MallocHeapSize = 2,
    /// Maximum nesting depth of a device-runtime grid at which a thread may
    /// synchronise. NOTE(review): deprecated in recent CUDA releases — verify
    /// before exposing in the safe API.
    DevRuntimeSyncDepth = 3,
    /// Maximum number of outstanding device-runtime launches.
    DevRuntimePendingLaunchCount = 4,
    /// L2 cache fetch granularity (bytes, hint only).
    MaxL2FetchGranularity = 5,
    /// Maximum persisting L2 cache size (bytes).
    PersistingL2CacheSize = 6,
}
216
217// =========================================================================
218// CUfunction_attribute — function attribute query keys
219// =========================================================================
220
/// Function attribute identifiers passed to `cuFuncGetAttribute`.
///
/// Discriminants mirror `CU_FUNC_ATTRIBUTE_*` in `cuda.h` (0–9).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUfunction_attribute {
    /// Maximum threads per block for this function.
    MaxThreadsPerBlock = 0,
    /// Statically allocated shared memory used by this function (bytes).
    SharedSizeBytes = 1,
    /// Size of user-allocated constant memory required (bytes).
    ConstSizeBytes = 2,
    /// Size of local memory used by each thread (bytes).
    LocalSizeBytes = 3,
    /// Number of registers used by each thread.
    NumRegs = 4,
    /// PTX virtual architecture version the function was compiled for.
    PtxVersion = 5,
    /// Binary (SASS) architecture version.
    BinaryVersion = 6,
    /// Whether the function was compiled with the `-Xptxas --dlcm=ca`
    /// (cache-in-L1) mode. (The previous "has been cached" wording did not
    /// match the cuda.h semantics.)
    CacheModeCa = 7,
    /// Maximum dynamically allocated shared memory size (bytes);
    /// settable via `cuFuncSetAttribute`.
    MaxDynamicSharedSizeBytes = 8,
    /// Preferred shared memory carve-out, as a percentage of total.
    PreferredSharedMemoryCarveout = 9,
}
248
249// =========================================================================
250// CUresult constants — every documented CUDA Driver API error code
251// =========================================================================
252
/// The API call returned with no errors.
pub const CUDA_SUCCESS: CUresult = 0;

/// One or more parameters passed to the API call are not acceptable.
pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;

/// The API call failed because it was unable to allocate enough memory.
pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;

/// The CUDA driver has not been initialised via `cuInit`.
pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;

/// The CUDA driver is shutting down.
pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;

/// Profiling is disabled for this run (e.g. under an external profiler).
pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;

/// (Deprecated) Profiler not started.
pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;

/// (Deprecated) Profiler already started.
pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;

/// (Deprecated) Profiler already stopped.
pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;

/// Stub library loaded instead of the real driver.
pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;

/// The requested device is unavailable (e.g. in exclusive-process or
/// prohibited compute mode). The original doc here ("device-side assert")
/// described error 710, not 46.
pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;

/// No CUDA-capable device is detected.
pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;

/// The device ordinal supplied is out of range.
pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;

/// The device does not have a valid licence.
pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;

/// The PTX or cubin image is invalid.
pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;

/// The supplied context is not valid.
pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;

/// (Deprecated) Context already current.
pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;

/// A map or register operation has failed.
pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;

/// An unmap or unregister operation has failed.
pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;

/// The specified array is currently mapped.
pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;

/// The resource is already mapped.
pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;

/// There is no kernel image available for execution on the device.
pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;

/// A resource has already been acquired.
pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;

/// The resource is not mapped.
pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;

/// A mapped resource is not available for access as an array.
pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;

/// A mapped resource is not available for access as a pointer.
pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;

/// An uncorrectable ECC error was detected.
pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;

/// The requested `CUlimit` is not supported by the device. (The original
/// "PTX JIT limit" wording did not match the cuda.h meaning.)
pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;

/// The context already has work from another thread bound to it.
pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;

/// Peer access is not supported across the given devices.
pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;

/// The PTX JIT compilation failed or the PTX is invalid.
pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;

/// Invalid OpenGL / DirectX graphics context.
pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;

/// An uncorrectable NVLink error was detected.
pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;

/// The PTX JIT compiler library was not found.
pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;

/// The provided PTX was compiled with an unsupported toolchain version.
pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;

/// PTX JIT compilation was disabled.
pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;

/// Unsupported exec-affinity type.
pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;

/// Unsupported device-side synchronisation on this device.
pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;

/// The requested device kernel source is invalid.
pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;

/// The named file was not found.
pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;

/// A shared-object symbol lookup failed.
pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;

/// The shared-object init function failed.
pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;

/// An OS call failed.
pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;

/// The supplied handle is invalid.
pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;

/// The requested resource is in an illegal state.
pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;

/// An introspection query would discard semantically important information
/// (lossy query).
pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;

/// A named symbol was not found.
pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;

/// The asynchronous operation has not completed yet (not an error).
pub const CUDA_ERROR_NOT_READY: CUresult = 600;

/// An illegal memory address was encountered during kernel execution.
pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;

/// The kernel launch uses too many resources (registers / shared memory).
pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;

/// The kernel launch exceeded the time-out enforced by the driver.
pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;

/// A launch did not occur on a compatible texturing mode.
pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;

/// Peer access already enabled.
pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;

/// Peer access has not been enabled.
pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;

/// The primary context for the device is already initialised.
pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;

/// The context is being (or has been) destroyed.
pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;

/// A device-side assert triggered during kernel execution. (The original
/// "64-bit device assertion" wording was spurious.)
pub const CUDA_ERROR_ASSERT: CUresult = 710;

/// Hardware resources to enable peer access are exhausted.
pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;

/// The host-side memory region is already registered.
pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;

/// The host-side memory region is not registered.
pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;

/// Hardware stack overflow on the device.
pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;

/// Illegal instruction encountered on the device.
pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;

/// Misaligned address on the device.
pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;

/// Instruction accessed memory in an invalid address space.
pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;

/// Invalid program counter on the device.
pub const CUDA_ERROR_INVALID_PC: CUresult = 718;

/// The kernel launch failed (catch-all for device-side faults).
pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;

/// Cooperative launch is too large for the device/kernel.
pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;

/// The API call is not permitted in the current state.
pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;

/// The API call is not supported by the current driver/device combination.
pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;

/// System not ready for CUDA operations.
pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;

/// CUDA/display driver version mismatch.
pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;

/// Forward-compatibility mode is not supported by the visible devices.
/// (The original "old-style context" wording did not match cuda.h.)
pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;

/// MPS connection failed.
pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;

/// MPS RPC failure.
pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;

/// MPS server not ready.
pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;

/// MPS maximum clients reached.
pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;

/// MPS maximum connections reached.
pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;

/// MPS client terminated.
pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;

/// CDP (CUDA dynamic parallelism) not supported.
pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;

/// CDP version mismatch.
pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;

/// Stream capture unsupported for this operation.
pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;

/// The current stream capture sequence was invalidated.
pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;

/// Merging two independent capture sequences is not permitted.
pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;

/// Stream capture unmatched (capture not initiated in this stream).
pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;

/// A capture sequence forked and was not joined before ending.
pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;

/// A cross-stream dependency would violate capture isolation.
pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;

/// Operation would implicitly involve the legacy NULL stream during capture.
pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;

/// The event was captured in a graph and cannot be used this way.
pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;

/// Capture was initiated in a different thread with relaxed mode off.
pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;

/// The wait operation timed out.
pub const CUDA_ERROR_TIMEOUT: CUresult = 909;

/// The graph-exec update failed.
pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;

/// External device error (async error signalled by an external device).
pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;

/// Invalid cluster size for the kernel launch.
pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;

/// The function handle is not loaded.
pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;

/// Invalid resource type for the operation.
pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;

/// Invalid resource configuration.
pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;

/// An unknown internal error occurred.
pub const CUDA_ERROR_UNKNOWN: CUresult = 999;
543
544// =========================================================================
545// CUdevice_attribute — device property query keys
546// =========================================================================
547
/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
///
/// NOTE(review): several discriminants below disagree with the
/// `CU_DEVICE_ATTRIBUTE_*` values in `cuda.h`; each suspect variant carries
/// an inline note with the value I recall from the header. A query with a
/// wrong key silently returns a *different* attribute's value, so this enum
/// should be regenerated from `cuda.h` (e.g. with bindgen) and every flagged
/// value confirmed before use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUdevice_attribute {
    /// Maximum number of threads per block.
    MaxThreadsPerBlock = 1,
    /// Maximum x-dimension of a block.
    MaxBlockDimX = 2,
    /// Maximum y-dimension of a block.
    MaxBlockDimY = 3,
    /// Maximum z-dimension of a block.
    MaxBlockDimZ = 4,
    /// Maximum x-dimension of a grid.
    MaxGridDimX = 5,
    /// Maximum y-dimension of a grid.
    MaxGridDimY = 6,
    /// Maximum z-dimension of a grid.
    MaxGridDimZ = 7,
    /// Maximum shared memory available per block (bytes).
    MaxSharedMemoryPerBlock = 8,
    /// Total amount of constant memory on the device (bytes).
    TotalConstantMemory = 9,
    /// Warp size in threads.
    WarpSize = 10,
    /// Maximum pitch allowed by memory copies (bytes).
    MaxPitch = 11,
    /// Maximum number of 32-bit registers per block.
    MaxRegistersPerBlock = 12,
    /// Peak clock frequency in kHz.
    ClockRate = 13,
    /// Alignment requirement for textures.
    TextureAlignment = 14,
    /// Device can possibly copy memory and execute a kernel concurrently.
    GpuOverlap = 15,
    /// Number of multiprocessors on the device.
    MultiprocessorCount = 16,
    /// Whether there is a run-time limit on kernels.
    KernelExecTimeout = 17,
    /// Device is integrated (shares host memory).
    Integrated = 18,
    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
    CanMapHostMemory = 19,
    /// Compute mode: default, exclusive, prohibited, etc.
    ComputeMode = 20,
    /// Maximum 1D texture width.
    MaxTexture1DWidth = 21,
    /// Maximum 2D texture width.
    MaxTexture2DWidth = 22,
    /// Maximum 2D texture height.
    MaxTexture2DHeight = 23,
    /// Maximum 3D texture width.
    MaxTexture3DWidth = 24,
    /// Maximum 3D texture height.
    MaxTexture3DHeight = 25,
    /// Maximum 3D texture depth.
    MaxTexture3DDepth = 26,
    /// Maximum 2D layered texture width.
    MaxTexture2DLayeredWidth = 27,
    /// Maximum 2D layered texture height.
    MaxTexture2DLayeredHeight = 28,
    /// Maximum layers in a 2D layered texture.
    MaxTexture2DLayeredLayers = 29,
    /// Alignment requirement for surfaces.
    SurfaceAlignment = 30,
    /// Device can execute multiple kernels concurrently.
    ConcurrentKernels = 31,
    /// Device has ECC support enabled.
    EccEnabled = 32,
    /// PCI bus ID of the device.
    PciBusId = 33,
    /// PCI device ID of the device.
    PciDeviceId = 34,
    /// Device is using TCC (Tesla Compute Cluster) driver model.
    TccDriver = 35,
    /// Peak memory clock frequency in kHz.
    MemoryClockRate = 36,
    /// Global memory bus width in bits.
    GlobalMemoryBusWidth = 37,
    /// Size of L2 cache in bytes.
    L2CacheSize = 38,
    /// Maximum resident threads per multiprocessor.
    MaxThreadsPerMultiprocessor = 39,
    /// Number of asynchronous engines.
    AsyncEngineCount = 40,
    /// Device shares a unified address space with the host.
    UnifiedAddressing = 41,
    /// Maximum 1D layered texture width.
    MaxTexture1DLayeredWidth = 42,
    /// Maximum layers in a 1D layered texture.
    MaxTexture1DLayeredLayers = 43,
    /// Maximum 2D gather-texture width.
    /// NOTE(review): cuda.h has `CAN_TEX2D_GATHER` = 44 (deprecated) and
    /// `MAXIMUM_TEXTURE2D_GATHER_WIDTH` = 45 — verify this value.
    MaxTexture2DGatherWidth = 44,
    /// Maximum 2D gather-texture height.
    /// NOTE(review): cuda.h has `MAXIMUM_TEXTURE2D_GATHER_HEIGHT` = 46 — verify.
    MaxTexture2DGatherHeight = 45,
    /// Alternate maximum 3D texture width.
    MaxTexture3DWidthAlt = 47,
    /// Alternate maximum 3D texture height.
    MaxTexture3DHeightAlt = 48,
    /// Alternate maximum 3D texture depth.
    MaxTexture3DDepthAlt = 49,
    /// PCI domain ID.
    PciDomainId = 50,
    /// Texture pitch alignment.
    TexturePitchAlignment = 51,
    /// NOTE(review): cuda.h has no 1D-mipmapped attribute at 52 — value 52
    /// is `MAXIMUM_TEXTURECUBEMAP_WIDTH`, and the 1D mipmapped width is 77
    /// (also bound below at 79). Looks like a mis-transcription; verify and
    /// likely remove this variant.
    MaxTexture1DMipmappedWidth2 = 52,
    /// Maximum width for a cubemap texture.
    /// NOTE(review): cuda.h value is 52 — verify.
    MaxTextureCubemapWidth = 54,
    /// Maximum width for a cubemap layered texture.
    /// NOTE(review): cuda.h value is 53 — verify.
    MaxTextureCubemapLayeredWidth = 55,
    /// Maximum layers in a cubemap layered texture.
    /// NOTE(review): cuda.h value is 54 — verify.
    MaxTextureCubemapLayeredLayers = 56,
    /// Maximum 1D surface width.
    /// NOTE(review): cuda.h value is 55 — verify.
    MaxSurface1DWidth = 57,
    /// Maximum 2D surface width.
    /// NOTE(review): cuda.h value is 56 — verify.
    MaxSurface2DWidth = 58,
    /// Maximum 2D surface height.
    /// NOTE(review): cuda.h value is 57 — verify.
    MaxSurface2DHeight = 59,
    /// Maximum 3D surface width.
    /// NOTE(review): cuda.h value is 58 — verify.
    MaxSurface3DWidth = 60,
    /// Maximum 3D surface height.
    /// NOTE(review): cuda.h value is 59 — verify.
    MaxSurface3DHeight = 61,
    /// Maximum 3D surface depth.
    /// NOTE(review): cuda.h value is 60 — verify.
    MaxSurface3DDepth = 62,
    /// Maximum cubemap surface width.
    /// NOTE(review): cuda.h value is 66 (63 is `MAXIMUM_SURFACE2D_LAYERED_WIDTH`) — verify.
    MaxSurfaceCubemapWidth = 63,
    /// Maximum 1D layered surface width.
    /// NOTE(review): cuda.h value is 61 — verify.
    MaxSurface1DLayeredWidth = 64,
    /// Maximum layers in a 1D layered surface.
    /// NOTE(review): cuda.h value is 62 — verify.
    MaxSurface1DLayeredLayers = 65,
    /// Maximum 2D layered surface width.
    /// NOTE(review): cuda.h value is 63 — verify.
    MaxSurface2DLayeredWidth = 66,
    /// Maximum 2D layered surface height.
    /// NOTE(review): cuda.h value is 64 — verify.
    MaxSurface2DLayeredHeight = 67,
    /// Maximum layers in a 2D layered surface.
    /// NOTE(review): cuda.h value is 65 — verify.
    MaxSurface2DLayeredLayers = 68,
    /// Maximum cubemap layered surface width.
    /// NOTE(review): cuda.h value is 67 — verify.
    MaxSurfaceCubemapLayeredWidth = 69,
    /// Maximum layers in a cubemap layered surface.
    /// NOTE(review): cuda.h value is 68 — verify.
    MaxSurfaceCubemapLayeredLayers = 70,
    /// Maximum 1D linear texture width (deprecated in cuda.h).
    /// NOTE(review): cuda.h value is 69 — verify.
    MaxTexture1DLinearWidth = 71,
    /// Maximum 2D linear texture width.
    /// NOTE(review): cuda.h value is 70 — verify.
    MaxTexture2DLinearWidth = 72,
    /// Maximum 2D linear texture height.
    /// NOTE(review): cuda.h value is 71 — verify.
    MaxTexture2DLinearHeight = 73,
    /// Maximum 2D linear texture pitch (bytes).
    /// NOTE(review): cuda.h value is 72 — verify.
    MaxTexture2DLinearPitch = 74,
    /// Major compute capability version number (matches cuda.h, 75).
    ComputeCapabilityMajor = 75,
    /// Minor compute capability version number (matches cuda.h, 76).
    ComputeCapabilityMinor = 76,
    /// Maximum mipmapped 2D texture width.
    /// NOTE(review): cuda.h value is 73 — verify.
    MaxTexture2DMipmappedWidth = 77,
    /// Maximum mipmapped 2D texture height.
    /// NOTE(review): cuda.h value is 74 — verify.
    MaxTexture2DMipmappedHeight = 78,
    /// Maximum mipmapped 1D texture width.
    /// NOTE(review): cuda.h value is 77 — verify (see also duplicate at 52).
    MaxTexture1DMipmappedWidth = 79,
    /// Device supports stream priorities.
    /// NOTE(review): cuda.h value is 78; 79/80 are `GLOBAL_L1_CACHE_SUPPORTED`
    /// and `LOCAL_L1_CACHE_SUPPORTED`, missing from this binding — verify.
    StreamPrioritiesSupported = 80,
    /// Maximum shared memory per multiprocessor (bytes).
    MaxSharedMemoryPerMultiprocessor = 81,
    /// Maximum registers per multiprocessor.
    MaxRegistersPerMultiprocessor = 82,
    /// Device supports managed (unified) memory.
    ManagedMemory = 83,
    /// Device is on a multi-GPU board.
    IsMultiGpuBoard = 84,
    /// Unique identifier for the multi-GPU board group.
    MultiGpuBoardGroupId = 85,
    /// The host/device link supports native atomic operations.
    HostNativeAtomicSupported = 86,
    /// Ratio of single-to-double precision performance.
    SingleToDoublePrecisionPerfRatio = 87,
    /// Device supports coherent access to pageable memory.
    PageableMemoryAccess = 88,
    /// Device can coherently access managed memory concurrently with the CPU.
    ConcurrentManagedAccess = 89,
    /// Device supports compute preemption.
    ComputePreemptionSupported = 90,
    /// Device can access host-registered memory at the same virtual address
    /// as the CPU.
    CanUseHostPointerForRegisteredMem = 91,
    /// Reserved attribute (CUDA internal, value 92).
    Reserved92 = 92,
    /// Reserved attribute (CUDA internal, value 93).
    Reserved93 = 93,
    /// Reserved attribute (CUDA internal, value 94).
    Reserved94 = 94,
    /// Device supports cooperative kernel launches.
    CooperativeLaunch = 95,
    /// Device supports cooperative launches across multiple GPUs
    /// (deprecated in cuda.h).
    CooperativeMultiDeviceLaunch = 96,
    /// Maximum opt-in shared memory per block (bytes).
    MaxSharedMemoryPerBlockOptin = 97,
    /// Device supports flushing of outstanding remote writes.
    CanFlushRemoteWrites = 98,
    /// Device supports `cuMemHostRegister`.
    HostRegisterSupported = 99,
    /// Device accesses pageable memory via the host's page tables.
    PageableMemoryAccessUsesHostPageTables = 100,
    /// Host can directly access managed memory on the device without migration.
    DirectManagedMemAccessFromHost = 101,
    /// Device supports the virtual memory management APIs (`cuMemCreate` etc.).
    VirtualMemoryManagementSupported = 102,
    /// Device supports exporting memory as a POSIX file descriptor.
    HandleTypePosixFileDescriptorSupported = 103,
    /// Device supports exporting memory as a Win32 NT handle.
    HandleTypeWin32HandleSupported = 104,
    /// Device supports exporting memory as a Win32 KMT handle.
    HandleTypeWin32KmtHandleSupported = 105,
    /// Maximum blocks per multiprocessor.
    MaxBlocksPerMultiprocessor = 106,
    /// Device supports compression-capable memory.
    GenericCompressionSupported = 107,
    /// Maximum persisting L2 cache size (bytes).
    MaxPersistingL2CacheSize = 108,
    /// Maximum access-policy window size for L2 cache.
    MaxAccessPolicyWindowSize = 109,
    /// GPUDirect RDMA works with memory allocated through the VMM APIs.
    GpuDirectRdmaWithCudaVmmSupported = 110,
    /// NOTE(review): no such attribute in cuda.h — 111 is
    /// `RESERVED_SHARED_MEMORY_PER_BLOCK`, and the access-policy window size
    /// is already bound at 109. The original doc text here ("free/total
    /// memory") matched no attribute either. Verify and likely remove.
    AccessPolicyMaxWindowSize = 111,
    /// Shared memory per block reserved by CUDA (bytes).
    /// NOTE(review): cuda.h value is 111 (112 is `SPARSE_CUDA_ARRAY_SUPPORTED`) — verify.
    ReservedSharedMemoryPerBlock = 112,
    /// Device supports timeline semaphore interop.
    /// NOTE(review): cuda.h value is 114 (113 is `READ_ONLY_HOST_REGISTER_SUPPORTED`) — verify.
    TimelineSemaphoreInteropSupported = 113,
    /// Device supports stream-ordered memory pools (`cuMemAllocAsync`).
    MemoryPoolsSupported = 115,
    /// GPUDirect RDMA is supported.
    GpuDirectRdmaSupported = 116,
    /// Bitmask of GPUDirect RDMA flush-writes options.
    GpuDirectRdmaFlushWritesOptions = 117,
    /// GPUDirect RDMA writes-ordering guarantee.
    GpuDirectRdmaWritesOrdering = 118,
    /// Handle types supported by memory-pool-based IPC.
    MemoryPoolSupportedHandleTypes = 119,
    /// Device supports thread-block cluster launch.
    ClusterLaunch = 120,
    /// Deferred-mapping CUDA arrays are supported.
    DeferredMappingCudaArraySupported = 121,
    /// Device supports IPC event handles.
    /// NOTE(review): cuda.h value is 125 — verify.
    IpcEventSupported = 122,
    /// Number of memory-synchronisation domains supported.
    /// NOTE(review): cuda.h value is 126 — verify.
    MemSyncDomainCount = 123,
    /// Device supports tensor-map (TMA) access.
    /// NOTE(review): cuda.h value is 127 — verify.
    TensorMapAccessSupported = 124,
    /// Device supports unified function pointers.
    /// NOTE(review): cuda.h value is 129 — verify.
    UnifiedFunctionPointers = 125,
    /// NUMA configuration of the device.
    /// NOTE(review): cuda.h value is 130 — verify.
    NumaConfig = 127,
    /// NUMA node id of the device's memory.
    /// NOTE(review): cuda.h value is 131 — verify.
    NumaId = 128,
    /// NOTE(review): no attribute with this name exists in cuda.h, and the
    /// original doc comment here was a garbled mix of unrelated sentences.
    /// Verify and likely remove.
    MaxTimelineSemaphoreInteropSupported = 129,
    /// NOTE(review): cuda.h defines `MEM_SYNC_DOMAIN_COUNT` but no
    /// `MEM_SYNC_DOMAIN_SUPPORTED` attribute — verify and likely remove.
    MemSyncDomainSupported = 130,
    /// Device supports the fabric memory handle type.
    /// NOTE(review): the cuda.h name is `HANDLE_TYPE_FABRIC_SUPPORTED`, value
    /// 128 — verify.
    GpuDirectRdmaFabricSupported = 131,
    /// Device supports NVLink multicast (cuda.h value 132 — matches).
    MulticastSupported = 132,
    /// Device is running under MPS (cuda.h value 133 — matches).
    MpsEnabled = 133,
    /// Host NUMA node closest to the device (cuda.h value 134 — matches).
    HostNumaId = 134,
}
817
818// =========================================================================
819// CUjit_option — options for the JIT compiler
820// =========================================================================
821
/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
///
/// Discriminants mirror `CU_JIT_*` in `cuda.h` (0–30). Option values are
/// passed through a parallel `void*` array; several options are outputs
/// that the driver overwrites (noted below).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUjit_option {
    /// Maximum number of registers that a thread may use.
    MaxRegisters = 0,
    /// In: minimum threads per block to target. Out: overwritten with the
    /// number of threads the compiler actually targeted.
    ThreadsPerBlock = 1,
    /// Output: overwritten with the wall-clock time (ms) spent in the
    /// compiler and linker.
    WallTime = 2,
    /// Pointer to a caller-owned buffer in which to print informational
    /// log messages.
    InfoLogBuffer = 3,
    /// In: size (bytes) of the info-log buffer. Out: number of bytes filled.
    InfoLogBufferSizeBytes = 4,
    /// Pointer to a caller-owned buffer in which to print error log messages.
    ErrorLogBuffer = 5,
    /// In: size (bytes) of the error-log buffer. Out: number of bytes filled.
    ErrorLogBufferSizeBytes = 6,
    /// Optimisation level (0-4, default 4).
    OptimizationLevel = 7,
    /// Determines the target based on the current attached context
    /// (takes no option value).
    TargetFromCuContext = 8,
    /// Specific compute target (sm_XX).
    Target = 9,
    /// Fallback strategy when an exact binary match is not found.
    FallbackStrategy = 10,
    /// Specifies whether to generate debug info.
    GenerateDebugInfo = 11,
    /// Generate verbose log messages.
    LogVerbose = 12,
    /// Generate line-number information.
    GenerateLineInfo = 13,
    /// Explicitly specifies the cache usage mode.
    CacheMode = 14,
    /// (Internal) New SM3X option.
    Sm3xOpt = 15,
    /// Fast compile flag.
    FastCompile = 16,
    /// Array of global symbol names for relocation.
    GlobalSymbolNames = 17,
    /// Array of global symbol addresses for relocation.
    GlobalSymbolAddresses = 18,
    /// Number of entries in the global-symbol arrays.
    GlobalSymbolCount = 19,
    /// Enable link-time optimisation.
    Lto = 20,
    /// FTZ (flush denormals to zero) flag.
    Ftz = 21,
    /// Precise division flag.
    PrecDiv = 22,
    /// Precise square-root flag.
    PrecSqrt = 23,
    /// Fused multiply-add flag.
    Fma = 24,
    /// Names of kernels referenced by the LTO unit.
    ReferencedKernelNames = 25,
    /// Number of referenced kernel names.
    ReferencedKernelCount = 26,
    /// Names of variables referenced by the LTO unit.
    ReferencedVariableNames = 27,
    /// Number of referenced variable names.
    ReferencedVariableCount = 28,
    /// Optimise unused device variables away.
    OptimizeUnusedDeviceVariables = 29,
    /// Generate position-independent code.
    PositionIndependentCode = 30,
}
891
892// =========================================================================
893// CUjitInputType — input types for the linker
894// =========================================================================
895
/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
///
/// Mirrors `CUjitInputType_enum` in `cuda.h`. The discriminants are part of
/// the driver ABI and must match the header exactly: `CU_JIT_INPUT_CUBIN`
/// is 0 and the remaining variants follow consecutively. (The previous
/// values here were shifted by one for every variant except `Ptx`, which
/// would make the linker misinterpret non-PTX inputs.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUjitInputType {
    /// Compiled device code (cubin) — `CU_JIT_INPUT_CUBIN`.
    Cubin = 0,
    /// PTX source code — `CU_JIT_INPUT_PTX`.
    Ptx = 1,
    /// Fat binary bundle — `CU_JIT_INPUT_FATBINARY`.
    Fatbin = 2,
    /// Relocatable device object — `CU_JIT_INPUT_OBJECT`.
    Object = 3,
    /// Device code library — `CU_JIT_INPUT_LIBRARY`.
    Library = 4,
    /// High-level intermediate code for link-time optimisation —
    /// `CU_JIT_INPUT_NVVM` (CUDA 11.4+).
    Nvvm = 5,
}
912
913// =========================================================================
914// Stream creation flags
915// =========================================================================
916
/// Stream creation flag: the new stream synchronises implicitly with the
/// legacy NULL stream (default behaviour).
pub const CU_STREAM_DEFAULT: u32 = 0x0;

/// Stream creation flag: the new stream runs independently of the legacy
/// NULL stream (no implicit synchronisation).
pub const CU_STREAM_NON_BLOCKING: u32 = 0x1;
922
923// =========================================================================
924// Stream-ordered memory pool attributes (CUDA 11.2+)
925// =========================================================================
926
/// Reuse policy attribute: a freed block may only be reused once its event
/// dependencies have been satisfied.
pub const CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: u32 = 1;

/// Reuse policy attribute: opportunistic reuse is permitted, with no
/// ordering guarantees.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: u32 = 2;

/// Reuse policy attribute: the driver may insert internal dependencies to
/// enable reuse.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: u32 = 3;

/// Release threshold in bytes — memory is returned to the OS once usage
/// drops below this value.
pub const CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: u32 = 4;

/// Read-only attribute: bytes of memory currently reserved by the pool.
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: u32 = 5;

/// Resettable attribute: high-water mark of reserved memory, in bytes.
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: u32 = 6;

/// Read-only attribute: bytes of pool memory currently in use.
pub const CU_MEMPOOL_ATTR_USED_MEM_CURRENT: u32 = 7;

/// Resettable attribute: high-water mark of used memory, in bytes.
pub const CU_MEMPOOL_ATTR_USED_MEM_HIGH: u32 = 8;
950
951// =========================================================================
952// Event creation flags
953// =========================================================================
954
/// Event creation flag: default behaviour.
pub const CU_EVENT_DEFAULT: u32 = 0x0;

/// Event creation flag: waiting on the event blocks the calling thread
/// instead of spinning.
pub const CU_EVENT_BLOCKING_SYNC: u32 = 0x1;

/// Event creation flag: skip timing data collection (reduces overhead).
pub const CU_EVENT_DISABLE_TIMING: u32 = 0x2;

/// Event creation flag: the event may be shared between processes.
pub const CU_EVENT_INTERPROCESS: u32 = 0x4;
966
967// =========================================================================
968// Memory-attach flags (for managed / mapped memory)
969// =========================================================================
970
/// Attach flag: the memory can be accessed from any stream on any device.
pub const CU_MEM_ATTACH_GLOBAL: u32 = 0x1;

/// Attach flag: the memory starts out accessible only from the host and the
/// allocating stream.
pub const CU_MEM_ATTACH_HOST: u32 = 0x2;

/// Attach flag: the memory starts out accessible from a single stream only.
pub const CU_MEM_ATTACH_SINGLE: u32 = 0x4;
979
980// =========================================================================
981// cuMemHostRegister flags
982// =========================================================================
983
/// Host-register flag: the registration is visible from all CUDA contexts.
pub const CU_MEMHOSTREGISTER_PORTABLE: u32 = 0x01;

/// Host-register flag: map the registered range into the device address
/// space.
pub const CU_MEMHOSTREGISTER_DEVICEMAP: u32 = 0x02;

/// Host-register flag: the pointer refers to I/O memory rather than system
/// RAM.
pub const CU_MEMHOSTREGISTER_IOMEMORY: u32 = 0x04;

/// Host-register flag: the GPU will only ever read from this memory.
pub const CU_MEMHOSTREGISTER_READ_ONLY: u32 = 0x08;
995
996// =========================================================================
997// cuPointerGetAttribute attribute codes
998// =========================================================================
999
/// Query the CUDA context associated with a pointer.
pub const CU_POINTER_ATTRIBUTE_CONTEXT: u32 = 1;

/// Query the memory type (host / device / unified) of a pointer.
pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: u32 = 2;

/// Query the device pointer corresponding to a host pointer.
pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: u32 = 3;

/// Query the host pointer corresponding to a device pointer.
pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: u32 = 4;

/// Query whether the memory is managed (unified).
///
/// Note: `CU_POINTER_ATTRIBUTE_IS_MANAGED` is 8 in `cuda.h` — 7 is
/// `CU_POINTER_ATTRIBUTE_BUFFER_ID`, so the previous value of 7 would have
/// made the driver answer a buffer-id query instead of the managed-memory
/// query.
pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: u32 = 8;
1014
1015// =========================================================================
1016// CU_MEMORYTYPE values (returned by pointer attribute queries)
1017// =========================================================================
1018
/// Memory-type query result: host (system) memory.
pub const CU_MEMORYTYPE_HOST: u32 = 1;

/// Memory-type query result: device (GPU) memory.
pub const CU_MEMORYTYPE_DEVICE: u32 = 2;

/// Memory-type query result: CUDA array memory.
pub const CU_MEMORYTYPE_ARRAY: u32 = 3;

/// Memory-type query result: unified (managed) memory.
pub const CU_MEMORYTYPE_UNIFIED: u32 = 4;
1030
1031// =========================================================================
1032// Context scheduling flags
1033// =========================================================================
1034
/// Scheduling flag: let the driver choose the scheduling mode.
pub const CU_CTX_SCHED_AUTO: u32 = 0x00;

/// Scheduling flag: busy-spin while waiting on the GPU.
pub const CU_CTX_SCHED_SPIN: u32 = 0x01;

/// Scheduling flag: yield the CPU while waiting on the GPU.
pub const CU_CTX_SCHED_YIELD: u32 = 0x02;

/// Scheduling flag: block the calling thread while waiting on the GPU.
pub const CU_CTX_SCHED_BLOCKING_SYNC: u32 = 0x04;

/// Bit mask covering the scheduling flags above.
pub const CU_CTX_SCHED_MASK: u32 = 0x07;

/// Context flag: enable mapped pinned host allocations.
pub const CU_CTX_MAP_HOST: u32 = 0x08;

/// Context flag: keep local-memory allocations alive after kernel launch.
pub const CU_CTX_LMEM_RESIZE_TO_MAX: u32 = 0x10;

/// Context flag: enable GPU coredumps.
pub const CU_CTX_COREDUMP_ENABLE: u32 = 0x20;

/// Context flag: enable user-triggered GPU coredumps.
pub const CU_CTX_USER_COREDUMP_ENABLE: u32 = 0x40;

/// Context flag: synchronous memory operations.
pub const CU_CTX_SYNC_MEMOPS: u32 = 0x80;

/// Bit mask covering every context flag.
pub const CU_CTX_FLAGS_MASK: u32 = 0xFF;
1067
1068// =========================================================================
1069// Function attribute values (used with cuFuncGetAttribute)
1070// =========================================================================
1071
/// Maximum threads per block for this function.
pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: i32 = 0;

/// Statically allocated shared memory used by this function (bytes).
pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: i32 = 1;

/// Size of user-allocated constant memory (bytes).
pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: i32 = 2;

/// Size of local memory used by each thread (bytes).
pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: i32 = 3;

/// Number of registers used by each thread.
pub const CU_FUNC_ATTRIBUTE_NUM_REGS: i32 = 4;

/// PTX virtual architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: i32 = 5;

/// Binary architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: i32 = 6;

/// Whether the function was compiled with global loads cached in L1
/// (`-Xptxas --dlcm=ca`). The previous doc ("whether this function has
/// been cached") misstated what the driver reports for this attribute.
pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: i32 = 7;

/// Maximum dynamic shared memory size (bytes).
pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: i32 = 8;

/// Preferred shared memory carve-out.
pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: i32 = 9;

/// Whether the cluster size must be set before launch.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: i32 = 10;

/// Required cluster width.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: i32 = 11;

/// Required cluster height.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: i32 = 12;

/// Required cluster depth.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: i32 = 13;

/// Non-portable cluster size allowed.
pub const CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: i32 = 14;

/// Required cluster scheduling policy preference.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: i32 = 15;
1119
1120// =========================================================================
1121// Memory advise values
1122// =========================================================================
1123
/// Advice: the data will mostly be read, rarely written.
pub const CU_MEM_ADVISE_SET_READ_MOSTLY: u32 = 1;

/// Advice: clear the read-mostly hint.
pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: u32 = 2;

/// Advice: prefer placing the data on the specified device.
pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: u32 = 3;

/// Advice: clear the preferred-location hint.
pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: u32 = 4;

/// Advice: the specified device will access this data.
pub const CU_MEM_ADVISE_SET_ACCESSED_BY: u32 = 5;

/// Advice: clear the accessed-by hint for the specified device.
pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: u32 = 6;
1141
1142// =========================================================================
1143// Limit values (cuCtxSetLimit / cuCtxGetLimit)
1144// =========================================================================
1145
/// Limit: per-thread GPU stack size.
pub const CU_LIMIT_STACK_SIZE: u32 = 0;

/// Limit: size of the device-side printf FIFO.
pub const CU_LIMIT_PRINTF_FIFO_SIZE: u32 = 1;

/// Limit: size of the heap backing device-side `malloc()`.
pub const CU_LIMIT_MALLOC_HEAP_SIZE: u32 = 2;

/// Limit: maximum nesting depth for device-runtime launches.
pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: u32 = 3;

/// Limit: maximum number of outstanding device-runtime launches.
pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: u32 = 4;

/// Limit: L2 cache fetch granularity.
pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: u32 = 5;

/// Limit: maximum persisting L2 cache size.
pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: u32 = 6;
1166
1167// =========================================================================
1168// Occupancy flags
1169// =========================================================================
1170
/// Occupancy flag: default calculation behaviour.
pub const CU_OCCUPANCY_DEFAULT: u32 = 0x0;

/// Occupancy flag: assume the global caching override is disabled.
pub const CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE: u32 = 0x1;
1176
1177// =========================================================================
1178// cuLaunchKernelEx cluster launch types (CUDA 12.x)
1179// =========================================================================
1180
/// Attribute identifier for `CuLaunchAttribute`.
///
/// Controls which extended kernel launch feature is configured.
/// Used with `cuLaunchKernelEx` (CUDA 12.0+).
///
/// NOTE(review): these discriminants do not match the `CUlaunchAttributeID`
/// values in CUDA 12 `cuda.h` (there, `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`
/// is 4, `CU_LAUNCH_ATTRIBUTE_PRIORITY` is 8, and
/// `CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` is 12). If this enum is
/// passed straight through to the driver — which the `#[repr(u32)]`
/// `CuLaunchAttribute` layout suggests — these values need to be re-checked
/// against the header; if a translation layer remaps them before the FFI
/// call, document that here. TODO confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum CuLaunchAttributeId {
    /// Controls whether shared memory reuse is ignored.
    IgnoreSharedMemoryReuse = 1,
    /// Specifies thread block cluster dimensions (sm_90+).
    ClusterDimension = 2,
    /// Controls cluster scheduling policy preference.
    ClusterSchedulingPolicyPreference = 3,
    /// Enables programmatic stream serialization.
    ProgrammaticStreamSerialization = 4,
    /// Specifies a programmatic completion event.
    ProgrammaticEvent = 5,
    /// Specifies kernel launch priority.
    Priority = 6,
    /// Maps memory synchronization domains.
    MemSyncDomainMap = 7,
    /// Sets memory synchronization domain.
    MemSyncDomain = 8,
    /// Specifies a launch completion event.
    LaunchCompletionEvent = 9,
    /// Configures device-updatable kernel node.
    DeviceUpdatableKernelNode = 10,
}
1209
/// Cluster dimension for thread block clusters (sm_90+).
///
/// Gives the number of thread blocks that make up one cluster along each
/// axis. Stored inside [`CuLaunchAttributeValue`] whenever the paired
/// attribute id is [`CuLaunchAttributeId::ClusterDimension`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(C)]
pub struct CuLaunchAttributeClusterDim {
    /// Blocks per cluster along X.
    pub x: u32,
    /// Blocks per cluster along Y.
    pub y: u32,
    /// Blocks per cluster along Z.
    pub z: u32,
}
1225
/// Value union for `CuLaunchAttribute`.
///
/// # Safety
///
/// This is a C union — callers must only read the field that matches
/// the accompanying [`CuLaunchAttributeId`] discriminant.
/// Padding ensures the union is always 64 bytes, matching the CUDA ABI.
#[repr(C)]
pub union CuLaunchAttributeValue {
    /// Cluster dimension configuration (when id == `ClusterDimension`).
    pub cluster_dim: CuLaunchAttributeClusterDim,
    /// Scalar u32 value (for single-word attributes).
    pub value_u32: u32,
    /// Raw padding that pins the union's *size* at 64 bytes. (A `[u8; 64]`
    /// only contributes 1-byte alignment; the other members determine the
    /// union's alignment.)
    pub pad: [u8; 64],
}
1242
1243// Manual Clone/Copy for the union (derive cannot handle unions with non-Copy
1244// fields, but all union fields here are effectively POD).
1245// `Copy` is declared first so that the `Clone` impl can delegate to it.
1246impl Copy for CuLaunchAttributeValue {}
1247
1248impl Clone for CuLaunchAttributeValue {
1249 fn clone(&self) -> Self {
1250 // Delegate to Copy — canonical approach for Copy types.
1251 *self
1252 }
1253}
1254
/// A single extended kernel launch attribute (id + value pair).
///
/// Used in the `attrs` array of [`CuLaunchConfig`]. The layout mirrors the
/// C `CUlaunchAttribute` struct: a 4-byte id, 4 bytes of explicit padding,
/// then the 64-byte value union.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct CuLaunchAttribute {
    /// Which feature this attribute configures.
    pub id: CuLaunchAttributeId,
    /// Alignment padding (must be zero). Mirrors the explicit `pad` bytes
    /// in the C struct between `id` and `value`.
    pub pad: [u8; 4],
    /// The attribute value — interpret according to `id`.
    pub value: CuLaunchAttributeValue,
}
1268
/// Extended kernel launch configuration for `cuLaunchKernelEx` (CUDA 12.0+).
///
/// Supersedes the individual parameters of `cuLaunchKernel` and adds
/// support for thread block clusters, launch priorities, and other
/// CUDA 12.x features.
///
/// # Example
///
/// ```rust
/// use oxicuda_driver::ffi::{
///     CuLaunchConfig, CuLaunchAttribute, CuLaunchAttributeId,
///     CuLaunchAttributeValue, CuLaunchAttributeClusterDim, CUstream,
/// };
///
/// // Build a cluster-launch config for a 2×1×1 cluster.
/// let cluster_attr = CuLaunchAttribute {
///     id: CuLaunchAttributeId::ClusterDimension,
///     pad: [0u8; 4],
///     value: CuLaunchAttributeValue {
///         cluster_dim: CuLaunchAttributeClusterDim { x: 2, y: 1, z: 1 },
///     },
/// };
/// let _config = CuLaunchConfig {
///     grid_dim_x: 8,
///     grid_dim_y: 1,
///     grid_dim_z: 1,
///     block_dim_x: 256,
///     block_dim_y: 1,
///     block_dim_z: 1,
///     shared_mem_bytes: 0,
///     stream: CUstream::default(),
///     attrs: std::ptr::null(),
///     num_attrs: 0,
/// };
/// ```
#[repr(C)]
pub struct CuLaunchConfig {
    /// Grid dimension in X.
    pub grid_dim_x: u32,
    /// Grid dimension in Y.
    pub grid_dim_y: u32,
    /// Grid dimension in Z.
    pub grid_dim_z: u32,
    /// Block dimension in X (threads per block in X).
    pub block_dim_x: u32,
    /// Block dimension in Y.
    pub block_dim_y: u32,
    /// Block dimension in Z.
    pub block_dim_z: u32,
    /// Dynamic shared memory per block in bytes.
    pub shared_mem_bytes: u32,
    /// Stream to submit the kernel on.
    pub stream: CUstream,
    /// Pointer to an array of `num_attrs` attributes (may be null if zero).
    /// The pointed-to array must stay alive for as long as the driver may
    /// read this config.
    pub attrs: *const CuLaunchAttribute,
    /// Number of entries in `attrs`.
    pub num_attrs: u32,
}

// SAFETY: CuLaunchConfig is a plain data structure mirroring the CUDA ABI.
// The raw pointer `attrs` must be valid for the lifetime of the config, but
// the struct itself is Send + Sync because no interior mutation occurs.
unsafe impl Send for CuLaunchConfig {}
unsafe impl Sync for CuLaunchConfig {}
1333
1334// =========================================================================
1335// CUarray / CUmipmappedArray — opaque CUDA array handles
1336// =========================================================================
1337
define_handle! {
    /// Opaque handle to a CUDA array (1-D, 2-D, or 3-D texture memory).
    ///
    /// Allocated by `cuArrayCreate_v2` / `cuArray3DCreate_v2` and freed by
    /// `cuArrayDestroy`. Arrays can be bound to texture objects via
    /// [`CUDA_RESOURCE_DESC`] (see the `array` variant of the resource
    /// union).
    CUarray
}

define_handle! {
    /// Opaque handle to a CUDA mipmapped array (mip-mapped texture memory).
    ///
    /// Allocated by `cuMipmappedArrayCreate` and freed by
    /// `cuMipmappedArrayDestroy`.
    CUmipmappedArray
}
1354
1355// =========================================================================
1356// CUarray_format — channel element format for CUDA arrays
1357// =========================================================================
1358
/// Element format for CUDA arrays. Mirrors `CUarray_format_enum` in the
/// CUDA driver API header; discriminants are the header's `CU_AD_FORMAT_*`
/// values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUarray_format {
    /// Unsigned 8-bit integer per channel.
    UnsignedInt8 = 0x01,
    /// Unsigned 16-bit integer per channel.
    UnsignedInt16 = 0x02,
    /// Unsigned 32-bit integer per channel.
    UnsignedInt32 = 0x03,
    /// Signed 8-bit integer per channel.
    SignedInt8 = 0x08,
    /// Signed 16-bit integer per channel.
    SignedInt16 = 0x09,
    /// Signed 32-bit integer per channel.
    SignedInt32 = 0x0a,
    /// IEEE 754 half-precision (16-bit) float per channel.
    Half = 0x10,
    /// IEEE 754 single-precision (32-bit) float per channel.
    Float = 0x20,
    /// NV12 planar YUV (special two-plane layout).
    Nv12 = 0xb0,
    /// Unsigned normalized 8-bit integer, 1 channel.
    UnormInt8X1 = 0xc0,
    /// Unsigned normalized 8-bit integer, 2 channels.
    UnormInt8X2 = 0xc1,
    /// Unsigned normalized 8-bit integer, 4 channels.
    UnormInt8X4 = 0xc2,
    /// Unsigned normalized 16-bit integer, 1 channel.
    UnormInt16X1 = 0xc3,
    /// Unsigned normalized 16-bit integer, 2 channels.
    UnormInt16X2 = 0xc4,
    /// Unsigned normalized 16-bit integer, 4 channels.
    UnormInt16X4 = 0xc5,
    /// Signed normalized 8-bit integer, 1 channel.
    SnormInt8X1 = 0xc6,
    /// Signed normalized 8-bit integer, 2 channels.
    SnormInt8X2 = 0xc7,
    /// Signed normalized 8-bit integer, 4 channels.
    SnormInt8X4 = 0xc8,
    /// Signed normalized 16-bit integer, 1 channel.
    SnormInt16X1 = 0xc9,
    /// Signed normalized 16-bit integer, 2 channels.
    SnormInt16X2 = 0xca,
    /// Signed normalized 16-bit integer, 4 channels.
    SnormInt16X4 = 0xcb,
    /// Block-compressed BC1 (DXT1), unsigned.
    Bc1Unorm = 0x91,
    /// Block-compressed BC1 (DXT1), unsigned, sRGB.
    Bc1UnormSrgb = 0x92,
    /// Block-compressed BC2 (DXT3), unsigned.
    Bc2Unorm = 0x93,
    /// Block-compressed BC2 (DXT3), unsigned, sRGB.
    Bc2UnormSrgb = 0x94,
    /// Block-compressed BC3 (DXT5), unsigned.
    Bc3Unorm = 0x95,
    /// Block-compressed BC3 (DXT5), unsigned, sRGB.
    Bc3UnormSrgb = 0x96,
    /// Block-compressed BC4, unsigned.
    Bc4Unorm = 0x97,
    /// Block-compressed BC4, signed.
    Bc4Snorm = 0x98,
    /// Block-compressed BC5, unsigned.
    Bc5Unorm = 0x99,
    /// Block-compressed BC5, signed.
    Bc5Snorm = 0x9a,
    /// Block-compressed BC6H, unsigned 16-bit float.
    Bc6hUf16 = 0x9b,
    /// Block-compressed BC6H, signed 16-bit float.
    Bc6hSf16 = 0x9c,
    /// Block-compressed BC7, unsigned.
    Bc7Unorm = 0x9d,
    /// Block-compressed BC7, unsigned, sRGB.
    Bc7UnormSrgb = 0x9e,
}
1437
1438// =========================================================================
1439// CUresourcetype — resource type for texture/surface objects
1440// =========================================================================
1441
/// Resource type discriminant for [`CUDA_RESOURCE_DESC`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUresourcetype {
    /// Backed by a CUDA array.
    Array = 0x00,
    /// Backed by a CUDA mipmapped array.
    MipmappedArray = 0x01,
    /// Backed by 1-D linear device memory (point sampling only).
    Linear = 0x02,
    /// Backed by 2-D pitched linear device memory.
    Pitch2d = 0x03,
}
1456
1457// =========================================================================
1458// CUaddress_mode — texture coordinate wrapping mode
1459// =========================================================================
1460
/// Texture coordinate address-wrap mode for [`CUDA_TEXTURE_DESC`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum CUaddress_mode {
    /// Tile: out-of-range coordinates wrap back into [0, dim).
    Wrap = 0,
    /// Clamp: out-of-range coordinates snap to [0, dim-1].
    Clamp = 1,
    /// Mirror: coordinates reflect across the array boundaries.
    Mirror = 2,
    /// Border: out-of-range coordinates sample the border color.
    Border = 3,
}
1475
1476// =========================================================================
1477// CUfilter_mode — texture / mipmap filtering mode
1478// =========================================================================
1479
/// Texture / mipmap sampling filter mode for [`CUDA_TEXTURE_DESC`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[allow(non_camel_case_types)]
pub enum CUfilter_mode {
    /// Point sampling (nearest neighbor).
    Point = 0,
    /// Linear (bilinear) filtering.
    Linear = 1,
}
1490
1491// =========================================================================
1492// CUresourceViewFormat — re-interpretation format for resource views
1493// =========================================================================
1494
/// Format used to re-interpret a CUDA array in a resource view.
///
/// Mirrors `CUresourceViewFormat_enum` in the CUDA driver API header.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUresourceViewFormat {
    /// No re-interpretation — keep the array's native format.
    None = 0x00,
    /// View as 1 channel of unsigned 8-bit integers.
    Uint1x8 = 0x01,
    /// View as 2 channels of unsigned 8-bit integers.
    Uint2x8 = 0x02,
    /// View as 4 channels of unsigned 8-bit integers.
    Uint4x8 = 0x03,
    /// View as 1 channel of signed 8-bit integers.
    Sint1x8 = 0x04,
    /// View as 2 channels of signed 8-bit integers.
    Sint2x8 = 0x05,
    /// View as 4 channels of signed 8-bit integers.
    Sint4x8 = 0x06,
    /// View as 1 channel of unsigned 16-bit integers.
    Uint1x16 = 0x07,
    /// View as 2 channels of unsigned 16-bit integers.
    Uint2x16 = 0x08,
    /// View as 4 channels of unsigned 16-bit integers.
    Uint4x16 = 0x09,
    /// View as 1 channel of signed 16-bit integers.
    Sint1x16 = 0x0a,
    /// View as 2 channels of signed 16-bit integers.
    Sint2x16 = 0x0b,
    /// View as 4 channels of signed 16-bit integers.
    Sint4x16 = 0x0c,
    /// View as 1 channel of unsigned 32-bit integers.
    Uint1x32 = 0x0d,
    /// View as 2 channels of unsigned 32-bit integers.
    Uint2x32 = 0x0e,
    /// View as 4 channels of unsigned 32-bit integers.
    Uint4x32 = 0x0f,
    /// View as 1 channel of signed 32-bit integers.
    Sint1x32 = 0x10,
    /// View as 2 channels of signed 32-bit integers.
    Sint2x32 = 0x11,
    /// View as 4 channels of signed 32-bit integers.
    Sint4x32 = 0x12,
    /// View as 1 channel of 16-bit floats.
    Float1x16 = 0x13,
    /// View as 2 channels of 16-bit floats.
    Float2x16 = 0x14,
    /// View as 4 channels of 16-bit floats.
    Float4x16 = 0x15,
    /// View as 1 channel of 32-bit floats.
    Float1x32 = 0x16,
    /// View as 2 channels of 32-bit floats.
    Float2x32 = 0x17,
    /// View as 4 channels of 32-bit floats.
    Float4x32 = 0x18,
    /// View as unsigned block-compressed BC1.
    UnsignedBc1 = 0x19,
    /// View as unsigned block-compressed BC2.
    UnsignedBc2 = 0x1a,
    /// View as unsigned block-compressed BC3.
    UnsignedBc3 = 0x1b,
    /// View as unsigned block-compressed BC4.
    UnsignedBc4 = 0x1c,
    /// View as signed block-compressed BC4.
    SignedBc4 = 0x1d,
    /// View as unsigned block-compressed BC5.
    UnsignedBc5 = 0x1e,
    /// View as signed block-compressed BC5.
    SignedBc5 = 0x1f,
    /// View as unsigned half-float block-compressed BC6H.
    UnsignedBc6h = 0x20,
    /// View as signed half-float block-compressed BC6H.
    SignedBc6h = 0x21,
    /// View as unsigned block-compressed BC7.
    UnsignedBc7 = 0x22,
    /// View as NV12 planar YUV.
    Nv12 = 0x23,
}
1575
1576// =========================================================================
1577// CUDA_ARRAY_DESCRIPTOR — descriptor for 1-D and 2-D CUDA arrays
1578// =========================================================================
1579
/// Descriptor passed to `cuArrayCreate_v2` / `cuArrayGetDescriptor_v2`.
///
/// Mirrors `CUDA_ARRAY_DESCRIPTOR_v2` in the CUDA driver API. Field order
/// matches the C struct (`Width`, `Height`, `Format`, `NumChannels`) and
/// must not be changed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUDA_ARRAY_DESCRIPTOR {
    /// Width of the array in elements.
    pub width: usize,
    /// Height of the array in elements (0 for 1-D arrays).
    pub height: usize,
    /// Element format (data type of each channel).
    pub format: CUarray_format,
    /// Number of channels (1, 2, or 4).
    pub num_channels: u32,
}
1595
1596// =========================================================================
1597// CUDA_ARRAY3D_DESCRIPTOR — descriptor for 3-D CUDA arrays
1598// =========================================================================
1599
/// Descriptor passed to `cuArray3DCreate_v2` / `cuArray3DGetDescriptor_v2`.
///
/// Mirrors `CUDA_ARRAY3D_DESCRIPTOR_v2` in the CUDA driver API. The `flags`
/// field accepts constants such as `CUDA_ARRAY3D_LAYERED` (0x01),
/// `CUDA_ARRAY3D_SURFACE_LDST` (0x02), `CUDA_ARRAY3D_CUBEMAP` (0x04), and
/// `CUDA_ARRAY3D_TEXTURE_GATHER` (0x08). Field order matches the C struct
/// and must not be changed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUDA_ARRAY3D_DESCRIPTOR {
    /// Width of the array in elements.
    pub width: usize,
    /// Height of the array in elements (0 for 1-D arrays).
    pub height: usize,
    /// Depth of the array in elements (0 for 1-D and 2-D arrays).
    pub depth: usize,
    /// Element format.
    pub format: CUarray_format,
    /// Number of channels (1, 2, or 4).
    pub num_channels: u32,
    /// Creation flags (see [`CUDA_ARRAY3D_LAYERED`] etc.).
    pub flags: u32,
}
1622
/// `CUDA_ARRAY3D_LAYERED` — create a layered CUDA array.
pub const CUDA_ARRAY3D_LAYERED: u32 = 0x01;
/// `CUDA_ARRAY3D_SURFACE_LDST` — the array may serve as a surface
/// load/store target.
pub const CUDA_ARRAY3D_SURFACE_LDST: u32 = 0x02;
/// `CUDA_ARRAY3D_CUBEMAP` — create a cubemap array.
pub const CUDA_ARRAY3D_CUBEMAP: u32 = 0x04;
/// `CUDA_ARRAY3D_TEXTURE_GATHER` — the array may be used with
/// `cudaTextureGather`.
pub const CUDA_ARRAY3D_TEXTURE_GATHER: u32 = 0x08;
1631
1632// =========================================================================
1633// CUDA_RESOURCE_DESC — resource descriptor union for tex/surf objects
1634// =========================================================================
1635
/// Inner data for an `Array` resource (variant of [`CudaResourceDescRes`]).
///
/// Corresponds to `res.array` in the C `CUDA_RESOURCE_DESC` union.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescArray {
    /// CUDA array handle.
    pub h_array: CUarray,
}
1643
/// Inner data for a `MipmappedArray` resource.
///
/// Corresponds to `res.mipmap` in the C `CUDA_RESOURCE_DESC` union.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescMipmap {
    /// Mipmapped array handle.
    pub h_mipmapped_array: CUmipmappedArray,
}
1651
/// Inner data for a `Linear` (1-D linear memory) resource.
///
/// Corresponds to `res.linear` in the C `CUDA_RESOURCE_DESC` union; field
/// order matches the C layout and must not be changed.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescLinear {
    /// Device pointer to the linear region.
    pub dev_ptr: CUdeviceptr,
    /// Channel element format.
    pub format: CUarray_format,
    /// Number of channels.
    pub num_channels: u32,
    /// Total size in bytes.
    pub size_in_bytes: usize,
}
1665
/// Inner data for a `Pitch2D` (2-D pitched linear memory) resource.
///
/// Corresponds to `res.pitch2D` in the C `CUDA_RESOURCE_DESC` union; field
/// order matches the C layout and must not be changed.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CudaResourceDescPitch2d {
    /// Device pointer to the pitched region (first row).
    pub dev_ptr: CUdeviceptr,
    /// Channel element format.
    pub format: CUarray_format,
    /// Number of channels.
    pub num_channels: u32,
    /// Width of the array in elements.
    pub width_in_elements: usize,
    /// Height of the array in elements.
    pub height: usize,
    /// Row pitch in bytes (stride between rows).
    pub pitch_in_bytes: usize,
}
1683
/// Union of resource descriptors for [`CUDA_RESOURCE_DESC`].
///
/// # Safety
///
/// Callers must only read the field whose discriminant matches the
/// `res_type` field of the enclosing [`CUDA_RESOURCE_DESC`].
#[repr(C)]
pub union CudaResourceDescRes {
    /// Array resource.
    pub array: CudaResourceDescArray,
    /// Mipmapped array resource.
    pub mipmap: CudaResourceDescMipmap,
    /// 1-D linear memory resource.
    pub linear: CudaResourceDescLinear,
    /// 2-D pitched linear memory resource.
    pub pitch2d: CudaResourceDescPitch2d,
    /// Padding: pins the union's size at 128 bytes (32 × i32), matching the
    /// `int reserved[32]` member of the C union.
    pub reserved: [i32; 32],
}
1703
/// Resource descriptor passed to `cuTexObjectCreate` / `cuSurfObjectCreate`.
///
/// Mirrors `CUDA_RESOURCE_DESC` in the CUDA driver API header (type tag,
/// payload union, trailing flags — in that order).
#[repr(C)]
pub struct CUDA_RESOURCE_DESC {
    /// Identifies which union field inside `res` is valid.
    pub res_type: CUresourcetype,
    /// Resource payload — interpret via `res_type`.
    pub res: CudaResourceDescRes,
    /// Reserved flags (must be zero).
    pub flags: u32,
}
1716
1717// =========================================================================
1718// CUDA_TEXTURE_DESC — texture object sampling parameters
1719// =========================================================================
1720
/// Texture object descriptor passed to `cuTexObjectCreate`.
///
/// Mirrors `CUDA_TEXTURE_DESC` in the CUDA driver API. All fields that the
/// caller does not set explicitly should be zeroed.
///
/// # Layout
///
/// The struct is `#[repr(C)]` and ends with 48 bytes of reserved padding
/// (`[i32; 12]`) so that it matches the binary ABI expected by the driver.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CUDA_TEXTURE_DESC {
    /// Address mode for each coordinate dimension (`[U, V, W]`).
    pub address_mode: [CUaddress_mode; 3],
    /// Texture filter mode (point or linear).
    pub filter_mode: CUfilter_mode,
    /// Bitwise OR of the `CU_TRSF_*` flags: `CU_TRSF_READ_AS_INTEGER` (0x01),
    /// `CU_TRSF_NORMALIZED_COORDINATES` (0x02), `CU_TRSF_SRGB` (0x10), and
    /// `CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION` (0x20). (The last two are
    /// bits 4 and 5 — a previous comment mislabelled them as bits 2 and 3.)
    pub flags: u32,
    /// Maximum anisotropy ratio (1–16; 1 disables anisotropy).
    pub max_anisotropy: u32,
    /// Mipmap filter mode.
    pub mipmap_filter_mode: CUfilter_mode,
    /// Mipmap level-of-detail bias.
    pub mipmap_level_bias: f32,
    /// Minimum mipmap LOD clamp value.
    pub min_mipmap_level_clamp: f32,
    /// Maximum mipmap LOD clamp value.
    pub max_mipmap_level_clamp: f32,
    /// Border color (RGBA, applied when address mode is `Border`).
    pub border_color: [f32; 4],
    /// Reserved: must be zero.
    pub reserved: [i32; 12],
}
1755
/// Flag (bit 0): texture reads return raw integers (no type conversion).
pub const CU_TRSF_READ_AS_INTEGER: u32 = 1 << 0;
/// Flag (bit 1): texture coordinates are normalized to [0, 1).
pub const CU_TRSF_NORMALIZED_COORDINATES: u32 = 1 << 1;
/// Flag (bit 4): sRGB gamma encoding is applied during sampling.
pub const CU_TRSF_SRGB: u32 = 1 << 4;
/// Flag (bit 5): disable hardware trilinear optimisation.
pub const CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION: u32 = 1 << 5;
1764
1765// =========================================================================
1766// CUDA_RESOURCE_VIEW_DESC — optional re-interpretation of array resources
1767// =========================================================================
1768
/// Optional resource view descriptor for `cuTexObjectCreate`.
///
/// Allows the caller to specify a sub-region, a different channel
/// interpretation format, or a mipmap range for a [`CUDA_RESOURCE_DESC`] that
/// wraps a CUDA array. Pass a null pointer to `cuTexObjectCreate` to skip the
/// view override.
///
/// Mirrors `CUDA_RESOURCE_VIEW_DESC` in the CUDA driver API.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct CUDA_RESOURCE_VIEW_DESC {
    /// Format to use for the resource view (re-interpretation of the
    /// underlying resource's channel format).
    pub format: CUresourceViewFormat,
    /// Width of the view in elements (`size_t` in the C header).
    pub width: usize,
    /// Height of the view in elements (`size_t` in the C header).
    pub height: usize,
    /// Depth of the view in elements (`size_t` in the C header).
    pub depth: usize,
    /// First mipmap level included in the view (inclusive).
    pub first_mipmap_level: u32,
    /// Last mipmap level included in the view (inclusive).
    pub last_mipmap_level: u32,
    /// First array layer included in a layered resource (inclusive).
    pub first_layer: u32,
    /// Last array layer included in a layered resource (inclusive).
    pub last_layer: u32,
    /// Reserved: must be zero.
    pub reserved: [u32; 16],
}
1799
1800// =========================================================================
1801// Tests
1802// =========================================================================
1803
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cuda_success_is_zero() {
        assert_eq!(CUDA_SUCCESS, 0);
    }

    #[test]
    fn test_opaque_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUcontext>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmodule>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUstream>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUevent>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUfunction>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmemoryPool>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_handle_default_is_null() {
        assert!(CUcontext::default().is_null());
        assert!(CUmodule::default().is_null());
        assert!(CUfunction::default().is_null());
        assert!(CUstream::default().is_null());
        assert!(CUevent::default().is_null());
        assert!(CUmemoryPool::default().is_null());
    }

    #[test]
    fn test_device_attribute_repr() {
        // Original variants
        assert_eq!(CUdevice_attribute::MaxThreadsPerBlock as i32, 1);
        assert_eq!(CUdevice_attribute::WarpSize as i32, 10);
        assert_eq!(CUdevice_attribute::MultiprocessorCount as i32, 16);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMajor as i32, 75);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMinor as i32, 76);
        assert_eq!(CUdevice_attribute::MaxBlocksPerMultiprocessor as i32, 106);
        assert_eq!(CUdevice_attribute::L2CacheSize as i32, 38);
        assert_eq!(
            CUdevice_attribute::MaxSharedMemoryPerMultiprocessor as i32,
            81
        );
        assert_eq!(CUdevice_attribute::ManagedMemory as i32, 83);

        // New variants
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherWidth as i32, 44);
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherHeight as i32, 45);
        assert_eq!(CUdevice_attribute::MaxTexture3DWidthAlt as i32, 47);
        assert_eq!(CUdevice_attribute::MaxTexture3DHeightAlt as i32, 48);
        assert_eq!(CUdevice_attribute::MaxTexture3DDepthAlt as i32, 49);
        assert_eq!(CUdevice_attribute::MaxTexture1DMipmappedWidth2 as i32, 52);
        assert_eq!(CUdevice_attribute::Reserved92 as i32, 92);
        assert_eq!(CUdevice_attribute::Reserved93 as i32, 93);
        assert_eq!(CUdevice_attribute::Reserved94 as i32, 94);
        assert_eq!(
            CUdevice_attribute::VirtualMemoryManagementSupported as i32,
            102
        );
        assert_eq!(
            CUdevice_attribute::HandleTypePosixFileDescriptorSupported as i32,
            103
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32HandleSupported as i32,
            104
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32KmtHandleSupported as i32,
            105
        );
        assert_eq!(CUdevice_attribute::AccessPolicyMaxWindowSize as i32, 111);
        assert_eq!(CUdevice_attribute::ReservedSharedMemoryPerBlock as i32, 112);
        assert_eq!(
            CUdevice_attribute::TimelineSemaphoreInteropSupported as i32,
            113
        );
        assert_eq!(CUdevice_attribute::MemoryPoolsSupported as i32, 115);
        assert_eq!(CUdevice_attribute::ClusterLaunch as i32, 120);
        assert_eq!(CUdevice_attribute::UnifiedFunctionPointers as i32, 125);
        assert_eq!(
            CUdevice_attribute::MaxTimelineSemaphoreInteropSupported as i32,
            129
        );
        assert_eq!(CUdevice_attribute::MemSyncDomainSupported as i32, 130);
        assert_eq!(CUdevice_attribute::GpuDirectRdmaFabricSupported as i32, 131);
    }

    #[test]
    fn test_jit_option_repr() {
        assert_eq!(CUjit_option::MaxRegisters as u32, 0);
        assert_eq!(CUjit_option::ThreadsPerBlock as u32, 1);
        assert_eq!(CUjit_option::WallTime as u32, 2);
        assert_eq!(CUjit_option::InfoLogBuffer as u32, 3);
        assert_eq!(CUjit_option::InfoLogBufferSizeBytes as u32, 4);
        assert_eq!(CUjit_option::ErrorLogBuffer as u32, 5);
        assert_eq!(CUjit_option::ErrorLogBufferSizeBytes as u32, 6);
        assert_eq!(CUjit_option::OptimizationLevel as u32, 7);
        assert_eq!(CUjit_option::Target as u32, 9);
        assert_eq!(CUjit_option::FallbackStrategy as u32, 10);
    }

    #[test]
    fn test_stream_and_event_flags() {
        assert_eq!(CU_STREAM_DEFAULT, 0);
        assert_eq!(CU_STREAM_NON_BLOCKING, 1);
        assert_eq!(CU_EVENT_DEFAULT, 0);
        assert_eq!(CU_EVENT_BLOCKING_SYNC, 1);
        assert_eq!(CU_EVENT_DISABLE_TIMING, 2);
        assert_eq!(CU_EVENT_INTERPROCESS, 4);
    }

    #[test]
    fn test_context_scheduling_flags() {
        assert_eq!(CU_CTX_SCHED_AUTO, 0);
        assert_eq!(CU_CTX_SCHED_SPIN, 1);
        assert_eq!(CU_CTX_SCHED_YIELD, 2);
        assert_eq!(CU_CTX_SCHED_BLOCKING_SYNC, 4);
    }

    #[test]
    fn test_mem_attach_flags() {
        assert_eq!(CU_MEM_ATTACH_GLOBAL, 1);
        assert_eq!(CU_MEM_ATTACH_HOST, 2);
        assert_eq!(CU_MEM_ATTACH_SINGLE, 4);
    }

    #[test]
    #[allow(clippy::assertions_on_constants)]
    fn test_error_code_ranges() {
        // Basic errors: 1-8
        assert!(CUDA_ERROR_INVALID_VALUE < 10);
        // Device errors: 100-102
        assert!((100..=102).contains(&CUDA_ERROR_NO_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_INVALID_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_DEVICE_NOT_LICENSED));
        // Image/context errors: 200+
        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
        // Launch errors: 700+
        assert!(CUDA_ERROR_LAUNCH_FAILED >= 700);
        assert!(CUDA_ERROR_ILLEGAL_ADDRESS >= 700);
        assert!(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES >= 700);
        // Stream capture errors: 900+
        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
        // Unknown is 999
        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
    }

    #[test]
    fn test_func_attribute_constants() {
        assert_eq!(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 0);
        assert_eq!(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, 1);
        assert_eq!(CU_FUNC_ATTRIBUTE_NUM_REGS, 4);
    }

    #[test]
    fn test_limit_constants() {
        assert_eq!(CU_LIMIT_STACK_SIZE, 0);
        assert_eq!(CU_LIMIT_PRINTF_FIFO_SIZE, 1);
        assert_eq!(CU_LIMIT_MALLOC_HEAP_SIZE, 2);
    }

    #[test]
    fn test_memory_type_constants() {
        assert_eq!(CU_MEMORYTYPE_HOST, 1);
        assert_eq!(CU_MEMORYTYPE_DEVICE, 2);
        assert_eq!(CU_MEMORYTYPE_ARRAY, 3);
        assert_eq!(CU_MEMORYTYPE_UNIFIED, 4);
    }

    #[test]
    fn test_handle_debug_format() {
        let ctx = CUcontext::default();
        let debug_str = format!("{ctx:?}");
        assert!(debug_str.starts_with("CUcontext("));
    }

    #[test]
    fn test_handle_equality() {
        let a = CUcontext::default();
        let b = CUcontext::default();
        assert_eq!(a, b);
    }

    #[test]
    fn test_new_handle_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUtexref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUtexObject>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfObject>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_new_handle_defaults_are_null() {
        assert!(CUtexref::default().is_null());
        assert!(CUsurfref::default().is_null());
        assert!(CUtexObject::default().is_null());
        assert!(CUsurfObject::default().is_null());
    }

    #[test]
    fn test_memory_type_enum() {
        assert_eq!(CUmemorytype::Host as u32, 1);
        assert_eq!(CUmemorytype::Device as u32, 2);
        assert_eq!(CUmemorytype::Array as u32, 3);
        assert_eq!(CUmemorytype::Unified as u32, 4);
    }

    #[test]
    fn test_pointer_attribute_enum() {
        assert_eq!(CUpointer_attribute::Context as u32, 1);
        assert_eq!(CUpointer_attribute::MemoryType as u32, 2);
        assert_eq!(CUpointer_attribute::DevicePointer as u32, 3);
        assert_eq!(CUpointer_attribute::HostPointer as u32, 4);
        assert_eq!(CUpointer_attribute::IsManaged as u32, 9);
        assert_eq!(CUpointer_attribute::DeviceOrdinal as u32, 10);
    }

    #[test]
    fn test_limit_enum() {
        assert_eq!(CUlimit::StackSize as u32, 0);
        assert_eq!(CUlimit::PrintfFifoSize as u32, 1);
        assert_eq!(CUlimit::MallocHeapSize as u32, 2);
        assert_eq!(CUlimit::DevRuntimeSyncDepth as u32, 3);
        assert_eq!(CUlimit::DevRuntimePendingLaunchCount as u32, 4);
        assert_eq!(CUlimit::MaxL2FetchGranularity as u32, 5);
        assert_eq!(CUlimit::PersistingL2CacheSize as u32, 6);
    }

    #[test]
    fn test_function_attribute_enum() {
        assert_eq!(CUfunction_attribute::MaxThreadsPerBlock as i32, 0);
        assert_eq!(CUfunction_attribute::SharedSizeBytes as i32, 1);
        assert_eq!(CUfunction_attribute::NumRegs as i32, 4);
        assert_eq!(CUfunction_attribute::PtxVersion as i32, 5);
        assert_eq!(CUfunction_attribute::BinaryVersion as i32, 6);
        assert_eq!(CUfunction_attribute::MaxDynamicSharedSizeBytes as i32, 8);
        assert_eq!(
            CUfunction_attribute::PreferredSharedMemoryCarveout as i32,
            9
        );
    }

    /// The `CU_TRSF_*` texture-object flags must match the values in `cuda.h`
    /// and occupy distinct bits so they can be OR-ed into
    /// `CUDA_TEXTURE_DESC::flags`.
    #[test]
    fn test_texture_flag_constants() {
        assert_eq!(CU_TRSF_READ_AS_INTEGER, 0x01);
        assert_eq!(CU_TRSF_NORMALIZED_COORDINATES, 0x02);
        assert_eq!(CU_TRSF_SRGB, 0x10);
        assert_eq!(CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, 0x20);
        // Distinct bits — combined mask is the exact sum.
        assert_eq!(
            CU_TRSF_READ_AS_INTEGER
                | CU_TRSF_NORMALIZED_COORDINATES
                | CU_TRSF_SRGB
                | CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION,
            0x33
        );
    }

    /// `CudaResourceDescRes::reserved` is `[i32; 32]`, so the union must span
    /// at least the 128 bytes its documentation promises for ABI parity.
    #[test]
    fn test_resource_desc_union_padding() {
        assert!(std::mem::size_of::<CudaResourceDescRes>() >= 128);
    }
}