oxicuda_driver/ffi.rs
1//! Raw CUDA Driver API FFI types, constants, and enums.
2//!
3//! This module provides the low-level type definitions that mirror the CUDA Driver API
4//! (`cuda.h`). No functions are defined here — only types, opaque pointer aliases,
5//! result-code constants, and `#[repr]` enums used by the dynamically loaded driver
6//! entry points.
7//!
8//! # Safety
9//!
10//! All pointer types in this module are raw pointers intended for FFI use.
11//! They must only be used through the safe wrappers provided by higher-level
12//! modules in `oxicuda-driver`.
13
14use std::ffi::c_void;
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// Core scalar type aliases
19// ---------------------------------------------------------------------------
20
/// Return code from every CUDA Driver API call.
///
/// A value of `0` (`CUDA_SUCCESS`) indicates success; any other value is an
/// error code. See the `CUDA_*` constants below for the full catalogue.
///
/// Kept as a plain `u32` (rather than an enum) so that codes introduced by
/// newer drivers can still be represented without UB.
pub type CUresult = u32;

/// Ordinal identifier for a CUDA-capable device (0-based).
///
/// Mirrors `CUdevice`, which is a plain `int` in `cuda.h`.
pub type CUdevice = i32;

/// Device-side pointer (64-bit address in GPU virtual memory).
///
/// Mirrors `CUdeviceptr` (`unsigned long long` in `cuda.h`). Not
/// dereferenceable from host code; only meaningful to driver API calls.
pub type CUdeviceptr = u64;
32
33// ---------------------------------------------------------------------------
34// Opaque handle helpers
35// ---------------------------------------------------------------------------
36
/// Declares an FFI-safe, pointer-sized opaque handle type.
///
/// Every generated type is a `#[repr(transparent)]` wrapper over a raw
/// `*mut c_void`: it is `Copy`, hashable, defaults to a null handle, and
/// renders as `Name(0xADDR)` in debug output.
macro_rules! define_handle {
    ($(#[$attr:meta])* $handle:ident) => {
        $(#[$attr])*
        #[repr(transparent)]
        #[derive(Copy, Clone, Eq, PartialEq, Hash)]
        pub struct $handle(pub *mut c_void);

        impl Default for $handle {
            // The default handle is null, i.e. "not yet created".
            fn default() -> Self {
                Self(std::ptr::null_mut())
            }
        }

        impl $handle {
            /// Returns `true` if the handle is null (uninitialised).
            #[inline]
            pub fn is_null(self) -> bool {
                self.0.is_null()
            }
        }

        // SAFETY: CUDA handles are thread-safe when used with proper
        // synchronisation via the driver API.
        unsafe impl Send for $handle {}
        unsafe impl Sync for $handle {}

        impl fmt::Debug for $handle {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "{}({:p})", stringify!($handle), self.0)
            }
        }
    };
}
70
71// ---------------------------------------------------------------------------
72// Handle types
73// ---------------------------------------------------------------------------
74
define_handle! {
    /// Opaque handle to a CUDA context.
    CUcontext
}

define_handle! {
    /// Opaque handle to a loaded CUDA module (PTX / cubin).
    CUmodule
}

define_handle! {
    /// Opaque handle to a CUDA kernel function within a module.
    CUfunction
}

define_handle! {
    /// Opaque handle to a CUDA stream (command queue).
    CUstream
}

define_handle! {
    /// Opaque handle to a CUDA event (used for timing and synchronisation).
    CUevent
}

define_handle! {
    /// Opaque handle to a CUDA memory pool (`cuMemPool*` family).
    CUmemoryPool
}

define_handle! {
    /// Opaque handle to a CUDA texture reference (legacy API).
    CUtexref
}

define_handle! {
    /// Opaque handle to a CUDA surface reference (legacy API).
    CUsurfref
}

define_handle! {
    /// Opaque handle to a CUDA texture object (modern bindless API).
    ///
    /// NOTE(review): `cuda.h` declares `CUtexObject` as `unsigned long long`,
    /// not a pointer type. A pointer-sized wrapper matches that ABI on 64-bit
    /// targets only — confirm 32-bit builds are out of scope.
    CUtexObject
}

define_handle! {
    /// Opaque handle to a CUDA surface object (modern bindless API).
    ///
    /// NOTE(review): `cuda.h` declares `CUsurfObject` as `unsigned long
    /// long`; see the ABI note on `CUtexObject`.
    CUsurfObject
}
124
define_handle! {
    /// Opaque handle to a CUDA kernel (CUDA 12.8+ library-based kernels).
    ///
    /// Used with `cuKernelGetLibrary` to retrieve the library a kernel
    /// belongs to.
    ///
    /// NOTE(review): `CUkernel` itself dates from CUDA 12.0; confirm whether
    /// the 12.8 requirement refers to a specific entry point.
    CUkernel
}

define_handle! {
    /// Opaque handle to a CUDA library (CUDA 12.8+ JIT library API).
    ///
    /// Retrieved via `cuKernelGetLibrary` to identify the JIT-compiled
    /// library that contains a given kernel.
    ///
    /// NOTE(review): `CUlibrary` and `cuLibraryLoad*` date from CUDA 12.0;
    /// confirm the 12.8 requirement.
    CUlibrary
}

define_handle! {
    /// Opaque handle to an NVLink multicast object (CUDA 12.8+).
    ///
    /// Used with `cuMulticastCreate`, `cuMulticastAddDevice`, and related
    /// functions to manage NVLink multicast memory regions across devices.
    ///
    /// NOTE(review): in `cuda.h` the multicast entry points traffic in
    /// `CUmemGenericAllocationHandle` (an integer), not a distinct pointer
    /// type — verify this wrapper matches the loaded function signatures.
    CUmulticastObject
}

define_handle! {
    /// Opaque handle to a CUDA JIT linker state (`CUlinkState`).
    ///
    /// Created by `cuLinkCreate_v2`, populated by repeated calls to
    /// `cuLinkAddData_v2`, finalised by `cuLinkComplete`, and freed by
    /// `cuLinkDestroy`.
    CUlinkState
}
157
158// =========================================================================
159// CUmemGenericAllocationHandle — VMM allocation handle (CUDA 11.2+)
160// =========================================================================
161
/// Opaque handle to a generic memory allocation managed by the CUDA virtual
/// memory management (VMM) APIs (`cuMemCreate`, `cuMemRelease`, `cuMemMap`).
///
/// Although the CUDA header types this as `unsigned long long`, it is an opaque
/// driver-side identifier and must not be interpreted as a numeric address.
/// A value of `0` is never returned by a successful `cuMemCreate` — TODO
/// confirm against the targeted driver version before using 0 as a sentinel.
pub type CUmemGenericAllocationHandle = u64;
168
169// =========================================================================
170// CUmemorytype — memory type identifiers
171// =========================================================================
172
/// Memory type identifiers returned by pointer attribute queries.
///
/// Discriminants mirror `CU_MEMORYTYPE_*` in `cuda.h` (1-based).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemorytype {
    /// Host (system) memory (`CU_MEMORYTYPE_HOST`).
    Host = 1,
    /// Device (GPU) memory (`CU_MEMORYTYPE_DEVICE`).
    Device = 2,
    /// CUDA array memory (`CU_MEMORYTYPE_ARRAY`).
    Array = 3,
    /// Unified (managed) memory (`CU_MEMORYTYPE_UNIFIED`).
    Unified = 4,
}
187
188// =========================================================================
189// CUpointer_attribute — pointer attribute query keys
190// =========================================================================
191
/// Pointer attribute identifiers passed to `cuPointerGetAttribute`.
///
/// Discriminants mirror `CUpointer_attribute` in `cuda.h`. The numbering is
/// deliberately sparse: attributes this crate does not expose (P2P tokens,
/// sync-memops, buffer id, …) are omitted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUpointer_attribute {
    /// Query the CUDA context associated with a pointer.
    Context = 1,
    /// Query the memory type (host / device / unified) of a pointer.
    MemoryType = 2,
    /// Query the device pointer corresponding to a host pointer.
    DevicePointer = 3,
    /// Query the host pointer corresponding to a device pointer.
    HostPointer = 4,
    /// Query whether the memory is managed (unified).
    ///
    /// `CU_POINTER_ATTRIBUTE_IS_MANAGED` is 8 in `cuda.h`. The previous
    /// value here (9) is the device-ordinal attribute, so this query would
    /// have returned the wrong datum.
    IsManaged = 8,
    /// Query the device ordinal for the pointer.
    ///
    /// `CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL` is 9 in `cuda.h`; the previous
    /// value here (10) belongs to a different attribute.
    DeviceOrdinal = 9,
}
211
212// =========================================================================
213// CUlimit — context limit identifiers
214// =========================================================================
215
/// Context limit identifiers for `cuCtxSetLimit` / `cuCtxGetLimit`.
///
/// Discriminants mirror `CU_LIMIT_*` in `cuda.h` (0-based).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUlimit {
    /// Stack size in bytes for each GPU thread.
    StackSize = 0,
    /// Size in bytes of the FIFO used by device-side `printf()`.
    PrintfFifoSize = 1,
    /// Size in bytes of the heap used by `malloc()` / `free()` on the device.
    MallocHeapSize = 2,
    /// Maximum grid-synchronisation nesting depth for device-runtime
    /// (dynamic-parallelism) launches.
    DevRuntimeSyncDepth = 3,
    /// Maximum number of outstanding device-runtime launches.
    DevRuntimePendingLaunchCount = 4,
    /// L2 cache fetch granularity in bytes (hint; may be ignored).
    MaxL2FetchGranularity = 5,
    /// Maximum size in bytes of the persisting L2 cache region.
    PersistingL2CacheSize = 6,
}
236
237// =========================================================================
238// CUfunction_attribute — function attribute query keys
239// =========================================================================
240
/// Function attribute identifiers passed to `cuFuncGetAttribute`.
///
/// Discriminants mirror `CU_FUNC_ATTRIBUTE_*` in `cuda.h` (0-based).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUfunction_attribute {
    /// Maximum threads per block for this function.
    MaxThreadsPerBlock = 0,
    /// Statically allocated shared memory used by this function (bytes).
    SharedSizeBytes = 1,
    /// Size of user-allocated constant memory (bytes).
    ConstSizeBytes = 2,
    /// Size of local memory used by each thread (bytes).
    LocalSizeBytes = 3,
    /// Number of registers used by each thread.
    NumRegs = 4,
    /// PTX virtual architecture version.
    PtxVersion = 5,
    /// Binary architecture version.
    BinaryVersion = 6,
    /// Whether the function was compiled with the `-Xptxas --dlcm=ca`
    /// (cache-all, L1-enabled) option. The original doc ("has been cached")
    /// was misleading.
    CacheModeCa = 7,
    /// Maximum dynamic shared memory size (bytes); settable.
    MaxDynamicSharedSizeBytes = 8,
    /// Preferred shared memory carve-out, as a percentage; settable.
    PreferredSharedMemoryCarveout = 9,
}
268
269// =========================================================================
270// CUresult constants — every documented CUDA Driver API error code
271// =========================================================================
272
/// The API call returned with no errors.
pub const CUDA_SUCCESS: CUresult = 0;

/// One or more parameters passed to the API call are not acceptable.
pub const CUDA_ERROR_INVALID_VALUE: CUresult = 1;

/// The API call failed because it was unable to allocate enough memory.
pub const CUDA_ERROR_OUT_OF_MEMORY: CUresult = 2;

/// The CUDA driver has not been initialised via `cuInit`.
pub const CUDA_ERROR_NOT_INITIALIZED: CUresult = 3;

/// The CUDA driver is shutting down.
pub const CUDA_ERROR_DEINITIALIZED: CUresult = 4;

/// Profiler is not initialised for this run.
pub const CUDA_ERROR_PROFILER_DISABLED: CUresult = 5;

/// (Deprecated) Profiler not started.
pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: CUresult = 6;

/// (Deprecated) Profiler already started.
pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: CUresult = 7;

/// (Deprecated) Profiler already stopped.
pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: CUresult = 8;

/// Stub library loaded instead of the real driver.
pub const CUDA_ERROR_STUB_LIBRARY: CUresult = 34;

/// The requested device is not available (e.g. in use by another process in
/// an exclusive compute mode). The original doc ("device-side assert") was
/// wrong — that is `CUDA_ERROR_ASSERT` (710).
pub const CUDA_ERROR_DEVICE_UNAVAILABLE: CUresult = 46;

/// No CUDA-capable device is detected.
pub const CUDA_ERROR_NO_DEVICE: CUresult = 100;

/// The device ordinal supplied is out of range.
pub const CUDA_ERROR_INVALID_DEVICE: CUresult = 101;

/// The device does not have a valid licence.
pub const CUDA_ERROR_DEVICE_NOT_LICENSED: CUresult = 102;

/// The PTX or cubin image is invalid.
pub const CUDA_ERROR_INVALID_IMAGE: CUresult = 200;

/// The supplied context is not valid.
pub const CUDA_ERROR_INVALID_CONTEXT: CUresult = 201;

/// (Deprecated) Context already current.
pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CUresult = 202;

/// A map or register operation has failed.
pub const CUDA_ERROR_MAP_FAILED: CUresult = 205;

/// An unmap or unregister operation has failed.
pub const CUDA_ERROR_UNMAP_FAILED: CUresult = 206;

/// The specified array is currently mapped.
pub const CUDA_ERROR_ARRAY_IS_MAPPED: CUresult = 207;

/// The resource is already mapped.
pub const CUDA_ERROR_ALREADY_MAPPED: CUresult = 208;

/// There is no kernel image available for execution on the device.
pub const CUDA_ERROR_NO_BINARY_FOR_GPU: CUresult = 209;

/// A resource has already been acquired.
pub const CUDA_ERROR_ALREADY_ACQUIRED: CUresult = 210;

/// The resource is not mapped.
pub const CUDA_ERROR_NOT_MAPPED: CUresult = 211;

/// A mapped resource is not available for access as an array.
pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: CUresult = 212;

/// A mapped resource is not available for access as a pointer.
pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: CUresult = 213;

/// An uncorrectable ECC error was detected.
pub const CUDA_ERROR_ECC_UNCORRECTABLE: CUresult = 214;

/// The `CUlimit` passed to the API call is not supported by the active
/// device. The original doc ("PTX JIT limit reached") was wrong.
pub const CUDA_ERROR_UNSUPPORTED_LIMIT: CUresult = 215;

/// The context supplied is already in use by a different thread.
pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: CUresult = 216;

/// Peer access is not supported across the given devices.
pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: CUresult = 217;

/// A PTX JIT compilation failed. (A deliberately disabled JIT is reported as
/// `CUDA_ERROR_JIT_COMPILATION_DISABLED`, 223.)
pub const CUDA_ERROR_INVALID_PTX: CUresult = 218;

/// Invalid graphics context.
pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: CUresult = 219;

/// An uncorrectable NVLink error was detected during execution.
pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: CUresult = 220;

/// JIT compiler not found.
pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: CUresult = 221;

/// Unsupported PTX version.
pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: CUresult = 222;

/// JIT compilation disabled.
pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: CUresult = 223;

/// Unsupported exec-affinity type.
pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: CUresult = 224;

/// Unsupported device-side synchronisation on this device.
pub const CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC: CUresult = 225;

/// The requested source is invalid.
pub const CUDA_ERROR_INVALID_SOURCE: CUresult = 300;

/// The named file was not found.
pub const CUDA_ERROR_FILE_NOT_FOUND: CUresult = 301;

/// A shared-object symbol lookup failed.
pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: CUresult = 302;

/// The shared-object init function failed.
pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: CUresult = 303;

/// An OS call failed.
pub const CUDA_ERROR_OPERATING_SYSTEM: CUresult = 304;

/// The supplied handle is invalid.
pub const CUDA_ERROR_INVALID_HANDLE: CUresult = 400;

/// The requested resource is in an illegal state.
pub const CUDA_ERROR_ILLEGAL_STATE: CUresult = 401;

/// An introspection query would discard semantically important information
/// (lossy query).
pub const CUDA_ERROR_LOSSY_QUERY: CUresult = 402;

/// A named symbol was not found.
pub const CUDA_ERROR_NOT_FOUND: CUresult = 500;

/// The operation is not ready (asynchronous).
pub const CUDA_ERROR_NOT_READY: CUresult = 600;

/// An illegal memory address was encountered.
pub const CUDA_ERROR_ILLEGAL_ADDRESS: CUresult = 700;

/// The kernel launch uses too many resources (registers / shared memory).
pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: CUresult = 701;

/// The kernel launch exceeded the time-out enforced by the driver.
pub const CUDA_ERROR_LAUNCH_TIMEOUT: CUresult = 702;

/// A launch did not occur on a compatible texturing mode.
pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: CUresult = 703;

/// Peer access already enabled.
pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: CUresult = 704;

/// Peer access has not been enabled.
pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: CUresult = 705;

/// The primary context has already been initialised.
pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: CUresult = 708;

/// The context is being destroyed.
pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: CUresult = 709;

/// A device-side assert triggered during kernel execution. (The original
/// "64-bit" qualifier had no basis.)
pub const CUDA_ERROR_ASSERT: CUresult = 710;

/// Hardware resources to enable peer access are exhausted.
pub const CUDA_ERROR_TOO_MANY_PEERS: CUresult = 711;

/// The host-side memory region is already registered.
pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: CUresult = 712;

/// The host-side memory region is not registered.
pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: CUresult = 713;

/// Hardware stack overflow on the device.
pub const CUDA_ERROR_HARDWARE_STACK_ERROR: CUresult = 714;

/// Illegal instruction encountered on the device.
pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: CUresult = 715;

/// Misaligned address on the device.
pub const CUDA_ERROR_MISALIGNED_ADDRESS: CUresult = 716;

/// Invalid address space.
pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: CUresult = 717;

/// Invalid program counter on the device.
pub const CUDA_ERROR_INVALID_PC: CUresult = 718;

/// The kernel launch failed.
pub const CUDA_ERROR_LAUNCH_FAILED: CUresult = 719;

/// Cooperative launch is too large for the device/kernel.
pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: CUresult = 720;

/// The API call is not permitted in the active context.
pub const CUDA_ERROR_NOT_PERMITTED: CUresult = 800;

/// The API call is not supported by the current driver/device combination.
pub const CUDA_ERROR_NOT_SUPPORTED: CUresult = 801;

/// System not ready for CUDA operations.
pub const CUDA_ERROR_SYSTEM_NOT_READY: CUresult = 802;

/// System driver mismatch.
pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: CUresult = 803;

/// The system was upgraded to run with forward compatibility, but the
/// visible hardware does not support it. (The original doc about "old-style
/// contexts" described error 202, not this one.)
pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: CUresult = 804;

/// MPS connection failed.
pub const CUDA_ERROR_MPS_CONNECTION_FAILED: CUresult = 805;

/// MPS RPC failure.
pub const CUDA_ERROR_MPS_RPC_FAILURE: CUresult = 806;

/// MPS server not ready.
pub const CUDA_ERROR_MPS_SERVER_NOT_READY: CUresult = 807;

/// MPS maximum clients reached.
pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: CUresult = 808;

/// MPS maximum connections reached.
pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: CUresult = 809;

/// MPS client terminated.
pub const CUDA_ERROR_MPS_CLIENT_TERMINATED: CUresult = 810;

/// CDP (CUDA dynamic parallelism) not supported.
pub const CUDA_ERROR_CDP_NOT_SUPPORTED: CUresult = 811;

/// CDP version mismatch.
pub const CUDA_ERROR_CDP_VERSION_MISMATCH: CUresult = 812;

/// Stream capture unsupported.
pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: CUresult = 900;

/// Stream capture invalidated.
pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: CUresult = 901;

/// Stream capture merge not permitted.
pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: CUresult = 902;

/// Stream capture unmatched.
pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: CUresult = 903;

/// Stream capture unjoined.
pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: CUresult = 904;

/// Stream capture isolation violation.
pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: CUresult = 905;

/// Implicit stream in graph capture.
pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: CUresult = 906;

/// Captured event error.
pub const CUDA_ERROR_CAPTURED_EVENT: CUresult = 907;

/// Stream capture wrong thread.
pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: CUresult = 908;

/// The async operation timed out.
pub const CUDA_ERROR_TIMEOUT: CUresult = 909;

/// The graph update failed.
pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: CUresult = 910;

/// External device error.
pub const CUDA_ERROR_EXTERNAL_DEVICE: CUresult = 911;

/// Invalid cluster size.
pub const CUDA_ERROR_INVALID_CLUSTER_SIZE: CUresult = 912;

/// Function not loaded.
pub const CUDA_ERROR_FUNCTION_NOT_LOADED: CUresult = 913;

/// Invalid resource type.
pub const CUDA_ERROR_INVALID_RESOURCE_TYPE: CUresult = 914;

/// Invalid resource configuration.
pub const CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION: CUresult = 915;

/// An unknown internal error occurred.
pub const CUDA_ERROR_UNKNOWN: CUresult = 999;
563
564// =========================================================================
565// CUdevice_attribute — device property query keys
566// =========================================================================
567
/// Device attribute identifiers passed to `cuDeviceGetAttribute`.
///
/// NOTE(review): most discriminants match `CUdevice_attribute` in `cuda.h`,
/// but several in the 52–79 and 111–131 ranges (e.g.
/// `MaxTexture1DMipmappedWidth2 = 52`, the cubemap/surface run starting at
/// 54, `AccessPolicyMaxWindowSize = 111`, and the 122–131 run) do not line
/// up with the header as recalled — audit this enum against the exact
/// `cuda.h` of the targeted toolkit before trusting these queries.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(i32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUdevice_attribute {
    /// Maximum number of threads per block.
    MaxThreadsPerBlock = 1,
    /// Maximum x-dimension of a block.
    MaxBlockDimX = 2,
    /// Maximum y-dimension of a block.
    MaxBlockDimY = 3,
    /// Maximum z-dimension of a block.
    MaxBlockDimZ = 4,
    /// Maximum x-dimension of a grid.
    MaxGridDimX = 5,
    /// Maximum y-dimension of a grid.
    MaxGridDimY = 6,
    /// Maximum z-dimension of a grid.
    MaxGridDimZ = 7,
    /// Maximum shared memory available per block (bytes).
    MaxSharedMemoryPerBlock = 8,
    /// Total amount of constant memory on the device (bytes).
    TotalConstantMemory = 9,
    /// Warp size in threads.
    WarpSize = 10,
    /// Maximum pitch allowed by memory copies (bytes).
    MaxPitch = 11,
    /// Maximum number of 32-bit registers per block.
    MaxRegistersPerBlock = 12,
    /// Peak clock frequency in kHz.
    ClockRate = 13,
    /// Alignment requirement for textures.
    TextureAlignment = 14,
    /// Device can possibly copy memory and execute a kernel concurrently.
    GpuOverlap = 15,
    /// Number of multiprocessors on the device.
    MultiprocessorCount = 16,
    /// Whether there is a run-time limit on kernels.
    KernelExecTimeout = 17,
    /// Device is integrated (shares host memory).
    Integrated = 18,
    /// Device can map host memory with `cuMemHostAlloc` / `cuMemHostRegister`.
    CanMapHostMemory = 19,
    /// Compute mode: default, exclusive, prohibited, etc.
    ComputeMode = 20,
    /// Maximum 1D texture width.
    MaxTexture1DWidth = 21,
    /// Maximum 2D texture width.
    MaxTexture2DWidth = 22,
    /// Maximum 2D texture height.
    MaxTexture2DHeight = 23,
    /// Maximum 3D texture width.
    MaxTexture3DWidth = 24,
    /// Maximum 3D texture height.
    MaxTexture3DHeight = 25,
    /// Maximum 3D texture depth.
    MaxTexture3DDepth = 26,
    /// Maximum 2D layered texture width.
    MaxTexture2DLayeredWidth = 27,
    /// Maximum 2D layered texture height.
    MaxTexture2DLayeredHeight = 28,
    /// Maximum layers in a 2D layered texture.
    MaxTexture2DLayeredLayers = 29,
    /// Alignment requirement for surfaces.
    SurfaceAlignment = 30,
    /// Device can execute multiple kernels concurrently.
    ConcurrentKernels = 31,
    /// Device has ECC support enabled.
    EccEnabled = 32,
    /// PCI bus ID of the device.
    PciBusId = 33,
    /// PCI device ID of the device.
    PciDeviceId = 34,
    /// Device is using TCC (Tesla Compute Cluster) driver model.
    TccDriver = 35,
    /// Peak memory clock frequency in kHz.
    MemoryClockRate = 36,
    /// Global memory bus width in bits.
    GlobalMemoryBusWidth = 37,
    /// Size of L2 cache in bytes.
    L2CacheSize = 38,
    /// Maximum resident threads per multiprocessor.
    MaxThreadsPerMultiprocessor = 39,
    /// Number of asynchronous engines.
    AsyncEngineCount = 40,
    /// Device shares a unified address space with the host.
    UnifiedAddressing = 41,
    /// Maximum 1D layered texture width.
    MaxTexture1DLayeredWidth = 42,
    /// Maximum layers in a 1D layered texture.
    MaxTexture1DLayeredLayers = 43,
    /// Maximum 2D texture width if texture-gather operations are performed.
    MaxTexture2DGatherWidth = 44,
    /// Maximum 2D texture height if texture-gather operations are performed.
    MaxTexture2DGatherHeight = 45,
    /// Alternate maximum 3D texture width.
    MaxTexture3DWidthAlt = 47,
    /// Alternate maximum 3D texture height.
    MaxTexture3DHeightAlt = 48,
    /// Alternate maximum 3D texture depth.
    MaxTexture3DDepthAlt = 49,
    /// PCI domain ID.
    PciDomainId = 50,
    /// Texture pitch alignment.
    TexturePitchAlignment = 51,
    /// Maximum 1D mipmapped texture width.
    ///
    /// NOTE(review): suspect — `cuda.h` has the 1D-mipmapped-width attribute
    /// at 77, and the name's trailing `2` suggests a duplicate; verify 52.
    MaxTexture1DMipmappedWidth2 = 52,
    /// Maximum width for a cubemap texture.
    MaxTextureCubemapWidth = 54,
    /// Maximum width for a cubemap layered texture.
    MaxTextureCubemapLayeredWidth = 55,
    /// Maximum layers in a cubemap layered texture.
    MaxTextureCubemapLayeredLayers = 56,
    /// Maximum 1D surface width.
    MaxSurface1DWidth = 57,
    /// Maximum 2D surface width.
    MaxSurface2DWidth = 58,
    /// Maximum 2D surface height.
    MaxSurface2DHeight = 59,
    /// Maximum 3D surface width.
    MaxSurface3DWidth = 60,
    /// Maximum 3D surface height.
    MaxSurface3DHeight = 61,
    /// Maximum 3D surface depth.
    MaxSurface3DDepth = 62,
    /// Maximum cubemap surface width.
    MaxSurfaceCubemapWidth = 63,
    /// Maximum 1D layered surface width.
    MaxSurface1DLayeredWidth = 64,
    /// Maximum layers in a 1D layered surface.
    MaxSurface1DLayeredLayers = 65,
    /// Maximum 2D layered surface width.
    MaxSurface2DLayeredWidth = 66,
    /// Maximum 2D layered surface height.
    MaxSurface2DLayeredHeight = 67,
    /// Maximum layers in a 2D layered surface.
    MaxSurface2DLayeredLayers = 68,
    /// Maximum cubemap layered surface width.
    MaxSurfaceCubemapLayeredWidth = 69,
    /// Maximum layers in a cubemap layered surface.
    MaxSurfaceCubemapLayeredLayers = 70,
    /// Maximum 1D linear texture width (deprecated).
    MaxTexture1DLinearWidth = 71,
    /// Maximum 2D linear texture width.
    MaxTexture2DLinearWidth = 72,
    /// Maximum 2D linear texture height.
    MaxTexture2DLinearHeight = 73,
    /// Maximum 2D linear texture pitch (bytes).
    MaxTexture2DLinearPitch = 74,
    /// Major compute capability version number.
    ComputeCapabilityMajor = 75,
    /// Minor compute capability version number.
    ComputeCapabilityMinor = 76,
    /// Maximum mipmapped 2D texture width.
    MaxTexture2DMipmappedWidth = 77,
    /// Maximum mipmapped 2D texture height.
    MaxTexture2DMipmappedHeight = 78,
    /// Maximum mipmapped 1D texture width.
    MaxTexture1DMipmappedWidth = 79,
    /// Device supports stream priorities.
    StreamPrioritiesSupported = 80,
    /// Maximum shared memory per multiprocessor (bytes).
    MaxSharedMemoryPerMultiprocessor = 81,
    /// Maximum registers per multiprocessor.
    MaxRegistersPerMultiprocessor = 82,
    /// Device supports managed memory.
    ManagedMemory = 83,
    /// Device is on a multi-GPU board.
    IsMultiGpuBoard = 84,
    /// Unique identifier for the multi-GPU board group.
    MultiGpuBoardGroupId = 85,
    /// The link between the device and the host supports native atomic
    /// operations. (Original doc mentioned "float operations" — unfounded.)
    HostNativeAtomicSupported = 86,
    /// Ratio of single-to-double precision performance.
    SingleToDoublePrecisionPerfRatio = 87,
    /// Device supports coherently accessing pageable memory without calling
    /// `cudaHostRegister` on it.
    PageableMemoryAccess = 88,
    /// Device can coherently access managed memory concurrently with the
    /// CPU. (Original doc described attribute 91 instead.)
    ConcurrentManagedAccess = 89,
    /// Device supports compute preemption.
    ComputePreemptionSupported = 90,
    /// Device can access host registered memory at the same virtual address
    /// as the CPU. (Original doc described attribute 88 instead.)
    CanUseHostPointerForRegisteredMem = 91,
    /// Reserved attribute (CUDA internal, value 92).
    Reserved92 = 92,
    /// Reserved attribute (CUDA internal, value 93).
    Reserved93 = 93,
    /// Reserved attribute (CUDA internal, value 94).
    Reserved94 = 94,
    /// Device supports cooperative kernel launches.
    CooperativeLaunch = 95,
    /// Device supports cooperative kernel launches across multiple GPUs.
    CooperativeMultiDeviceLaunch = 96,
    /// Maximum opt-in shared memory per block.
    MaxSharedMemoryPerBlockOptin = 97,
    /// Device supports flushing of outstanding remote writes.
    CanFlushRemoteWrites = 98,
    /// Device supports host-side memory-register functions.
    HostRegisterSupported = 99,
    /// Device supports pageable memory access using host page tables.
    PageableMemoryAccessUsesHostPageTables = 100,
    /// Device supports direct access to managed memory on the host.
    DirectManagedMemAccessFromHost = 101,
    /// Device supports virtual memory management APIs.
    VirtualMemoryManagementSupported = 102,
    /// Device supports handle-type POSIX file descriptors for IPC.
    HandleTypePosixFileDescriptorSupported = 103,
    /// Device supports handle-type Win32 handles for IPC.
    HandleTypeWin32HandleSupported = 104,
    /// Device supports handle-type Win32 KMT handles for IPC.
    HandleTypeWin32KmtHandleSupported = 105,
    /// Maximum blocks per multiprocessor.
    MaxBlocksPerMultiprocessor = 106,
    /// Device supports generic compression for memory.
    GenericCompressionSupported = 107,
    /// Maximum persisting L2 cache size (bytes).
    MaxPersistingL2CacheSize = 108,
    /// Maximum access-policy window size for L2 cache.
    MaxAccessPolicyWindowSize = 109,
    /// Device supports specifying the GPUDirect RDMA flag with
    /// `cuMemCreate` — TODO confirm; original doc cited a different API.
    GpuDirectRdmaWithCudaVmmSupported = 110,
    /// NOTE(review): the original doc ("free memory / total memory …
    /// `cuMemGetInfo`") is unrelated to this name, and this looks like a
    /// duplicate of attribute 109 — verify value 111 against `cuda.h`.
    AccessPolicyMaxWindowSize = 111,
    /// Reserved shared memory per block (bytes).
    ReservedSharedMemoryPerBlock = 112,
    /// Device supports timeline semaphore interop.
    TimelineSemaphoreInteropSupported = 113,
    /// Device supports memory pools (`cudaMallocAsync`).
    MemoryPoolsSupported = 115,
    /// GPUDirect RDMA is supported.
    GpuDirectRdmaSupported = 116,
    /// GPUDirect RDMA flush-writes options.
    GpuDirectRdmaFlushWritesOptions = 117,
    /// GPUDirect RDMA writes ordering.
    GpuDirectRdmaWritesOrdering = 118,
    /// Memory pool supported handle types.
    MemoryPoolSupportedHandleTypes = 119,
    /// Device supports cluster launch.
    ClusterLaunch = 120,
    /// Deferred mapping CUDA array supported.
    DeferredMappingCudaArraySupported = 121,
    /// Device supports IPC event handles.
    IpcEventSupported = 122,
    /// Number of memory-synchronisation domains.
    MemSyncDomainCount = 123,
    /// Device supports tensor-map access to data.
    TensorMapAccessSupported = 124,
    /// Unified function pointers supported.
    UnifiedFunctionPointers = 125,
    /// NUMA configuration of the device.
    NumaConfig = 127,
    /// NUMA node id of the device.
    NumaId = 128,
    /// NOTE(review): the original doc here ("Multicast supported" plus two
    /// unrelated lines) did not match this variant's name, and no
    /// corresponding `cuda.h` attribute is recalled — verify value 129 and
    /// the intended semantics before use.
    MaxTimelineSemaphoreInteropSupported = 129,
    /// Device supports memory sync domain operations.
    MemSyncDomainSupported = 130,
    /// Device supports GPUDirect fabric handles.
    GpuDirectRdmaFabricSupported = 131,
    /// Device supports multicast.
    MulticastSupported = 132,
    /// Device supports MPS features.
    MpsEnabled = 133,
    /// Host-NUMA identifier.
    HostNumaId = 134,
}
837
838// =========================================================================
839// CUjit_option — options for the JIT compiler
840// =========================================================================
841
/// JIT compilation options passed to `cuModuleLoadDataEx` and related functions.
///
/// Discriminants mirror `CU_JIT_*` in `cuda.h` (0-based).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
pub enum CUjit_option {
    /// Maximum number of registers that a thread may use.
    MaxRegisters = 0,
    /// Number of threads per block for the JIT target.
    ThreadsPerBlock = 1,
    /// Output option: wall-clock time (ms) spent compiling/linking.
    WallTime = 2,
    /// Pointer to a buffer for info log output.
    InfoLogBuffer = 3,
    /// Size (bytes) of the info-log buffer.
    InfoLogBufferSizeBytes = 4,
    /// Pointer to a buffer for error log output.
    ErrorLogBuffer = 5,
    /// Size (bytes) of the error-log buffer.
    ErrorLogBufferSizeBytes = 6,
    /// Optimisation level (0-4).
    OptimizationLevel = 7,
    /// Determines the target based on the current attached context.
    TargetFromCuContext = 8,
    /// Specific compute target (sm_XX).
    Target = 9,
    /// Fallback strategy when exact match is not found.
    FallbackStrategy = 10,
    /// Specifies whether to generate debug info.
    GenerateDebugInfo = 11,
    /// Generate verbose log messages.
    LogVerbose = 12,
    /// Generate line-number information.
    GenerateLineInfo = 13,
    /// Cache mode (on / off).
    CacheMode = 14,
    /// (Internal) New SM3X option.
    Sm3xOpt = 15,
    /// Fast compile flag.
    FastCompile = 16,
    /// Global symbol names.
    GlobalSymbolNames = 17,
    /// Global symbol addresses.
    GlobalSymbolAddresses = 18,
    /// Number of global symbols.
    GlobalSymbolCount = 19,
    /// Link-time optimisation flag.
    Lto = 20,
    /// FTZ (flush-to-zero) flag.
    Ftz = 21,
    /// Precise-division flag.
    PrecDiv = 22,
    /// Precise-square-root flag.
    PrecSqrt = 23,
    /// FMA (fused multiply-add) contraction flag.
    Fma = 24,
    /// Referenced kernel names.
    ReferencedKernelNames = 25,
    /// Referenced kernel count.
    ReferencedKernelCount = 26,
    /// Referenced variable names.
    ReferencedVariableNames = 27,
    /// Referenced variable count.
    ReferencedVariableCount = 28,
    /// Optimise unused device variables.
    OptimizeUnusedDeviceVariables = 29,
    /// Position-independent code.
    PositionIndependentCode = 30,
}
911
912// =========================================================================
913// CUjitInputType — input types for the linker
914// =========================================================================
915
/// Input types for `cuLinkAddData` / `cuLinkAddFile`.
///
/// Mirrors `CUjitInputType` in `cuda.h`. The discriminants are passed
/// straight through to the driver, so they must match the C enum exactly:
/// `CU_JIT_INPUT_CUBIN = 0`, `CU_JIT_INPUT_PTX = 1`, and so on. (A previous
/// revision numbered these 1-5, which would have made the linker misinterpret
/// every input kind except PTX.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUjitInputType {
    /// Compiled device code (cubin). `CU_JIT_INPUT_CUBIN`.
    Cubin = 0,
    /// PTX source code. `CU_JIT_INPUT_PTX`.
    Ptx = 1,
    /// Fat binary bundle. `CU_JIT_INPUT_FATBINARY`.
    Fatbin = 2,
    /// Relocatable device object. `CU_JIT_INPUT_OBJECT`.
    Object = 3,
    /// Device code library. `CU_JIT_INPUT_LIBRARY`.
    Library = 4,
    /// High-level intermediate code for link-time optimisation (CUDA 11.4+).
    /// `CU_JIT_INPUT_NVVM`.
    Nvvm = 5,
}
932
933// =========================================================================
934// CUmemLocationType — location-type discriminant (CUDA 11.2+ VMM)
935// =========================================================================
936
/// Specifies the kind of location described by a [`CUmemLocation`].
///
/// Mirrors `CUmemLocationType` in `cuda.h`. Used by the virtual-memory
/// management APIs to identify where a memory allocation resides or which
/// device should be granted access.
///
/// Raw `u32` values (e.g. the `loc_type` field of [`CUmemLocation`]) can be
/// converted back to this enum with the fallible `TryFrom<u32>` impl below,
/// which rejects discriminants unknown to this binding instead of causing UB.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemLocationType {
    /// Invalid / uninitialised location type.
    Invalid = 0,
    /// Location is a CUDA device (the `id` field is a device ordinal).
    Device = 1,
    /// Location is the host (CPU) memory.
    Host = 2,
    /// Location is a specific NUMA node on the host.
    HostNuma = 3,
    /// Location is the NUMA node currently bound to the calling thread.
    HostNumaCurrent = 4,
}

impl std::convert::TryFrom<u32> for CUmemLocationType {
    /// The unrecognised raw value is handed back unchanged so callers can
    /// log or forward it.
    type Error = u32;

    /// Converts a raw `loc_type` value (for example one read out of a
    /// [`CUmemLocation`]) into the typed enum.
    ///
    /// # Errors
    ///
    /// Returns `Err(raw)` for any value that is not a discriminant known to
    /// this binding — e.g. one emitted by a newer driver.
    fn try_from(raw: u32) -> Result<Self, Self::Error> {
        match raw {
            0 => Ok(Self::Invalid),
            1 => Ok(Self::Device),
            2 => Ok(Self::Host),
            3 => Ok(Self::HostNuma),
            4 => Ok(Self::HostNumaCurrent),
            other => Err(other),
        }
    }
}
957
958// =========================================================================
959// CUmemAllocationType — allocation-kind discriminant (CUDA 11.2+ VMM)
960// =========================================================================
961
/// Type of memory allocation requested via the VMM APIs.
///
/// Mirrors `CUmemAllocationType` in `cuda.h`. Stored as a raw `u32` in
/// [`CUmemAllocationProp`] / [`CUmemPoolProps`]; see those structs for the
/// raw-integer round-tripping rationale.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAllocationType {
    /// Invalid / uninitialised allocation type.
    Invalid = 0,
    /// Pinned (page-locked) GPU memory backed by physical device frames.
    /// This is the only real allocation kind defined by the driver today.
    Pinned = 1,
    /// Sentinel value used by the CUDA driver to mark forward-compatible
    /// extensions; always equal to the maximum 32-bit signed integer.
    Max = 0x7fff_ffff,
}
977
978// =========================================================================
979// CUmemAllocationHandleType — exportable handle bitfield (CUDA 11.2+ VMM)
980// =========================================================================
981
/// Set of operating-system handle types that the driver may export for a
/// VMM allocation. Treated as a bitfield in the CUDA C API.
///
/// Mirrors `CUmemAllocationHandleType` in `cuda.h`. Because the C side treats
/// these as flags, each non-`None` variant is a distinct power of two and
/// several may be OR-ed together (as raw `u32`s) when filling the
/// `requested_handle_types` / `handle_types` fields of the property structs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAllocationHandleType {
    /// No exportable handle is requested.
    None = 0,
    /// POSIX file descriptor (Linux).
    PosixFileDescriptor = 1,
    /// Win32 NT handle.
    Win32 = 2,
    /// Win32 KMT handle (legacy kernel-mode-thunk).
    Win32Kmt = 4,
    /// Fabric handle for multi-host shared memory (CUDA 12.0+).
    Fabric = 8,
}
1001
1002// =========================================================================
1003// CUmemAccessFlags — peer-access permissions for VMM allocations
1004// =========================================================================
1005
/// Access flags applied via `cuMemSetAccess` to a VMM allocation, controlling
/// whether a particular [`CUmemLocation`] may read or write the mapping.
///
/// Mirrors `CUmemAccess_flags` in `cuda.h`. Renamed to follow Rust naming
/// conventions; the discriminant values are unchanged.
///
/// Note that no standalone write-only flag exists (there is no variant with
/// value `2`): the driver only defines none, read-only, and read-write.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[non_exhaustive]
pub enum CUmemAccessFlags {
    /// No access permitted from the location.
    None = 0,
    /// Read-only access permitted.
    Read = 1,
    /// Read-write access permitted.
    ReadWrite = 3,
    /// Sentinel value used by the CUDA driver for forward compatibility.
    Max = 0x7fff_ffff,
}
1024
1025// =========================================================================
1026// CUmemLocation — memory-location descriptor (CUDA 11.2+ VMM)
1027// =========================================================================
1028
/// Describes a physical memory location for the VMM and pool APIs.
///
/// Mirrors `CUmemLocation` in `cuda.h`. The interpretation of `id` depends on
/// `loc_type`: for [`CUmemLocationType::Device`] it is a device ordinal, for
/// [`CUmemLocationType::HostNuma`] it is a NUMA node identifier, and for the
/// other variants it must be set to `0`.
///
/// The `loc_type` field is stored as a raw `u32` so that any forward-compatible
/// value emitted by a future driver can be round-tripped without UB; convert
/// to / from [`CUmemLocationType`] manually when interpreting it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(C)]
pub struct CUmemLocation {
    /// Location type; see [`CUmemLocationType`]. (The C struct calls this
    /// field `type`, which is a reserved word in Rust — hence the rename.)
    pub loc_type: u32,
    /// Identifier whose meaning depends on `loc_type` (device ordinal,
    /// NUMA node id, or `0`).
    pub id: i32,
}
1047
1048// =========================================================================
1049// CUmemAllocationProp — properties of a VMM allocation request
1050// =========================================================================
1051
/// Properties passed to `cuMemCreate` to describe a new VMM allocation.
///
/// Mirrors `CUmemAllocationProp` in `cuda.h`.
///
/// The `alloc_type`, `requested_handle_types` and `alloc_flags` fields are
/// stored as raw integers so that future driver extensions cannot trigger UB
/// via unknown discriminants; convert them to / from
/// [`CUmemAllocationType`] / [`CUmemAllocationHandleType`] when interpreting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUmemAllocationProp {
    /// Allocation type; see [`CUmemAllocationType`].
    pub alloc_type: u32,
    /// Bitfield of OS handle types to export; see
    /// [`CUmemAllocationHandleType`].
    pub requested_handle_types: u32,
    /// Physical location of the allocation.
    pub location: CUmemLocation,
    /// Win32 security attributes pointer; null on non-Windows platforms or
    /// when no specific security descriptor is required.
    pub win32_handle_meta_data: *mut c_void,
    /// Reserved for future allocation flags; must be `0` on current drivers.
    /// NOTE(review): in `cuda.h` this is a nested `allocFlags` struct
    /// (compression / RDMA / usage sub-fields) occupying the same 8 bytes —
    /// confirm the packing before storing a non-zero value here.
    pub alloc_flags: u64,
}

// SAFETY: The struct contains a raw pointer (`win32_handle_meta_data`) that
// callers are responsible for managing. The CUDA driver treats the pointer
// as opaque, so the struct itself is logically Send+Sync.
unsafe impl Send for CUmemAllocationProp {}
unsafe impl Sync for CUmemAllocationProp {}

impl Default for CUmemAllocationProp {
    /// Returns a fully zeroed descriptor (null pointer, zero fields), the
    /// safe starting point expected by `cuMemCreate` callers. A manual impl
    /// is required because `*mut c_void` does not implement `Default`.
    fn default() -> Self {
        Self {
            alloc_type: 0,
            requested_handle_types: 0,
            location: CUmemLocation::default(),
            win32_handle_meta_data: std::ptr::null_mut(),
            alloc_flags: 0,
        }
    }
}
1094
1095// =========================================================================
1096// CUmemAccessDesc — per-location access permissions for `cuMemSetAccess`
1097// =========================================================================
1098
/// Per-location access descriptor for `cuMemSetAccess`.
///
/// Mirrors `CUmemAccessDesc` in `cuda.h`. The `flags` field stores a
/// [`CUmemAccessFlags`] value as a raw `u32` for FFI safety.
///
/// `cuMemSetAccess` receives a pointer to an *array* of these descriptors
/// plus a count, so the `#[repr(C)]` layout must match the C struct exactly
/// (see the layout test in this file).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(C)]
pub struct CUmemAccessDesc {
    /// Memory location whose access permission is being changed.
    pub location: CUmemLocation,
    /// Access flags; see [`CUmemAccessFlags`].
    pub flags: u32,
}
1111
1112// =========================================================================
1113// CUmemPoolProps — properties of a stream-ordered memory pool
1114// =========================================================================
1115
/// Properties passed to `cuMemPoolCreate`.
///
/// Mirrors `CUmemPoolProps` in `cuda.h`. The trailing `reserved` field is
/// part of the public ABI: the CUDA driver expects 56 zero bytes there to
/// preserve forward compatibility.
///
/// NOTE(review): newer `cuda.h` revisions (CUDA 12.2+) carve a `u16 usage`
/// field out of the head of these reserved bytes; the total struct size is
/// unchanged, but confirm against the toolkit version this crate targets
/// before exposing pool-usage flags.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct CUmemPoolProps {
    /// Allocation type to use when servicing pool requests; see
    /// [`CUmemAllocationType`].
    pub alloc_type: u32,
    /// Bitfield of OS handle types to export; see
    /// [`CUmemAllocationHandleType`].
    pub handle_types: u32,
    /// Physical location backing the pool.
    pub location: CUmemLocation,
    /// Win32 security-attributes pointer; null on non-Windows platforms or
    /// when no specific security descriptor is required.
    pub win32_security_attributes: *mut c_void,
    /// Maximum aggregate size (bytes) the pool may hold. `0` means
    /// unlimited.
    pub max_size: usize,
    /// Reserved padding required by the CUDA ABI; must remain zeroed.
    pub reserved: [u8; 56],
}

// SAFETY: The struct contains a raw pointer (`win32_security_attributes`) that
// callers are responsible for managing. The CUDA driver treats the pointer
// as opaque, so the struct itself is logically Send+Sync.
unsafe impl Send for CUmemPoolProps {}
unsafe impl Sync for CUmemPoolProps {}

impl Default for CUmemPoolProps {
    /// Returns a fully zeroed property block — the driver-expected baseline.
    /// A manual impl is required because `*mut c_void` lacks `Default` and
    /// `[u8; 56]` exceeds the array lengths `derive(Default)` once supported.
    fn default() -> Self {
        Self {
            alloc_type: 0,
            handle_types: 0,
            location: CUmemLocation::default(),
            win32_security_attributes: std::ptr::null_mut(),
            max_size: 0,
            reserved: [0u8; 56],
        }
    }
}
1160
1161// =========================================================================
1162// CUDA_MEMCPY2D — descriptor for `cuMemcpy2D_v2`
1163// =========================================================================
1164
/// Descriptor for a 2-D memory copy executed via `cuMemcpy2D_v2`.
///
/// Mirrors `CUDA_MEMCPY2D` in `cuda.h`. The CUDA driver inspects only the
/// fields appropriate for the source / destination memory types; the
/// remaining fields **must** be zeroed. Use [`CUDA_MEMCPY2D::default`] to
/// obtain a zero-initialised descriptor and only set the fields you need.
///
/// `src_memory_type` and `dst_memory_type` are stored as raw `u32` for FFI
/// safety; convert to / from [`CUmemorytype`] manually.
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct CUDA_MEMCPY2D {
    /// Source X offset in bytes.
    pub src_x_in_bytes: usize,
    /// Source Y offset in rows.
    pub src_y: usize,
    /// Source memory type; see [`CUmemorytype`].
    pub src_memory_type: u32,
    /// Source host pointer (only valid when `src_memory_type == Host`).
    pub src_host: *const c_void,
    /// Source device pointer (only valid when `src_memory_type == Device`).
    pub src_device: CUdeviceptr,
    /// Source CUDA array (only valid when `src_memory_type == Array`).
    pub src_array: crate::ffi::CUarray,
    /// Source pitch in bytes (`0` selects a tightly-packed layout).
    pub src_pitch: usize,
    /// Destination X offset in bytes.
    pub dst_x_in_bytes: usize,
    /// Destination Y offset in rows.
    pub dst_y: usize,
    /// Destination memory type; see [`CUmemorytype`].
    pub dst_memory_type: u32,
    /// Destination host pointer (only valid when `dst_memory_type == Host`).
    pub dst_host: *mut c_void,
    /// Destination device pointer (only valid when `dst_memory_type == Device`).
    pub dst_device: CUdeviceptr,
    /// Destination CUDA array (only valid when `dst_memory_type == Array`).
    pub dst_array: crate::ffi::CUarray,
    /// Destination pitch in bytes (`0` selects a tightly-packed layout).
    pub dst_pitch: usize,
    /// Width of the copied region in bytes.
    pub width_in_bytes: usize,
    /// Height of the copied region in rows.
    pub height: usize,
}

// SAFETY: The struct contains raw pointers and a CUDA array handle; callers
// are responsible for managing the underlying memory and handles. Treating
// the descriptor itself as Send+Sync mirrors the C-side struct, which the
// driver may inspect from any thread.
unsafe impl Send for CUDA_MEMCPY2D {}
unsafe impl Sync for CUDA_MEMCPY2D {}

impl Default for CUDA_MEMCPY2D {
    /// Returns an all-zero descriptor (null pointers, zero offsets/pitches),
    /// matching the "unset fields must be zero" requirement of the driver.
    /// Manual impl because the raw-pointer fields have no `Default`.
    fn default() -> Self {
        Self {
            src_x_in_bytes: 0,
            src_y: 0,
            src_memory_type: 0,
            src_host: std::ptr::null(),
            src_device: 0,
            src_array: crate::ffi::CUarray::default(),
            src_pitch: 0,
            dst_x_in_bytes: 0,
            dst_y: 0,
            dst_memory_type: 0,
            dst_host: std::ptr::null_mut(),
            dst_device: 0,
            dst_array: crate::ffi::CUarray::default(),
            dst_pitch: 0,
            width_in_bytes: 0,
            height: 0,
        }
    }
}
1240
1241// =========================================================================
1242// Submodules — extracted per refactoring policy (<2000 lines per file)
1243// =========================================================================
1244
1245#[path = "ffi_constants.rs"]
1246mod ffi_constants;
1247pub use ffi_constants::*;
1248
1249#[path = "ffi_launch.rs"]
1250mod ffi_launch;
1251pub use ffi_launch::*;
1252
1253#[path = "ffi_descriptors.rs"]
1254mod ffi_descriptors;
1255pub use ffi_descriptors::*;
1256
1257// =========================================================================
1258// Tests
1259// =========================================================================
1260
#[cfg(test)]
mod tests {
    //! Host-only sanity checks for the FFI surface. No GPU or driver is
    //! required: the tests pin enum discriminants, handle sizes, struct
    //! layouts and `Default` zero-initialisation so accidental edits to the
    //! ABI-sensitive declarations are caught by `cargo test`.
    use super::*;

    #[test]
    fn test_cuda_success_is_zero() {
        assert_eq!(CUDA_SUCCESS, 0);
    }

    // Opaque driver handles are `#[repr(transparent)]` wrappers over
    // `*mut c_void`, so each must stay exactly pointer-sized.
    #[test]
    fn test_opaque_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUcontext>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmodule>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUstream>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUevent>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUfunction>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUmemoryPool>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_handle_default_is_null() {
        assert!(CUcontext::default().is_null());
        assert!(CUmodule::default().is_null());
        assert!(CUfunction::default().is_null());
        assert!(CUstream::default().is_null());
        assert!(CUevent::default().is_null());
        assert!(CUmemoryPool::default().is_null());
    }

    // Discriminants below mirror `CU_DEVICE_ATTRIBUTE_*`; a mismatch would
    // silently query the wrong attribute at runtime.
    #[test]
    fn test_device_attribute_repr() {
        // Original variants
        assert_eq!(CUdevice_attribute::MaxThreadsPerBlock as i32, 1);
        assert_eq!(CUdevice_attribute::WarpSize as i32, 10);
        assert_eq!(CUdevice_attribute::MultiprocessorCount as i32, 16);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMajor as i32, 75);
        assert_eq!(CUdevice_attribute::ComputeCapabilityMinor as i32, 76);
        assert_eq!(CUdevice_attribute::MaxBlocksPerMultiprocessor as i32, 106);
        assert_eq!(CUdevice_attribute::L2CacheSize as i32, 38);
        assert_eq!(
            CUdevice_attribute::MaxSharedMemoryPerMultiprocessor as i32,
            81
        );
        assert_eq!(CUdevice_attribute::ManagedMemory as i32, 83);

        // New variants
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherWidth as i32, 44);
        assert_eq!(CUdevice_attribute::MaxTexture2DGatherHeight as i32, 45);
        assert_eq!(CUdevice_attribute::MaxTexture3DWidthAlt as i32, 47);
        assert_eq!(CUdevice_attribute::MaxTexture3DHeightAlt as i32, 48);
        assert_eq!(CUdevice_attribute::MaxTexture3DDepthAlt as i32, 49);
        assert_eq!(CUdevice_attribute::MaxTexture1DMipmappedWidth2 as i32, 52);
        assert_eq!(CUdevice_attribute::Reserved92 as i32, 92);
        assert_eq!(CUdevice_attribute::Reserved93 as i32, 93);
        assert_eq!(CUdevice_attribute::Reserved94 as i32, 94);
        assert_eq!(
            CUdevice_attribute::VirtualMemoryManagementSupported as i32,
            102
        );
        assert_eq!(
            CUdevice_attribute::HandleTypePosixFileDescriptorSupported as i32,
            103
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32HandleSupported as i32,
            104
        );
        assert_eq!(
            CUdevice_attribute::HandleTypeWin32KmtHandleSupported as i32,
            105
        );
        assert_eq!(CUdevice_attribute::AccessPolicyMaxWindowSize as i32, 111);
        assert_eq!(CUdevice_attribute::ReservedSharedMemoryPerBlock as i32, 112);
        assert_eq!(
            CUdevice_attribute::TimelineSemaphoreInteropSupported as i32,
            113
        );
        assert_eq!(CUdevice_attribute::MemoryPoolsSupported as i32, 115);
        assert_eq!(CUdevice_attribute::ClusterLaunch as i32, 120);
        assert_eq!(CUdevice_attribute::UnifiedFunctionPointers as i32, 125);
        assert_eq!(
            CUdevice_attribute::MaxTimelineSemaphoreInteropSupported as i32,
            129
        );
        assert_eq!(CUdevice_attribute::MemSyncDomainSupported as i32, 130);
        assert_eq!(CUdevice_attribute::GpuDirectRdmaFabricSupported as i32, 131);
    }

    #[test]
    fn test_jit_option_repr() {
        assert_eq!(CUjit_option::MaxRegisters as u32, 0);
        assert_eq!(CUjit_option::ThreadsPerBlock as u32, 1);
        assert_eq!(CUjit_option::WallTime as u32, 2);
        assert_eq!(CUjit_option::InfoLogBuffer as u32, 3);
        assert_eq!(CUjit_option::InfoLogBufferSizeBytes as u32, 4);
        assert_eq!(CUjit_option::ErrorLogBuffer as u32, 5);
        assert_eq!(CUjit_option::ErrorLogBufferSizeBytes as u32, 6);
        assert_eq!(CUjit_option::OptimizationLevel as u32, 7);
        assert_eq!(CUjit_option::Target as u32, 9);
        assert_eq!(CUjit_option::FallbackStrategy as u32, 10);
    }

    // Error codes are grouped by category in cuda.h; these range checks keep
    // the constants from drifting into the wrong family.
    #[test]
    #[allow(clippy::assertions_on_constants)]
    fn test_error_code_ranges() {
        // Basic errors: 1-8
        assert!(CUDA_ERROR_INVALID_VALUE < 10);
        // Device errors: 100-102
        assert!((100..=102).contains(&CUDA_ERROR_NO_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_INVALID_DEVICE));
        assert!((100..=102).contains(&CUDA_ERROR_DEVICE_NOT_LICENSED));
        // Image/context errors: 200+
        assert!(CUDA_ERROR_INVALID_IMAGE >= 200);
        // Launch errors: 700+
        assert!(CUDA_ERROR_LAUNCH_FAILED >= 700);
        assert!(CUDA_ERROR_ILLEGAL_ADDRESS >= 700);
        assert!(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES >= 700);
        // Stream capture errors: 900+
        assert!(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED >= 900);
        // Unknown is 999
        assert_eq!(CUDA_ERROR_UNKNOWN, 999);
    }

    #[test]
    fn test_handle_debug_format() {
        let ctx = CUcontext::default();
        let debug_str = format!("{ctx:?}");
        assert!(debug_str.starts_with("CUcontext("));
    }

    #[test]
    fn test_handle_equality() {
        let a = CUcontext::default();
        let b = CUcontext::default();
        assert_eq!(a, b);
    }

    #[test]
    fn test_new_handle_types_are_pointer_sized() {
        assert_eq!(
            std::mem::size_of::<CUtexref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfref>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUtexObject>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert_eq!(
            std::mem::size_of::<CUsurfObject>(),
            std::mem::size_of::<*mut c_void>()
        );
    }

    #[test]
    fn test_new_handle_defaults_are_null() {
        assert!(CUtexref::default().is_null());
        assert!(CUsurfref::default().is_null());
        assert!(CUtexObject::default().is_null());
        assert!(CUsurfObject::default().is_null());
    }

    #[test]
    fn test_memory_type_enum() {
        assert_eq!(CUmemorytype::Host as u32, 1);
        assert_eq!(CUmemorytype::Device as u32, 2);
        assert_eq!(CUmemorytype::Array as u32, 3);
        assert_eq!(CUmemorytype::Unified as u32, 4);
    }

    #[test]
    fn test_pointer_attribute_enum() {
        assert_eq!(CUpointer_attribute::Context as u32, 1);
        assert_eq!(CUpointer_attribute::MemoryType as u32, 2);
        assert_eq!(CUpointer_attribute::DevicePointer as u32, 3);
        assert_eq!(CUpointer_attribute::HostPointer as u32, 4);
        assert_eq!(CUpointer_attribute::IsManaged as u32, 9);
        assert_eq!(CUpointer_attribute::DeviceOrdinal as u32, 10);
    }

    #[test]
    fn test_limit_enum() {
        assert_eq!(CUlimit::StackSize as u32, 0);
        assert_eq!(CUlimit::PrintfFifoSize as u32, 1);
        assert_eq!(CUlimit::MallocHeapSize as u32, 2);
        assert_eq!(CUlimit::DevRuntimeSyncDepth as u32, 3);
        assert_eq!(CUlimit::DevRuntimePendingLaunchCount as u32, 4);
        assert_eq!(CUlimit::MaxL2FetchGranularity as u32, 5);
        assert_eq!(CUlimit::PersistingL2CacheSize as u32, 6);
    }

    #[test]
    fn test_function_attribute_enum() {
        assert_eq!(CUfunction_attribute::MaxThreadsPerBlock as i32, 0);
        assert_eq!(CUfunction_attribute::SharedSizeBytes as i32, 1);
        assert_eq!(CUfunction_attribute::NumRegs as i32, 4);
        assert_eq!(CUfunction_attribute::PtxVersion as i32, 5);
        assert_eq!(CUfunction_attribute::BinaryVersion as i32, 6);
        assert_eq!(CUfunction_attribute::MaxDynamicSharedSizeBytes as i32, 8);
        assert_eq!(
            CUfunction_attribute::PreferredSharedMemoryCarveout as i32,
            9
        );
    }

    // ---------------------------------------------------------------------
    // VMM / Pool / Linker FFI types — added by Wave 1
    // ---------------------------------------------------------------------

    #[test]
    fn test_link_state_handle_is_pointer_sized_and_default_null() {
        assert_eq!(
            std::mem::size_of::<CUlinkState>(),
            std::mem::size_of::<*mut c_void>()
        );
        assert!(CUlinkState::default().is_null());
    }

    #[test]
    fn test_mem_generic_allocation_handle_is_u64() {
        assert_eq!(
            std::mem::size_of::<CUmemGenericAllocationHandle>(),
            std::mem::size_of::<u64>()
        );
        let _: CUmemGenericAllocationHandle = 0u64;
    }

    #[test]
    fn test_mem_location_type_repr() {
        assert_eq!(CUmemLocationType::Invalid as u32, 0);
        assert_eq!(CUmemLocationType::Device as u32, 1);
        assert_eq!(CUmemLocationType::Host as u32, 2);
        assert_eq!(CUmemLocationType::HostNuma as u32, 3);
        assert_eq!(CUmemLocationType::HostNumaCurrent as u32, 4);
    }

    #[test]
    fn test_mem_allocation_type_repr() {
        assert_eq!(CUmemAllocationType::Invalid as u32, 0);
        assert_eq!(CUmemAllocationType::Pinned as u32, 1);
        assert_eq!(CUmemAllocationType::Max as u32, 0x7fff_ffff);
    }

    #[test]
    fn test_mem_allocation_handle_type_repr() {
        assert_eq!(CUmemAllocationHandleType::None as u32, 0);
        assert_eq!(CUmemAllocationHandleType::PosixFileDescriptor as u32, 1);
        assert_eq!(CUmemAllocationHandleType::Win32 as u32, 2);
        assert_eq!(CUmemAllocationHandleType::Win32Kmt as u32, 4);
        assert_eq!(CUmemAllocationHandleType::Fabric as u32, 8);
    }

    #[test]
    fn test_mem_access_flags_repr() {
        assert_eq!(CUmemAccessFlags::None as u32, 0);
        assert_eq!(CUmemAccessFlags::Read as u32, 1);
        assert_eq!(CUmemAccessFlags::ReadWrite as u32, 3);
        assert_eq!(CUmemAccessFlags::Max as u32, 0x7fff_ffff);
    }

    #[test]
    fn test_mem_location_layout() {
        // Two consecutive 4-byte fields → 8 bytes, alignment 4.
        assert_eq!(std::mem::size_of::<CUmemLocation>(), 8);
        assert_eq!(std::mem::align_of::<CUmemLocation>(), 4);
        let loc = CUmemLocation::default();
        assert_eq!(loc.loc_type, 0);
        assert_eq!(loc.id, 0);
    }

    #[test]
    fn test_mem_access_desc_layout() {
        // CUmemLocation (8) + flags (u32 = 4) → 12 bytes, alignment 4.
        assert_eq!(std::mem::size_of::<CUmemAccessDesc>(), 12);
        assert_eq!(std::mem::align_of::<CUmemAccessDesc>(), 4);
        let desc = CUmemAccessDesc::default();
        assert_eq!(desc.flags, 0);
    }

    #[test]
    fn test_mem_allocation_prop_default_zeroed() {
        let prop = CUmemAllocationProp::default();
        assert_eq!(prop.alloc_type, 0);
        assert_eq!(prop.requested_handle_types, 0);
        assert_eq!(prop.location.loc_type, 0);
        assert_eq!(prop.location.id, 0);
        assert!(prop.win32_handle_meta_data.is_null());
        assert_eq!(prop.alloc_flags, 0);
    }

    #[test]
    fn test_mem_pool_props_default_zeroed_and_padded() {
        let props = CUmemPoolProps::default();
        assert_eq!(props.alloc_type, 0);
        assert_eq!(props.handle_types, 0);
        assert_eq!(props.location.loc_type, 0);
        assert_eq!(props.location.id, 0);
        assert!(props.win32_security_attributes.is_null());
        assert_eq!(props.max_size, 0);
        assert!(props.reserved.iter().all(|&b| b == 0));
        // The CUDA ABI mandates 56 reserved bytes.
        assert_eq!(props.reserved.len(), 56);
    }

    #[test]
    fn test_memcpy2d_default_zeroed() {
        let m = CUDA_MEMCPY2D::default();
        assert_eq!(m.src_x_in_bytes, 0);
        assert_eq!(m.src_y, 0);
        assert_eq!(m.src_memory_type, 0);
        assert!(m.src_host.is_null());
        assert_eq!(m.src_device, 0);
        assert!(m.src_array.is_null());
        assert_eq!(m.src_pitch, 0);
        assert_eq!(m.dst_x_in_bytes, 0);
        assert_eq!(m.dst_y, 0);
        assert_eq!(m.dst_memory_type, 0);
        assert!(m.dst_host.is_null());
        assert_eq!(m.dst_device, 0);
        assert!(m.dst_array.is_null());
        assert_eq!(m.dst_pitch, 0);
        assert_eq!(m.width_in_bytes, 0);
        assert_eq!(m.height, 0);
    }
}