// oxicuda_driver/lib.rs
1//! # OxiCUDA Driver
2//!
3//! **Dynamic, safe Rust bindings for the NVIDIA CUDA Driver API.**
4//!
5//! `oxicuda-driver` provides a zero-SDK-dependency wrapper around the CUDA
6//! Driver API. Unlike traditional CUDA crate approaches that require the
7//! CUDA Toolkit (or at least its headers and link stubs) to be present at
8//! **build time**, this crate loads the driver shared library entirely at
9//! **runtime** via [`libloading`](https://crates.io/crates/libloading).
10//!
11//! ## Zero build-time dependency
12//!
13//! No `cuda.h`, no `libcuda.so` symlink, no `nvcc` — the crate compiles on
14//! any Rust toolchain. The actual GPU driver is discovered and loaded the
15//! first time you call [`try_driver()`] or [`init()`].
16//!
17//! ## Runtime library loading
18//!
19//! | Platform | Library searched |
20//! |----------|-----------------------------|
21//! | Linux | `libcuda.so`, `libcuda.so.1` |
22//! | Windows | `nvcuda.dll` |
23//! | macOS | *(returns `UnsupportedPlatform` — NVIDIA dropped macOS support)* |
24//!
25//! ## Key types
26//!
27//! | Type | Description |
28//! |---------------|------------------------------------------------|
29//! | [`Device`] | A CUDA-capable GPU discovered on the system |
30//! | [`Context`] | Owns a CUDA context bound to a device |
31//! | [`Stream`] | Asynchronous command queue within a context |
32//! | [`Event`] | Timing / synchronisation marker on a stream |
33//! | [`Module`] | Loaded PTX or cubin containing kernel code |
34//! | [`Function`] | A single kernel entry point inside a module |
35//! | [`CudaError`] | Strongly-typed driver error code |
36//!
37//! ## Quick start
38//!
39//! ```rust,no_run
40//! use oxicuda_driver::prelude::*;
41//!
42//! // Initialise the CUDA driver (loads libcuda at runtime).
43//! init()?;
44//!
45//! // Pick the best available GPU and create a context.
46//! let dev = Device::get(0)?;
47//! let _ctx = Context::new(&dev)?;
48//!
49//! // Load a PTX module and look up a kernel.
50//! let module = Module::from_ptx("ptx_source")?;
51//! let kernel = module.get_function("vector_add")?;
52//! # Ok::<(), oxicuda_driver::CudaError>(())
53//! ```
54
55#![warn(missing_docs)]
56#![warn(clippy::all)]
57#![allow(clippy::module_name_repetitions)]
58#![allow(clippy::missing_safety_doc)]
59#![allow(clippy::too_many_arguments)]
60#![allow(clippy::macro_metavars_in_unsafe)]
61
62// ---------------------------------------------------------------------------
63// Module declarations
64// ---------------------------------------------------------------------------
65
66pub mod context;
67pub mod context_config;
68pub mod cooperative_launch;
69pub mod debug;
70pub mod device;
71pub mod error;
72pub mod event;
73pub mod ffi;
74pub mod function_attr;
75pub mod graph;
76pub mod link;
77pub mod loader;
78pub mod memory_info;
79pub mod module;
80pub mod multi_gpu;
81pub mod nvlink_topology;
82pub mod occupancy;
83pub mod occupancy_ext;
84pub mod primary_context;
85pub mod profiler;
86pub mod stream;
87pub mod stream_ordered_alloc;
88pub mod tma;
89
90// ---------------------------------------------------------------------------
91// Re-exports — error handling
92// ---------------------------------------------------------------------------
93
94pub use error::{CudaError, CudaResult, DriverLoadError, check};
95
96// ---------------------------------------------------------------------------
97// Re-exports — FFI types and constants
98// ---------------------------------------------------------------------------
99
100pub use ffi::{
101 CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, CU_TRSF_NORMALIZED_COORDINATES,
102 CU_TRSF_READ_AS_INTEGER, CU_TRSF_SRGB, CUDA_ARRAY_DESCRIPTOR, CUDA_ARRAY3D_CUBEMAP,
103 CUDA_ARRAY3D_DESCRIPTOR, CUDA_ARRAY3D_LAYERED, CUDA_ARRAY3D_SURFACE_LDST,
104 CUDA_ARRAY3D_TEXTURE_GATHER, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
105 CUaddress_mode, CUarray, CUarray_format, CUcontext, CUdevice, CUdevice_attribute, CUdeviceptr,
106 CUevent, CUfilter_mode, CUfunction, CUfunction_attribute, CUjit_option, CUkernel, CUlibrary,
107 CUlimit, CUmemoryPool, CUmemorytype, CUmipmappedArray, CUmodule, CUmulticastObject,
108 CUpointer_attribute, CUresourceViewFormat, CUresourcetype, CUstream, CUsurfObject, CUsurfref,
109 CUtexObject, CUtexref, CuLaunchAttribute, CuLaunchAttributeClusterDim, CuLaunchAttributeId,
110 CuLaunchAttributeValue, CuLaunchConfig, CudaResourceDescArray, CudaResourceDescLinear,
111 CudaResourceDescMipmap, CudaResourceDescPitch2d, CudaResourceDescRes,
112};
113
114// ---------------------------------------------------------------------------
115// Re-exports — high-level safe wrappers
116// ---------------------------------------------------------------------------
117
118pub use context::Context;
119pub use context_config::{CacheConfig, SharedMemConfig};
120pub use cooperative_launch::{
121 CooperativeLaunchConfig, CooperativeLaunchSupport, DeviceLaunchConfig,
122 MultiDeviceCooperativeLaunchConfig, cooperative_launch, cooperative_launch_multi_device,
123};
124pub use debug::{DebugLevel, DebugSession, KernelDebugger, MemoryChecker, NanInfChecker};
125pub use device::{Device, DeviceInfo, best_device, can_access_peer, driver_version, list_devices};
126pub use event::Event;
127pub use graph::{Graph, GraphExec, GraphNode, MemcpyDirection, StreamCapture};
128pub use link::{
129 FallbackStrategy, LinkInputType, LinkedModule, Linker, LinkerOptions, OptimizationLevel,
130};
131pub use loader::try_driver;
132pub use module::{Function, JitDiagnostic, JitLog, JitOptions, JitSeverity, Module};
133pub use multi_gpu::DevicePool;
134pub use nvlink_topology::{GpuTopology, NvLinkVersion, TopologyTree, TopologyType};
135pub use primary_context::PrimaryContext;
136pub use profiler::ProfilerGuard;
137pub use stream::Stream;
138pub use stream_ordered_alloc::{
139 StreamAllocation, StreamMemoryPool, StreamOrderedAllocConfig, stream_alloc, stream_free,
140};
141
142// ---------------------------------------------------------------------------
143// Driver initialisation
144// ---------------------------------------------------------------------------
145
146/// Initialise the CUDA driver API.
147///
148/// This must be called before any other driver function. It is safe to call
149/// multiple times; subsequent calls are no-ops inside the driver itself.
150///
151/// Internally this loads the shared library (if not already cached) and
152/// invokes `cuInit(0)`.
153///
154/// # Errors
155///
156/// Returns [`CudaError::NotInitialized`] if the CUDA driver library cannot be
157/// loaded, or another [`CudaError`] variant if `cuInit` reports a failure.
158pub fn init() -> CudaResult<()> {
159 let driver = loader::try_driver()?;
160 error::check(unsafe { (driver.cu_init)(0) })
161}
162
163// ---------------------------------------------------------------------------
164// Prelude — convenient glob import
165// ---------------------------------------------------------------------------
166
167/// Convenient glob import for common OxiCUDA Driver types.
168///
169/// ```rust
170/// use oxicuda_driver::prelude::*;
171/// ```
172pub mod prelude {
173 pub use crate::{
174 CacheConfig, Context, CooperativeLaunchConfig, CooperativeLaunchSupport, CudaError,
175 CudaResult, DebugLevel, DebugSession, Device, DeviceLaunchConfig, DevicePool, Event,
176 FallbackStrategy, Function, GpuTopology, Graph, GraphExec, GraphNode, KernelDebugger,
177 LinkInputType, LinkedModule, Linker, LinkerOptions, MemcpyDirection, Module,
178 MultiDeviceCooperativeLaunchConfig, NvLinkVersion, OptimizationLevel, PrimaryContext,
179 ProfilerGuard, SharedMemConfig, Stream, StreamAllocation, StreamCapture, StreamMemoryPool,
180 StreamOrderedAllocConfig, TopologyTree, TopologyType, can_access_peer, cooperative_launch,
181 cooperative_launch_multi_device, driver_version, init, stream_alloc, stream_free,
182 try_driver,
183 };
184}
185
186// ---------------------------------------------------------------------------
187// Compile-time feature flags
188// ---------------------------------------------------------------------------
189
/// Compile-time feature availability.
pub mod features {
    /// Whether GPU tests are enabled (`--features gpu-tests`).
    #[cfg(feature = "gpu-tests")]
    pub const HAS_GPU_TESTS: bool = true;

    /// Whether GPU tests are enabled (`--features gpu-tests`).
    #[cfg(not(feature = "gpu-tests"))]
    pub const HAS_GPU_TESTS: bool = false;
}
195
196// ---------------------------------------------------------------------------
197// CPU-only tests for driver infrastructure
198// ---------------------------------------------------------------------------
199
#[cfg(test)]
mod driver_infra_tests {
    // -----------------------------------------------------------------------
    // Task 2 — Multi-threaded context migration (F3)
    //
    // Verifies the thread-safety of the context-stack data structure model
    // using pure Rust primitives. No GPU is required.
    // -----------------------------------------------------------------------

    /// Simulate 4 threads each pushing and popping a "context ID" to/from a
    /// thread-local stack, then verifying all results are collected correctly.
    ///
    /// This exercises the logical structure of context push/pop across threads
    /// (corresponding to `cuCtxPushCurrent` / `cuCtxPopCurrent`) without
    /// needing a real CUDA driver.
    #[test]
    fn context_push_pop_thread_safety() {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let results: Arc<Mutex<Vec<(u32, u32)>>> = Arc::new(Mutex::new(vec![]));
        let mut handles = vec![];

        for thread_id in 0..4u32 {
            let results_clone = Arc::clone(&results);
            let handle = thread::spawn(move || {
                // Each thread simulates pushing two context IDs onto its
                // private stack and then reading the top (most-recently-pushed)
                // context.
                let ctx_id = thread_id * 100;
                let stack: Vec<u32> = vec![ctx_id, ctx_id + 1];
                // Pop semantics: the top of the stack is the last element.
                let top = stack.last().copied().unwrap_or(0);
                let mut r = results_clone.lock().expect("results lock failed");
                r.push((thread_id, top));
            });
            handles.push(handle);
        }

        for h in handles {
            h.join().expect("thread panicked");
        }

        let results = results.lock().expect("final lock failed");
        assert_eq!(results.len(), 4, "all 4 threads must contribute a result");

        // Every thread should have seen `ctx_id + 1` as the top of its stack.
        for &(thread_id, top) in results.iter() {
            let expected_top = thread_id * 100 + 1;
            assert_eq!(
                top, expected_top,
                "thread {thread_id}: expected top {expected_top}, got {top}"
            );
        }
    }

    // -----------------------------------------------------------------------
    // Task 3 — Scope-exit / Drop resource release under OOM (F10)
    //
    // Verifies that Drop impls run correctly even when further allocations
    // fail (simulated OOM), and that Rust's LIFO drop order is preserved.
    // -----------------------------------------------------------------------

    /// `Drop` is invoked for every resource that was successfully constructed,
    /// even when a subsequent allocation would fail (simulated OOM).
    #[test]
    fn drop_counter_tracks_resource_release() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        struct FakeResource {
            dropped: Arc<AtomicUsize>,
        }

        impl Drop for FakeResource {
            fn drop(&mut self) {
                self.dropped.fetch_add(1, Ordering::SeqCst);
            }
        }

        let counter = Arc::new(AtomicUsize::new(0));

        {
            let _r1 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            let _r2 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            // Simulate OOM by not creating r3 — neither r1 nor r2 is dropped yet.
            assert_eq!(
                counter.load(Ordering::SeqCst),
                0,
                "resources must not be dropped before scope exit"
            );
        }

        // After the block ends, both r1 and r2 must have been dropped.
        assert_eq!(
            counter.load(Ordering::SeqCst),
            2,
            "both resources must be dropped at scope exit"
        );
    }

    /// Rust drops local variables in **reverse declaration order** (LIFO).
    /// This test verifies that invariant for RAII guard types.
    #[test]
    fn drop_order_is_lifo() {
        use std::sync::{Arc, Mutex};

        let order: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(vec![]));

        struct Ordered {
            id: u32,
            order: Arc<Mutex<Vec<u32>>>,
        }

        impl Drop for Ordered {
            fn drop(&mut self) {
                self.order.lock().expect("order lock failed").push(self.id);
            }
        }

        {
            let _a = Ordered {
                id: 1,
                order: Arc::clone(&order),
            };
            let _b = Ordered {
                id: 2,
                order: Arc::clone(&order),
            };
            let _c = Ordered {
                id: 3,
                order: Arc::clone(&order),
            };
        }

        let observed = order.lock().expect("final order lock failed");
        assert_eq!(
            *observed,
            vec![3, 2, 1],
            "CUDA RAII guards must be released in LIFO order"
        );
    }

    // -----------------------------------------------------------------------
    // Task 4 — Driver version negotiation (NVIDIA Driver 525 / 535 / 550 / 560)
    //
    // `cuDriverGetVersion` returns the CUDA version encoded as
    // `major * 1000 + minor * 10` (e.g. CUDA 12.4 → 12040), so the remainder
    // modulo 1000 is ten times the minor version. These tests verify the
    // parsing logic and the version-gating conditions used throughout OxiCUDA
    // without requiring a real driver.
    // -----------------------------------------------------------------------

    /// Split a `cuDriverGetVersion`-style integer into `(major, remainder)`,
    /// where `remainder` is ten times the minor version (CUDA 12.2 → `(12, 20)`).
    ///
    /// Shared by all version-parsing tests so the decoding arithmetic is
    /// written — and therefore validated — in exactly one place.
    fn split_cuda_version(cuda_version: i32) -> (i32, i32) {
        (cuda_version / 1000, cuda_version % 1000)
    }

    /// NVIDIA Driver 525 ships with CUDA 12.0. Verify the parse of 12000.
    #[test]
    fn driver_version_parsing_cuda_12_0() {
        // cuDriverGetVersion returns 12000 for CUDA 12.0 (driver 525).
        let (major, minor) = split_cuda_version(12000);
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 0, "minor version mismatch");
    }

    /// NVIDIA Driver 535 ships with CUDA 12.2. Verify the parse of 12020.
    #[test]
    fn driver_version_parsing_cuda_12_2() {
        let (major, minor) = split_cuda_version(12020);
        assert_eq!(major, 12);
        assert_eq!(minor, 20);
    }

    /// NVIDIA Driver 550 ships with CUDA 12.4. Verify the parse of 12040.
    #[test]
    fn driver_version_parsing_cuda_12_4() {
        let (major, minor) = split_cuda_version(12040);
        assert_eq!(major, 12);
        assert_eq!(minor, 40);
    }

    /// NVIDIA Driver 560 ships with CUDA 12.6. Verify the parse of 12060.
    #[test]
    fn driver_version_parsing_cuda_12_6() {
        let (major, minor) = split_cuda_version(12060);
        assert_eq!(major, 12);
        assert_eq!(minor, 60);
    }

    /// OxiCUDA requires CUDA 11.2+ (`cuMemAllocAsync` availability).
    /// Verify that the set of supported versions all meet the minimum and
    /// that older versions are correctly rejected.
    #[test]
    fn driver_version_minimum_requirement() {
        // cuMemAllocAsync was introduced in CUDA 11.2 (version integer 11020).
        let min_required: i32 = 11020;

        let supported: [i32; 5] = [11020, 11040, 12000, 12060, 12080];
        for v in supported {
            assert!(
                v >= min_required,
                "CUDA version {v} should be supported (>= {min_required})"
            );
        }

        let too_old: [i32; 2] = [10020, 11010];
        for v in too_old {
            assert!(
                v < min_required,
                "CUDA version {v} should NOT be supported (< {min_required})"
            );
        }
    }

    /// CUDA 12.8 (version 12080) introduces `cuMemcpyBatchAsync`.
    /// Verify the feature-gating arithmetic.
    #[test]
    fn driver_cuda_12_8_features_available() {
        // 12.8 → 12080
        let cuda_128: i32 = 12080;
        assert!(
            cuda_128 >= 12080,
            "CUDA 12.8 must support cuMemcpyBatchAsync"
        );

        // 12.0 does not have it.
        let cuda_120: i32 = 12000;
        assert!(
            cuda_120 < 12080,
            "CUDA 12.0 must NOT support cuMemcpyBatchAsync"
        );
    }

    /// Verify the complete NVIDIA-driver-version → CUDA-version mapping used
    /// in OxiCUDA's version negotiation table.
    #[test]
    fn driver_nvidia_to_cuda_version_mapping() {
        // (nvidia_driver, expected_cuda_version_int)
        let mapping: [(u32, i32); 4] = [
            (525, 12000), // Driver 525 → CUDA 12.0
            (535, 12020), // Driver 535 → CUDA 12.2
            (550, 12040), // Driver 550 → CUDA 12.4
            (560, 12060), // Driver 560 → CUDA 12.6
        ];

        for (nvidia_driver, cuda_version) in mapping {
            let (major, minor) = split_cuda_version(cuda_version);
            // Sanity: all are CUDA 12.x
            assert_eq!(major, 12, "driver {nvidia_driver}: expected CUDA 12.x");
            // Minor must be a multiple of 10 (CUDA minor encoding)
            assert_eq!(
                minor % 10,
                0,
                "driver {nvidia_driver}: minor {minor} is not a multiple of 10"
            );
            // CUDA 12.8+ features require version >= 12080
            let has_12_8_features = cuda_version >= 12080;
            assert!(
                !has_12_8_features,
                "driver {nvidia_driver} (CUDA {major}.{:02}) should NOT have 12.8+ features",
                minor / 10
            );
        }
    }
}