//! # OxiCUDA Driver
//!
//! **Dynamic, safe Rust bindings for the NVIDIA CUDA Driver API.**
//!
//! `oxicuda-driver` provides a zero-SDK-dependency wrapper around the CUDA
//! Driver API.  Unlike traditional CUDA crate approaches that require the
//! CUDA Toolkit (or at least its headers and link stubs) to be present at
//! **build time**, this crate loads the driver shared library entirely at
//! **runtime** via [`libloading`](https://crates.io/crates/libloading).
//!
//! ## Zero build-time dependency
//!
//! No `cuda.h`, no `libcuda.so` symlink, no `nvcc` — the crate compiles on
//! any Rust toolchain.  The actual GPU driver is discovered and loaded the
//! first time you call [`try_driver()`] or [`init()`].
//!
//! ## Runtime library loading
//!
//! | Platform | Library searched             |
//! |----------|------------------------------|
//! | Linux    | `libcuda.so`, `libcuda.so.1` |
//! | Windows  | `nvcuda.dll`                 |
//! | macOS    | *(returns `UnsupportedPlatform` — NVIDIA dropped macOS support)* |
//!
//! ## Key types
//!
//! | Type          | Description                                    |
//! |---------------|------------------------------------------------|
//! | [`Device`]    | A CUDA-capable GPU discovered on the system    |
//! | [`Context`]   | Owns a CUDA context bound to a device          |
//! | [`Stream`]    | Asynchronous command queue within a context    |
//! | [`Event`]     | Timing / synchronisation marker on a stream    |
//! | [`Module`]    | Loaded PTX or cubin containing kernel code     |
//! | [`Function`]  | A single kernel entry point inside a module    |
//! | [`CudaError`] | Strongly-typed driver error code               |
//!
//! ## Quick start
//!
//! ```rust,no_run
//! use oxicuda_driver::prelude::*;
//!
//! // Initialise the CUDA driver (loads libcuda at runtime).
//! init()?;
//!
//! // Pick the best available GPU and create a context.
//! let dev = Device::get(0)?;
//! let _ctx = Context::new(&dev)?;
//!
//! // Load a PTX module and look up a kernel.
//! let module = Module::from_ptx("ptx_source")?;
//! let kernel = module.get_function("vector_add")?;
//! # Ok::<(), oxicuda_driver::CudaError>(())
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_safety_doc)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::macro_metavars_in_unsafe)]
// ---------------------------------------------------------------------------
// Module declarations
// ---------------------------------------------------------------------------

pub mod context;
pub mod context_config;
pub mod cooperative_launch;
pub mod debug;
pub mod device;
pub mod error;
pub mod event;
pub mod ffi;
pub mod function_attr;
pub mod graph;
pub mod link;
pub mod loader;
pub mod memory_info;
pub mod module;
pub mod multi_gpu;
pub mod nvlink_topology;
pub mod occupancy;
pub mod occupancy_ext;
pub mod primary_context;
pub mod profiler;
pub mod stream;
pub mod stream_ordered_alloc;
pub mod tma;

// ---------------------------------------------------------------------------
// Re-exports — error handling
// ---------------------------------------------------------------------------

pub use error::{CudaError, CudaResult, DriverLoadError, check};

// ---------------------------------------------------------------------------
// Re-exports — FFI types and constants
// ---------------------------------------------------------------------------

pub use ffi::{
    CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, CU_TRSF_NORMALIZED_COORDINATES,
    CU_TRSF_READ_AS_INTEGER, CU_TRSF_SRGB, CUDA_ARRAY_DESCRIPTOR, CUDA_ARRAY3D_CUBEMAP,
    CUDA_ARRAY3D_DESCRIPTOR, CUDA_ARRAY3D_LAYERED, CUDA_ARRAY3D_SURFACE_LDST,
    CUDA_ARRAY3D_TEXTURE_GATHER, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
    CUaddress_mode, CUarray, CUarray_format, CUcontext, CUdevice, CUdevice_attribute, CUdeviceptr,
    CUevent, CUfilter_mode, CUfunction, CUfunction_attribute, CUjit_option, CUkernel, CUlibrary,
    CUlimit, CUmemoryPool, CUmemorytype, CUmipmappedArray, CUmodule, CUmulticastObject,
    CUpointer_attribute, CUresourceViewFormat, CUresourcetype, CUstream, CUsurfObject, CUsurfref,
    CUtexObject, CUtexref, CuLaunchAttribute, CuLaunchAttributeClusterDim, CuLaunchAttributeId,
    CuLaunchAttributeValue, CuLaunchConfig, CudaResourceDescArray, CudaResourceDescLinear,
    CudaResourceDescMipmap, CudaResourceDescPitch2d, CudaResourceDescRes,
};

// ---------------------------------------------------------------------------
// Re-exports — high-level safe wrappers
// ---------------------------------------------------------------------------

pub use context::Context;
pub use context_config::{CacheConfig, SharedMemConfig};
pub use cooperative_launch::{
    CooperativeLaunchConfig, CooperativeLaunchSupport, DeviceLaunchConfig,
    MultiDeviceCooperativeLaunchConfig, cooperative_launch, cooperative_launch_multi_device,
};
pub use debug::{DebugLevel, DebugSession, KernelDebugger, MemoryChecker, NanInfChecker};
pub use device::{Device, DeviceInfo, best_device, can_access_peer, driver_version, list_devices};
pub use event::Event;
pub use graph::{Graph, GraphExec, GraphNode, MemcpyDirection, StreamCapture};
pub use link::{
    FallbackStrategy, LinkInputType, LinkedModule, Linker, LinkerOptions, OptimizationLevel,
};
pub use loader::try_driver;
pub use module::{Function, JitDiagnostic, JitLog, JitOptions, JitSeverity, Module};
pub use multi_gpu::DevicePool;
pub use nvlink_topology::{GpuTopology, NvLinkVersion, TopologyTree, TopologyType};
pub use primary_context::PrimaryContext;
pub use profiler::ProfilerGuard;
pub use stream::Stream;
pub use stream_ordered_alloc::{
    StreamAllocation, StreamMemoryPool, StreamOrderedAllocConfig, stream_alloc, stream_free,
};

// ---------------------------------------------------------------------------
// Driver initialisation
// ---------------------------------------------------------------------------

146/// Initialise the CUDA driver API.
147///
148/// This must be called before any other driver function.  It is safe to call
149/// multiple times; subsequent calls are no-ops inside the driver itself.
150///
151/// Internally this loads the shared library (if not already cached) and
152/// invokes `cuInit(0)`.
153///
154/// # Errors
155///
156/// Returns [`CudaError::NotInitialized`] if the CUDA driver library cannot be
157/// loaded, or another [`CudaError`] variant if `cuInit` reports a failure.
158pub fn init() -> CudaResult<()> {
159    let driver = loader::try_driver()?;
160    error::check(unsafe { (driver.cu_init)(0) })
161}

// ---------------------------------------------------------------------------
// Prelude — convenient glob import
// ---------------------------------------------------------------------------

167/// Convenient glob import for common OxiCUDA Driver types.
168///
169/// ```rust
170/// use oxicuda_driver::prelude::*;
171/// ```
172pub mod prelude {
173    pub use crate::{
174        CacheConfig, Context, CooperativeLaunchConfig, CooperativeLaunchSupport, CudaError,
175        CudaResult, DebugLevel, DebugSession, Device, DeviceLaunchConfig, DevicePool, Event,
176        FallbackStrategy, Function, GpuTopology, Graph, GraphExec, GraphNode, KernelDebugger,
177        LinkInputType, LinkedModule, Linker, LinkerOptions, MemcpyDirection, Module,
178        MultiDeviceCooperativeLaunchConfig, NvLinkVersion, OptimizationLevel, PrimaryContext,
179        ProfilerGuard, SharedMemConfig, Stream, StreamAllocation, StreamCapture, StreamMemoryPool,
180        StreamOrderedAllocConfig, TopologyTree, TopologyType, can_access_peer, cooperative_launch,
181        cooperative_launch_multi_device, driver_version, init, stream_alloc, stream_free,
182        try_driver,
183    };
184}

// ---------------------------------------------------------------------------
// Compile-time feature flags
// ---------------------------------------------------------------------------

/// Compile-time feature availability.
pub mod features {
    /// `true` when the crate was built with `--features gpu-tests`, i.e.
    /// the GPU-requiring portion of the test suite is compiled in.
    pub const HAS_GPU_TESTS: bool = cfg!(feature = "gpu-tests");
}

// ---------------------------------------------------------------------------
// CPU-only tests for driver infrastructure
// ---------------------------------------------------------------------------

#[cfg(test)]
mod driver_infra_tests {
    //! CPU-only tests for driver infrastructure.  No GPU or real CUDA driver
    //! is required; these validate the pure-Rust logic models the crate
    //! relies on (context-stack threading, RAII drop order, and the
    //! `cuDriverGetVersion` arithmetic used for feature gating).

    // -----------------------------------------------------------------------
    // Shared helpers — version arithmetic used by several tests below.
    //
    // Previously each test re-derived `major = v / 1000; minor = v % 1000`
    // and compared magic numbers against themselves; centralising the
    // arithmetic makes the tests exercise one shared definition.
    // -----------------------------------------------------------------------

    /// Minimum supported CUDA version in `cuDriverGetVersion` encoding:
    /// CUDA 11.2 (11020), where `cuMemAllocAsync` was introduced.
    const MIN_SUPPORTED_CUDA: i32 = 11020;

    /// CUDA version that introduced `cuMemcpyBatchAsync`: CUDA 12.8 (12080).
    const CUDA_12_8: i32 = 12080;

    /// Split a `cuDriverGetVersion`-style integer (`major * 1000 + minor`)
    /// into its `(major, minor)` components.
    fn split_cuda_version(version: i32) -> (i32, i32) {
        (version / 1000, version % 1000)
    }

    /// Feature gate: `cuMemcpyBatchAsync` is available from CUDA 12.8 on.
    fn has_memcpy_batch_async(cuda_version: i32) -> bool {
        cuda_version >= CUDA_12_8
    }

    // -----------------------------------------------------------------------
    // Task 2 — Multi-threaded context migration (F3)
    //
    // Verifies the thread-safety of the context-stack data structure model
    // using pure Rust primitives.  No GPU is required.
    // -----------------------------------------------------------------------

    /// Simulate 4 threads each pushing and popping a "context ID" to/from a
    /// thread-local stack, then verifying all results are collected correctly.
    ///
    /// This exercises the logical structure of context push/pop across threads
    /// (corresponding to `cuCtxPushCurrent` / `cuCtxPopCurrent`) without
    /// needing a real CUDA driver.
    #[test]
    fn context_push_pop_thread_safety() {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let results: Arc<Mutex<Vec<(u32, u32)>>> = Arc::new(Mutex::new(vec![]));
        let mut handles = vec![];

        for thread_id in 0..4u32 {
            let results_clone = Arc::clone(&results);
            let handle = thread::spawn(move || {
                // Each thread simulates pushing two context IDs onto its
                // private stack and then reading the top (most-recently-pushed)
                // context.
                let ctx_id = thread_id * 100;
                let stack: Vec<u32> = vec![ctx_id, ctx_id + 1];
                // Pop semantics: the top of the stack is the last element.
                let top = stack.last().copied().unwrap_or(0);
                let mut r = results_clone.lock().expect("results lock failed");
                r.push((thread_id, top));
            });
            handles.push(handle);
        }

        for h in handles {
            h.join().expect("thread panicked");
        }

        let results = results.lock().expect("final lock failed");
        assert_eq!(results.len(), 4, "all 4 threads must contribute a result");

        // Every thread should have seen `ctx_id + 1` as the top of its stack.
        for &(thread_id, top) in results.iter() {
            let expected_top = thread_id * 100 + 1;
            assert_eq!(
                top, expected_top,
                "thread {thread_id}: expected top {expected_top}, got {top}"
            );
        }
    }

    // -----------------------------------------------------------------------
    // Task 3 — Scope-exit / Drop resource release under OOM (F10)
    //
    // Verifies that Drop impls run correctly even when further allocations
    // fail (simulated OOM), and that Rust's LIFO drop order is preserved.
    // -----------------------------------------------------------------------

    /// `Drop` is invoked for every resource that was successfully constructed,
    /// even when a subsequent allocation would fail (simulated OOM).
    #[test]
    fn drop_counter_tracks_resource_release() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        struct FakeResource {
            dropped: Arc<AtomicUsize>,
        }

        impl Drop for FakeResource {
            fn drop(&mut self) {
                self.dropped.fetch_add(1, Ordering::SeqCst);
            }
        }

        let counter = Arc::new(AtomicUsize::new(0));

        {
            let _r1 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            let _r2 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            // Simulate OOM by not creating r3 — neither r1 nor r2 is dropped yet.
            assert_eq!(
                counter.load(Ordering::SeqCst),
                0,
                "resources must not be dropped before scope exit"
            );
        }

        // After the block ends, both r1 and r2 must have been dropped.
        assert_eq!(
            counter.load(Ordering::SeqCst),
            2,
            "both resources must be dropped at scope exit"
        );
    }

    /// Rust drops local variables in **reverse declaration order** (LIFO).
    /// This test verifies that invariant for RAII guard types.
    #[test]
    fn drop_order_is_lifo() {
        use std::sync::{Arc, Mutex};

        let order: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(vec![]));

        struct Ordered {
            id: u32,
            order: Arc<Mutex<Vec<u32>>>,
        }

        impl Drop for Ordered {
            fn drop(&mut self) {
                self.order.lock().expect("order lock failed").push(self.id);
            }
        }

        {
            let _a = Ordered {
                id: 1,
                order: Arc::clone(&order),
            };
            let _b = Ordered {
                id: 2,
                order: Arc::clone(&order),
            };
            let _c = Ordered {
                id: 3,
                order: Arc::clone(&order),
            };
        }

        let observed = order.lock().expect("final order lock failed");
        assert_eq!(
            *observed,
            vec![3, 2, 1],
            "CUDA RAII guards must be released in LIFO order"
        );
    }

    // -----------------------------------------------------------------------
    // Task 4 — Driver version negotiation (NVIDIA Driver 525 / 535 / 550 / 560)
    //
    // `cuDriverGetVersion` returns the CUDA version as `major * 1000 + minor`.
    // These tests verify the parsing logic and the version-gating conditions
    // used throughout OxiCUDA without requiring a real driver.
    // -----------------------------------------------------------------------

    /// NVIDIA Driver 525 ships with CUDA 12.0.  Verify the parse of 12000.
    #[test]
    fn driver_version_parsing_cuda_12_0() {
        let (major, minor) = split_cuda_version(12000);
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 0, "minor version mismatch");
    }

    /// NVIDIA Driver 535 ships with CUDA 12.2.  Verify the parse of 12020.
    #[test]
    fn driver_version_parsing_cuda_12_2() {
        let (major, minor) = split_cuda_version(12020);
        assert_eq!(major, 12);
        assert_eq!(minor, 20);
    }

    /// NVIDIA Driver 550 ships with CUDA 12.4.  Verify the parse of 12040.
    #[test]
    fn driver_version_parsing_cuda_12_4() {
        let (major, minor) = split_cuda_version(12040);
        assert_eq!(major, 12);
        assert_eq!(minor, 40);
    }

    /// NVIDIA Driver 560 ships with CUDA 12.6.  Verify the parse of 12060.
    #[test]
    fn driver_version_parsing_cuda_12_6() {
        let (major, minor) = split_cuda_version(12060);
        assert_eq!(major, 12);
        assert_eq!(minor, 60);
    }

    /// OxiCUDA requires CUDA 11.2+ (`cuMemAllocAsync` availability).
    /// Verify that the set of supported versions all meet the minimum and
    /// that older versions are correctly rejected.
    #[test]
    fn driver_version_minimum_requirement() {
        let supported: [i32; 5] = [11020, 11040, 12000, 12060, 12080];
        for v in supported {
            assert!(
                v >= MIN_SUPPORTED_CUDA,
                "CUDA version {v} should be supported (>= {MIN_SUPPORTED_CUDA})"
            );
        }

        let too_old: [i32; 2] = [10020, 11010];
        for v in too_old {
            assert!(
                v < MIN_SUPPORTED_CUDA,
                "CUDA version {v} should NOT be supported (< {MIN_SUPPORTED_CUDA})"
            );
        }
    }

    /// CUDA 12.8 (version 12080) introduces `cuMemcpyBatchAsync`.
    /// Verify the feature-gating predicate at the boundary.
    #[test]
    fn driver_cuda_12_8_features_available() {
        assert!(
            has_memcpy_batch_async(12080),
            "CUDA 12.8 must support cuMemcpyBatchAsync"
        );
        assert!(
            !has_memcpy_batch_async(12000),
            "CUDA 12.0 must NOT support cuMemcpyBatchAsync"
        );
    }

    /// Verify the complete NVIDIA-driver-version → CUDA-version mapping used
    /// in OxiCUDA's version negotiation table.
    #[test]
    fn driver_nvidia_to_cuda_version_mapping() {
        // (nvidia_driver, expected_cuda_version_int)
        let mapping: [(u32, i32); 4] = [
            (525, 12000), // Driver 525  → CUDA 12.0
            (535, 12020), // Driver 535  → CUDA 12.2
            (550, 12040), // Driver 550  → CUDA 12.4
            (560, 12060), // Driver 560  → CUDA 12.6
        ];

        for (nvidia_driver, cuda_version) in mapping {
            let (major, minor) = split_cuda_version(cuda_version);
            // Sanity: all are CUDA 12.x
            assert_eq!(major, 12, "driver {nvidia_driver}: expected CUDA 12.x");
            // Minor must be a multiple of 10 (CUDA minor encoding)
            assert_eq!(
                minor % 10,
                0,
                "driver {nvidia_driver}: minor {minor} is not a multiple of 10"
            );
            // None of the table entries reach the CUDA 12.8 feature level.
            assert!(
                !has_memcpy_batch_async(cuda_version),
                "driver {nvidia_driver} (CUDA {major}.{:02}) should NOT have 12.8+ features",
                minor / 10
            );
        }
    }
}