// oxicuda_driver/lib.rs
1//! # OxiCUDA Driver
2//!
3//! **Dynamic, safe Rust bindings for the NVIDIA CUDA Driver API.**
4//!
5//! `oxicuda-driver` provides a zero-SDK-dependency wrapper around the CUDA
6//! Driver API. Unlike traditional CUDA crate approaches that require the
7//! CUDA Toolkit (or at least its headers and link stubs) to be present at
8//! **build time**, this crate loads the driver shared library entirely at
9//! **runtime** via [`libloading`](https://crates.io/crates/libloading).
10//!
11//! ## Zero build-time dependency
12//!
13//! No `cuda.h`, no `libcuda.so` symlink, no `nvcc` — the crate compiles on
14//! any Rust toolchain. The actual GPU driver is discovered and loaded the
15//! first time you call [`try_driver()`] or [`init()`].
16//!
17//! ## Runtime library loading
18//!
19//! | Platform | Library searched |
20//! |----------|-----------------------------|
21//! | Linux | `libcuda.so`, `libcuda.so.1` |
22//! | Windows | `nvcuda.dll` |
23//! | macOS | *(returns `UnsupportedPlatform` — NVIDIA dropped macOS support)* |
24//!
25//! ## Key types
26//!
27//! | Type | Description |
28//! |---------------|------------------------------------------------|
29//! | [`Device`] | A CUDA-capable GPU discovered on the system |
30//! | [`Context`] | Owns a CUDA context bound to a device |
31//! | [`Stream`] | Asynchronous command queue within a context |
32//! | [`Event`] | Timing / synchronisation marker on a stream |
33//! | [`Module`] | Loaded PTX or cubin containing kernel code |
34//! | [`Function`] | A single kernel entry point inside a module |
35//! | [`CudaError`] | Strongly-typed driver error code |
36//!
37//! ## Quick start
38//!
39//! ```rust,no_run
40//! use oxicuda_driver::prelude::*;
41//!
42//! // Initialise the CUDA driver (loads libcuda at runtime).
43//! init()?;
44//!
45//! // Pick the best available GPU and create a context.
46//! let dev = Device::get(0)?;
47//! let _ctx = Context::new(&dev)?;
48//!
49//! // Load a PTX module and look up a kernel.
50//! let module = Module::from_ptx("ptx_source")?;
51//! let kernel = module.get_function("vector_add")?;
52//! # Ok::<(), oxicuda_driver::CudaError>(())
53//! ```
54
55#![warn(missing_docs)]
56#![warn(clippy::all)]
57#![allow(clippy::module_name_repetitions)]
58#![allow(clippy::missing_safety_doc)]
59#![allow(clippy::too_many_arguments)]
60#![allow(clippy::macro_metavars_in_unsafe)]
61
62// ---------------------------------------------------------------------------
63// Module declarations
64// ---------------------------------------------------------------------------
65
66pub mod context;
67pub mod context_config;
68pub mod cooperative_launch;
69pub mod debug;
70pub mod device;
71pub mod error;
72pub mod event;
73pub mod ffi;
74pub mod function_attr;
75pub mod graph;
76pub mod link;
77pub mod loader;
78pub mod memory_info;
79pub mod module;
80pub mod multi_gpu;
81pub mod nvlink_topology;
82pub mod occupancy;
83pub mod occupancy_ext;
84pub mod primary_context;
85pub mod profiler;
86pub mod stream;
87pub mod stream_ordered_alloc;
88pub mod tma;
89
90// ---------------------------------------------------------------------------
91// Re-exports — error handling
92// ---------------------------------------------------------------------------
93
94pub use error::{CudaError, CudaResult, DriverLoadError, check};
95
96// ---------------------------------------------------------------------------
97// Re-exports — FFI types and constants
98// ---------------------------------------------------------------------------
99
100pub use ffi::{
101 CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, CU_TRSF_NORMALIZED_COORDINATES,
102 CU_TRSF_READ_AS_INTEGER, CU_TRSF_SRGB, CUDA_ARRAY_DESCRIPTOR, CUDA_ARRAY3D_CUBEMAP,
103 CUDA_ARRAY3D_DESCRIPTOR, CUDA_ARRAY3D_LAYERED, CUDA_ARRAY3D_SURFACE_LDST,
104 CUDA_ARRAY3D_TEXTURE_GATHER, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
105 CUaddress_mode, CUarray, CUarray_format, CUcontext, CUdevice, CUdevice_attribute, CUdeviceptr,
106 CUevent, CUfilter_mode, CUfunction, CUfunction_attribute, CUjit_option, CUkernel, CUlibrary,
107 CUlimit, CUmemoryPool, CUmemorytype, CUmipmappedArray, CUmodule, CUmulticastObject,
108 CUpointer_attribute, CUresourceViewFormat, CUresourcetype, CUstream, CUsurfObject, CUsurfref,
109 CUtexObject, CUtexref, CuLaunchAttribute, CuLaunchAttributeClusterDim, CuLaunchAttributeId,
110 CuLaunchAttributeValue, CuLaunchConfig, CudaResourceDescArray, CudaResourceDescLinear,
111 CudaResourceDescMipmap, CudaResourceDescPitch2d, CudaResourceDescRes,
112};
113
114// ---------------------------------------------------------------------------
115// Re-exports — high-level safe wrappers
116// ---------------------------------------------------------------------------
117
118pub use context::Context;
119pub use context_config::{CacheConfig, SharedMemConfig};
120pub use cooperative_launch::{
121 CooperativeLaunchConfig, CooperativeLaunchSupport, DeviceLaunchConfig,
122 MultiDeviceCooperativeLaunchConfig, cooperative_launch, cooperative_launch_multi_device,
123};
124pub use debug::{DebugLevel, DebugSession, KernelDebugger, MemoryChecker, NanInfChecker};
125pub use device::{Device, DeviceInfo, best_device, can_access_peer, driver_version, list_devices};
126pub use event::Event;
127pub use graph::{Graph, GraphExec, GraphNode, MemcpyDirection, StreamCapture};
128pub use link::{
129 FallbackStrategy, LinkInputType, LinkedModule, Linker, LinkerOptions, OptimizationLevel,
130};
131pub use loader::try_driver;
132pub use module::{Function, JitDiagnostic, JitLog, JitOptions, JitSeverity, Module};
133pub use multi_gpu::DevicePool;
134pub use nvlink_topology::{GpuTopology, NvLinkVersion, TopologyTree, TopologyType};
135pub use primary_context::PrimaryContext;
136pub use profiler::ProfilerGuard;
137pub use stream::Stream;
138pub use stream_ordered_alloc::{
139 StreamAllocation, StreamMemoryPool, StreamOrderedAllocConfig, stream_alloc, stream_free,
140};
141
142// ---------------------------------------------------------------------------
143// Driver initialisation
144// ---------------------------------------------------------------------------
145
146/// Initialise the CUDA driver API.
147///
148/// This must be called before any other driver function. It is safe to call
149/// multiple times; subsequent calls are no-ops inside the driver itself.
150///
151/// Internally this loads the shared library (if not already cached) and
152/// invokes `cuInit(0)`.
153///
154/// # Errors
155///
156/// Returns [`CudaError::NotInitialized`] if the CUDA driver library cannot be
157/// loaded, or another [`CudaError`] variant if `cuInit` reports a failure.
158pub fn init() -> CudaResult<()> {
159 let driver = loader::try_driver()?;
160 error::check(unsafe { (driver.cu_init)(0) })
161}
162
163// ---------------------------------------------------------------------------
164// Prelude — convenient glob import
165// ---------------------------------------------------------------------------
166
167/// Convenient glob import for common OxiCUDA Driver types.
168///
169/// ```rust
170/// use oxicuda_driver::prelude::*;
171/// ```
172pub mod prelude {
173 pub use crate::{
174 CacheConfig, Context, CooperativeLaunchConfig, CooperativeLaunchSupport, CudaError,
175 CudaResult, DebugLevel, DebugSession, Device, DeviceLaunchConfig, DevicePool, Event,
176 FallbackStrategy, Function, GpuTopology, Graph, GraphExec, GraphNode, KernelDebugger,
177 LinkInputType, LinkedModule, Linker, LinkerOptions, MemcpyDirection, Module,
178 MultiDeviceCooperativeLaunchConfig, NvLinkVersion, OptimizationLevel, PrimaryContext,
179 ProfilerGuard, SharedMemConfig, Stream, StreamAllocation, StreamCapture, StreamMemoryPool,
180 StreamOrderedAllocConfig, TopologyTree, TopologyType, can_access_peer, cooperative_launch,
181 cooperative_launch_multi_device, driver_version, init, stream_alloc, stream_free,
182 try_driver,
183 };
184}
185
186// ---------------------------------------------------------------------------
187// Compile-time feature flags
188// ---------------------------------------------------------------------------
189
/// Compile-time feature availability.
pub mod features {
    /// Whether GPU tests are enabled (`--features gpu-tests`).
    #[cfg(feature = "gpu-tests")]
    pub const HAS_GPU_TESTS: bool = true;

    /// Whether GPU tests are enabled (`--features gpu-tests`).
    #[cfg(not(feature = "gpu-tests"))]
    pub const HAS_GPU_TESTS: bool = false;
}
195
196// ---------------------------------------------------------------------------
197// CPU-only tests for driver infrastructure
198// ---------------------------------------------------------------------------
199
#[cfg(test)]
mod driver_infra_tests {
    // -----------------------------------------------------------------------
    // Task 2 — Multi-threaded context migration (F3)
    //
    // Verifies the thread-safety of the context-stack data structure model
    // using pure Rust primitives. No GPU is required.
    // -----------------------------------------------------------------------

    /// Simulate 4 threads each pushing and popping a "context ID" to/from a
    /// thread-local stack, then verifying all results are collected correctly.
    ///
    /// This exercises the logical structure of context push/pop across threads
    /// (corresponding to `cuCtxPushCurrent` / `cuCtxPopCurrent`) without
    /// needing a real CUDA driver.
    #[test]
    fn context_push_pop_thread_safety() {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let results: Arc<Mutex<Vec<(u32, u32)>>> = Arc::new(Mutex::new(vec![]));
        let mut handles = vec![];

        for thread_id in 0..4u32 {
            let results_clone = Arc::clone(&results);
            let handle = thread::spawn(move || {
                // Each thread simulates pushing two context IDs onto its
                // private stack and then reading the top (most-recently-pushed)
                // context.
                let ctx_id = thread_id * 100;
                let stack: Vec<u32> = vec![ctx_id, ctx_id + 1];
                // Pop semantics: the top of the stack is the last element.
                let top = stack.last().copied().unwrap_or(0);
                let mut r = results_clone.lock().expect("results lock failed");
                r.push((thread_id, top));
            });
            handles.push(handle);
        }

        for h in handles {
            h.join().expect("thread panicked");
        }

        let results = results.lock().expect("final lock failed");
        assert_eq!(results.len(), 4, "all 4 threads must contribute a result");

        // Every thread should have seen `ctx_id + 1` as the top of its stack.
        for &(thread_id, top) in results.iter() {
            let expected_top = thread_id * 100 + 1;
            assert_eq!(
                top, expected_top,
                "thread {thread_id}: expected top {expected_top}, got {top}"
            );
        }
    }

    // -----------------------------------------------------------------------
    // Task 3 — Scope-exit / Drop resource release under OOM (F10)
    //
    // Verifies that Drop impls run correctly even when further allocations
    // fail (simulated OOM), and that Rust's LIFO drop order is preserved.
    // -----------------------------------------------------------------------

    /// `Drop` is invoked for every resource that was successfully constructed,
    /// even when a subsequent allocation would fail (simulated OOM).
    #[test]
    fn drop_counter_tracks_resource_release() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        struct FakeResource {
            dropped: Arc<AtomicUsize>,
        }

        impl Drop for FakeResource {
            fn drop(&mut self) {
                self.dropped.fetch_add(1, Ordering::SeqCst);
            }
        }

        let counter = Arc::new(AtomicUsize::new(0));

        {
            let _r1 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            let _r2 = FakeResource {
                dropped: Arc::clone(&counter),
            };
            // Simulate OOM by not creating r3 — neither r1 nor r2 is dropped yet.
            assert_eq!(
                counter.load(Ordering::SeqCst),
                0,
                "resources must not be dropped before scope exit"
            );
        }

        // After the block ends, both r1 and r2 must have been dropped.
        assert_eq!(
            counter.load(Ordering::SeqCst),
            2,
            "both resources must be dropped at scope exit"
        );
    }

    /// Rust drops local variables in **reverse declaration order** (LIFO).
    /// This test verifies that invariant for RAII guard types.
    #[test]
    fn drop_order_is_lifo() {
        use std::sync::{Arc, Mutex};

        let order: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(vec![]));

        struct Ordered {
            id: u32,
            order: Arc<Mutex<Vec<u32>>>,
        }

        impl Drop for Ordered {
            fn drop(&mut self) {
                self.order.lock().expect("order lock failed").push(self.id);
            }
        }

        {
            let _a = Ordered {
                id: 1,
                order: Arc::clone(&order),
            };
            let _b = Ordered {
                id: 2,
                order: Arc::clone(&order),
            };
            let _c = Ordered {
                id: 3,
                order: Arc::clone(&order),
            };
        }

        let observed = order.lock().expect("final order lock failed");
        assert_eq!(
            *observed,
            vec![3, 2, 1],
            "CUDA RAII guards must be released in LIFO order"
        );
    }

    // -----------------------------------------------------------------------
    // Task 4 — Driver version negotiation (NVIDIA Driver 525 / 535 / 550 / 560)
    //
    // `cuDriverGetVersion` returns the CUDA version encoded as
    // `major * 1000 + minor * 10` (e.g. CUDA 12.4 → 12040), so the remainder
    // modulo 1000 is ten times the minor version. These tests verify the
    // parsing logic and the version-gating conditions used throughout OxiCUDA
    // without requiring a real driver.
    // -----------------------------------------------------------------------

    /// Split a `cuDriverGetVersion`-style integer into `(major, remainder)`,
    /// where `remainder` is ten times the minor version (CUDA 12.2 → `(12, 20)`).
    ///
    /// Shared by all version-parsing tests so the decoding arithmetic is
    /// written — and therefore validated — in exactly one place.
    fn split_cuda_version(cuda_version: i32) -> (i32, i32) {
        (cuda_version / 1000, cuda_version % 1000)
    }

    /// NVIDIA Driver 525 ships with CUDA 12.0. Verify the parse of 12000.
    #[test]
    fn driver_version_parsing_cuda_12_0() {
        // cuDriverGetVersion returns 12000 for CUDA 12.0 (driver 525).
        let (major, minor) = split_cuda_version(12000);
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 0, "minor version mismatch");
    }

    /// NVIDIA Driver 535 ships with CUDA 12.2. Verify the parse of 12020.
    #[test]
    fn driver_version_parsing_cuda_12_2() {
        let (major, minor) = split_cuda_version(12020);
        assert_eq!(major, 12);
        assert_eq!(minor, 20);
    }

    /// NVIDIA Driver 550 ships with CUDA 12.4. Verify the parse of 12040.
    #[test]
    fn driver_version_parsing_cuda_12_4() {
        let (major, minor) = split_cuda_version(12040);
        assert_eq!(major, 12);
        assert_eq!(minor, 40);
    }

    /// NVIDIA Driver 560 ships with CUDA 12.6. Verify the parse of 12060.
    #[test]
    fn driver_version_parsing_cuda_12_6() {
        let (major, minor) = split_cuda_version(12060);
        assert_eq!(major, 12);
        assert_eq!(minor, 60);
    }

    /// OxiCUDA requires CUDA 11.2+ (`cuMemAllocAsync` availability).
    /// Verify that the set of supported versions all meet the minimum and
    /// that older versions are correctly rejected.
    #[test]
    fn driver_version_minimum_requirement() {
        // cuMemAllocAsync was introduced in CUDA 11.2 (version integer 11020).
        let min_required: i32 = 11020;

        let supported: [i32; 5] = [11020, 11040, 12000, 12060, 12080];
        for v in supported {
            assert!(
                v >= min_required,
                "CUDA version {v} should be supported (>= {min_required})"
            );
        }

        let too_old: [i32; 2] = [10020, 11010];
        for v in too_old {
            assert!(
                v < min_required,
                "CUDA version {v} should NOT be supported (< {min_required})"
            );
        }
    }

    /// CUDA 12.8 (version 12080) introduces `cuMemcpyBatchAsync`.
    /// Verify the feature-gating arithmetic.
    #[test]
    fn driver_cuda_12_8_features_available() {
        // 12.8 → 12080
        let cuda_128: i32 = 12080;
        assert!(
            cuda_128 >= 12080,
            "CUDA 12.8 must support cuMemcpyBatchAsync"
        );

        // 12.0 does not have it.
        let cuda_120: i32 = 12000;
        assert!(
            cuda_120 < 12080,
            "CUDA 12.0 must NOT support cuMemcpyBatchAsync"
        );
    }

    /// Verify the complete NVIDIA-driver-version → CUDA-version mapping used
    /// in OxiCUDA's version negotiation table.
    #[test]
    fn driver_nvidia_to_cuda_version_mapping() {
        // (nvidia_driver, expected_cuda_version_int)
        let mapping: [(u32, i32); 4] = [
            (525, 12000), // Driver 525 → CUDA 12.0
            (535, 12020), // Driver 535 → CUDA 12.2
            (550, 12040), // Driver 550 → CUDA 12.4
            (560, 12060), // Driver 560 → CUDA 12.6
        ];

        for (nvidia_driver, cuda_version) in mapping {
            let (major, minor) = split_cuda_version(cuda_version);
            // Sanity: all are CUDA 12.x
            assert_eq!(major, 12, "driver {nvidia_driver}: expected CUDA 12.x");
            // Minor must be a multiple of 10 (CUDA minor encoding)
            assert_eq!(
                minor % 10,
                0,
                "driver {nvidia_driver}: minor {minor} is not a multiple of 10"
            );
            // CUDA 12.8+ features require version >= 12080
            let has_12_8_features = cuda_version >= 12080;
            assert!(
                !has_12_8_features,
                "driver {nvidia_driver} (CUDA {major}.{:02}) should NOT have 12.8+ features",
                minor / 10
            );
        }
    }
}