// oxicuda-driver 0.1.1
//
// OxiCUDA Driver - Dynamic CUDA driver API wrapper via libloading (zero SDK dependency)

//! # OxiCUDA Driver
//!
//! **Dynamic, safe Rust bindings for the NVIDIA CUDA Driver API.**
//!
//! `oxicuda-driver` provides a zero-SDK-dependency wrapper around the CUDA
//! Driver API.  Unlike traditional CUDA crate approaches that require the
//! CUDA Toolkit (or at least its headers and link stubs) to be present at
//! **build time**, this crate loads the driver shared library entirely at
//! **runtime** via [`libloading`](https://crates.io/crates/libloading).
//!
//! ## Zero build-time dependency
//!
//! No `cuda.h`, no `libcuda.so` symlink, no `nvcc` — the crate compiles on
//! any Rust toolchain.  The actual GPU driver is discovered and loaded the
//! first time you call [`try_driver()`] or [`init()`].
//!
//! ## Runtime library loading
//!
//! | Platform | Library searched             |
//! |----------|-----------------------------|
//! | Linux    | `libcuda.so`, `libcuda.so.1` |
//! | Windows  | `nvcuda.dll`                 |
//! | macOS    | *(returns `UnsupportedPlatform` — NVIDIA dropped macOS support)* |
//!
//! ## Key types
//!
//! | Type          | Description                                    |
//! |---------------|------------------------------------------------|
//! | [`Device`]    | A CUDA-capable GPU discovered on the system    |
//! | [`Context`]   | Owns a CUDA context bound to a device          |
//! | [`Stream`]    | Asynchronous command queue within a context    |
//! | [`Event`]     | Timing / synchronisation marker on a stream    |
//! | [`Module`]    | Loaded PTX or cubin containing kernel code     |
//! | [`Function`]  | A single kernel entry point inside a module    |
//! | [`CudaError`] | Strongly-typed driver error code               |
//!
//! ## Quick start
//!
//! ```rust,no_run
//! use oxicuda_driver::prelude::*;
//!
//! // Initialise the CUDA driver (loads libcuda at runtime).
//! init()?;
//!
//! // Use device 0 and create a context on it.
//! let dev = Device::get(0)?;
//! let _ctx = Context::new(&dev)?;
//!
//! // Load a PTX module and look up a kernel.
//! let module = Module::from_ptx("ptx_source")?;
//! let kernel = module.get_function("vector_add")?;
//! # Ok::<(), oxicuda_driver::CudaError>(())
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_safety_doc)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::macro_metavars_in_unsafe)]

// ---------------------------------------------------------------------------
// Module declarations
// ---------------------------------------------------------------------------

pub mod context;
pub mod context_config;
pub mod cooperative_launch;
pub mod debug;
pub mod device;
pub mod error;
pub mod event;
pub mod ffi;
pub mod function_attr;
pub mod graph;
pub mod link;
pub mod loader;
pub mod memory_info;
pub mod module;
pub mod multi_gpu;
pub mod nvlink_topology;
pub mod occupancy;
pub mod occupancy_ext;
pub mod primary_context;
pub mod profiler;
pub mod stream;
pub mod stream_ordered_alloc;
pub mod tma;

// ---------------------------------------------------------------------------
// Re-exports — error handling
// ---------------------------------------------------------------------------

pub use error::{CudaError, CudaResult, DriverLoadError, check};

// ---------------------------------------------------------------------------
// Re-exports — FFI types and constants
// ---------------------------------------------------------------------------

pub use ffi::{
    CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, CU_TRSF_NORMALIZED_COORDINATES,
    CU_TRSF_READ_AS_INTEGER, CU_TRSF_SRGB, CUDA_ARRAY_DESCRIPTOR, CUDA_ARRAY3D_CUBEMAP,
    CUDA_ARRAY3D_DESCRIPTOR, CUDA_ARRAY3D_LAYERED, CUDA_ARRAY3D_SURFACE_LDST,
    CUDA_ARRAY3D_TEXTURE_GATHER, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
    CUaddress_mode, CUarray, CUarray_format, CUcontext, CUdevice, CUdevice_attribute, CUdeviceptr,
    CUevent, CUfilter_mode, CUfunction, CUfunction_attribute, CUjit_option, CUkernel, CUlibrary,
    CUlimit, CUmemoryPool, CUmemorytype, CUmipmappedArray, CUmodule, CUmulticastObject,
    CUpointer_attribute, CUresourceViewFormat, CUresourcetype, CUstream, CUsurfObject, CUsurfref,
    CUtexObject, CUtexref, CuLaunchAttribute, CuLaunchAttributeClusterDim, CuLaunchAttributeId,
    CuLaunchAttributeValue, CuLaunchConfig, CudaResourceDescArray, CudaResourceDescLinear,
    CudaResourceDescMipmap, CudaResourceDescPitch2d, CudaResourceDescRes,
};

// ---------------------------------------------------------------------------
// Re-exports — high-level safe wrappers
// ---------------------------------------------------------------------------

pub use context::Context;
pub use context_config::{CacheConfig, SharedMemConfig};
pub use cooperative_launch::{
    CooperativeLaunchConfig, CooperativeLaunchSupport, DeviceLaunchConfig,
    MultiDeviceCooperativeLaunchConfig, cooperative_launch, cooperative_launch_multi_device,
};
pub use debug::{DebugLevel, DebugSession, KernelDebugger, MemoryChecker, NanInfChecker};
pub use device::{Device, DeviceInfo, best_device, can_access_peer, driver_version, list_devices};
pub use event::Event;
pub use graph::{Graph, GraphExec, GraphNode, MemcpyDirection, StreamCapture};
pub use link::{
    FallbackStrategy, LinkInputType, LinkedModule, Linker, LinkerOptions, OptimizationLevel,
};
pub use loader::try_driver;
pub use module::{Function, JitDiagnostic, JitLog, JitOptions, JitSeverity, Module};
pub use multi_gpu::DevicePool;
pub use nvlink_topology::{GpuTopology, NvLinkVersion, TopologyTree, TopologyType};
pub use primary_context::PrimaryContext;
pub use profiler::ProfilerGuard;
pub use stream::Stream;
pub use stream_ordered_alloc::{
    StreamAllocation, StreamMemoryPool, StreamOrderedAllocConfig, stream_alloc, stream_free,
};

// ---------------------------------------------------------------------------
// Driver initialisation
// ---------------------------------------------------------------------------

/// Initialise the CUDA driver API.
///
/// This must be called before any other driver function.  It is safe to call
/// multiple times; subsequent calls are no-ops inside the driver itself.
///
/// Internally this loads the shared library (if not already cached) and
/// invokes `cuInit(0)`.
///
/// # Errors
///
/// Returns [`CudaError::NotInitialized`] if the CUDA driver library cannot be
/// loaded, or another [`CudaError`] variant if `cuInit` reports a failure.
pub fn init() -> CudaResult<()> {
    let driver = loader::try_driver()?;
    error::check(unsafe { (driver.cu_init)(0) })
}

// ---------------------------------------------------------------------------
// Prelude — convenient glob import
// ---------------------------------------------------------------------------

/// Convenient glob import for common OxiCUDA Driver types.
///
/// ```rust
/// use oxicuda_driver::prelude::*;
/// ```
pub mod prelude {
    pub use crate::{
        CacheConfig, Context, CooperativeLaunchConfig, CooperativeLaunchSupport, CudaError,
        CudaResult, DebugLevel, DebugSession, Device, DeviceLaunchConfig, DevicePool, Event,
        FallbackStrategy, Function, GpuTopology, Graph, GraphExec, GraphNode, KernelDebugger,
        LinkInputType, LinkedModule, Linker, LinkerOptions, MemcpyDirection, Module,
        MultiDeviceCooperativeLaunchConfig, NvLinkVersion, OptimizationLevel, PrimaryContext,
        ProfilerGuard, SharedMemConfig, Stream, StreamAllocation, StreamCapture, StreamMemoryPool,
        StreamOrderedAllocConfig, TopologyTree, TopologyType, can_access_peer, cooperative_launch,
        cooperative_launch_multi_device, driver_version, init, stream_alloc, stream_free,
        try_driver,
    };
}

// ---------------------------------------------------------------------------
// Compile-time feature flags
// ---------------------------------------------------------------------------

/// Build-time feature switches, exposed as constants.
pub mod features {
    /// `true` when the crate is built with the `gpu-tests` feature
    /// (`--features gpu-tests`), `false` otherwise.
    #[cfg(feature = "gpu-tests")]
    pub const HAS_GPU_TESTS: bool = true;

    /// `true` when the crate is built with the `gpu-tests` feature
    /// (`--features gpu-tests`), `false` otherwise.
    #[cfg(not(feature = "gpu-tests"))]
    pub const HAS_GPU_TESTS: bool = false;
}

// ---------------------------------------------------------------------------
// CPU-only tests for driver infrastructure
// ---------------------------------------------------------------------------

#[cfg(test)]
mod driver_infra_tests {
    // -----------------------------------------------------------------------
    // Task 2 — Multi-threaded context migration (F3)
    //
    // Verifies the thread-safety of the context-stack data structure model
    // using pure Rust primitives.  No GPU is required.
    // -----------------------------------------------------------------------

    /// Spawn four threads that each build a private two-entry "context stack",
    /// read its top entry, and report `(thread_id, top)` through a shared,
    /// mutex-protected vector.
    ///
    /// This models the per-thread shape of `cuCtxPushCurrent` /
    /// `cuCtxPopCurrent` usage with plain Rust data only; no CUDA driver is
    /// touched.
    #[test]
    fn context_push_pop_thread_safety() {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let collected: Arc<Mutex<Vec<(u32, u32)>>> = Arc::new(Mutex::new(Vec::new()));

        // `collect` drives the iterator to completion, so all four threads are
        // spawned before the first `join` below.
        let workers: Vec<_> = (0..4u32)
            .map(|thread_id| {
                let sink = Arc::clone(&collected);
                thread::spawn(move || {
                    // Two simulated context IDs; the last element is the top
                    // of the stack.
                    let base = thread_id * 100;
                    let stack: Vec<u32> = vec![base, base + 1];
                    let top = stack.last().copied().unwrap_or(0);
                    sink.lock()
                        .expect("results lock failed")
                        .push((thread_id, top));
                })
            })
            .collect();

        for worker in workers {
            worker.join().expect("thread panicked");
        }

        let collected = collected.lock().expect("final lock failed");
        assert_eq!(collected.len(), 4, "all 4 threads must contribute a result");

        // Each thread must have observed `base + 1` on top of its own stack.
        for &(thread_id, top) in collected.iter() {
            let expected_top = thread_id * 100 + 1;
            assert_eq!(
                top, expected_top,
                "thread {thread_id}: expected top {expected_top}, got {top}"
            );
        }
    }

    // -----------------------------------------------------------------------
    // Task 3 — Scope-exit / Drop resource release under OOM (F10)
    //
    // Verifies that Drop impls run correctly even when further allocations
    // fail (simulated OOM), and that Rust's LIFO drop order is preserved.
    // -----------------------------------------------------------------------

    /// Every resource that was successfully constructed has its `Drop` run at
    /// scope exit, even when a later allocation never happens (the stand-in
    /// for an out-of-memory failure here).
    #[test]
    fn drop_counter_tracks_resource_release() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        /// Stand-in for a driver resource; bumps a shared counter when dropped.
        struct FakeResource {
            dropped: Arc<AtomicUsize>,
        }

        impl Drop for FakeResource {
            fn drop(&mut self) {
                self.dropped.fetch_add(1, Ordering::SeqCst);
            }
        }

        let drops = Arc::new(AtomicUsize::new(0));
        let acquire = || FakeResource {
            dropped: Arc::clone(&drops),
        };

        {
            let _first = acquire();
            let _second = acquire();
            // A third resource is deliberately never created (simulated OOM);
            // the two live ones must still be intact at this point.
            let seen_inside = drops.load(Ordering::SeqCst);
            assert_eq!(
                seen_inside, 0,
                "resources must not be dropped before scope exit"
            );
        }

        // Leaving the block must have released both resources.
        let seen_after = drops.load(Ordering::SeqCst);
        assert_eq!(
            seen_after, 2,
            "both resources must be dropped at scope exit"
        );
    }

    /// Locals are dropped in the reverse of their declaration order (LIFO).
    /// Three guards record their IDs on drop, and the recorded sequence is
    /// checked after the scope ends.
    #[test]
    fn drop_order_is_lifo() {
        use std::sync::{Arc, Mutex};

        /// Guard that appends its `id` to the shared log when dropped.
        struct Ordered {
            id: u32,
            order: Arc<Mutex<Vec<u32>>>,
        }

        impl Drop for Ordered {
            fn drop(&mut self) {
                self.order.lock().expect("order lock failed").push(self.id);
            }
        }

        let log: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(Vec::new()));
        let guard = |id: u32| Ordered {
            id,
            order: Arc::clone(&log),
        };

        {
            let _a = guard(1);
            let _b = guard(2);
            let _c = guard(3);
        }

        let observed = log.lock().expect("final order lock failed");
        assert_eq!(
            *observed,
            [3, 2, 1],
            "CUDA RAII guards must be released in LIFO order"
        );
    }

    // -----------------------------------------------------------------------
    // Task 4 — Driver version negotiation (NVIDIA Driver 525 / 535 / 550 / 560)
    //
    // `cuDriverGetVersion` returns the CUDA version as `major * 1000 + minor * 10`.
    // These tests verify the parsing logic and the version-gating conditions
    // used throughout OxiCUDA without requiring a real driver.
    // -----------------------------------------------------------------------

    /// NVIDIA Driver 525 ships with CUDA 12.0.  Verify the decode of 12000.
    ///
    /// `cuDriverGetVersion` encodes the version as `1000 * major + 10 * minor`,
    /// so the minor is the remainder divided by 10.
    #[test]
    fn driver_version_parsing_cuda_12_0() {
        // cuDriverGetVersion returns 12000 for CUDA 12.0 (driver 525).
        let cuda_version: i32 = 12000;
        let major = cuda_version / 1000;
        // The remainder holds `10 * minor`; divide to recover the minor itself.
        let minor = (cuda_version % 1000) / 10;
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 0, "minor version mismatch");
    }

    /// NVIDIA Driver 535 ships with CUDA 12.2.  Verify the decode of 12020.
    ///
    /// `cuDriverGetVersion` encodes the version as `1000 * major + 10 * minor`,
    /// so 12020 must decode to major 12, minor 2 (not minor 20).
    #[test]
    fn driver_version_parsing_cuda_12_2() {
        let cuda_version: i32 = 12020;
        let major = cuda_version / 1000;
        // The remainder holds `10 * minor`; divide to recover the minor itself.
        let minor = (cuda_version % 1000) / 10;
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 2, "minor version mismatch");
    }

    /// NVIDIA Driver 550 ships with CUDA 12.4.  Verify the decode of 12040.
    ///
    /// `cuDriverGetVersion` encodes the version as `1000 * major + 10 * minor`,
    /// so 12040 must decode to major 12, minor 4 (not minor 40).
    #[test]
    fn driver_version_parsing_cuda_12_4() {
        let cuda_version: i32 = 12040;
        let major = cuda_version / 1000;
        // The remainder holds `10 * minor`; divide to recover the minor itself.
        let minor = (cuda_version % 1000) / 10;
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 4, "minor version mismatch");
    }

    /// NVIDIA Driver 560 ships with CUDA 12.6.  Verify the decode of 12060.
    ///
    /// `cuDriverGetVersion` encodes the version as `1000 * major + 10 * minor`,
    /// so 12060 must decode to major 12, minor 6 (not minor 60).
    #[test]
    fn driver_version_parsing_cuda_12_6() {
        let cuda_version: i32 = 12060;
        let major = cuda_version / 1000;
        // The remainder holds `10 * minor`; divide to recover the minor itself.
        let minor = (cuda_version % 1000) / 10;
        assert_eq!(major, 12, "major version mismatch");
        assert_eq!(minor, 6, "minor version mismatch");
    }

    /// OxiCUDA's minimum is CUDA 11.2, the release that added
    /// `cuMemAllocAsync`.  Check that each version in the supported list passes
    /// the gate and that each older version fails it.
    #[test]
    fn driver_version_minimum_requirement() {
        // CUDA 11.2 as a version integer.
        let min_required: i32 = 11020;
        let meets_minimum = |version: i32| version >= min_required;

        let supported: [i32; 5] = [11020, 11040, 12000, 12060, 12080];
        for v in supported {
            assert!(
                meets_minimum(v),
                "CUDA version {v} should be supported (>= {min_required})"
            );
        }

        let too_old: [i32; 2] = [10020, 11010];
        for v in too_old {
            assert!(
                !meets_minimum(v),
                "CUDA version {v} should NOT be supported (< {min_required})"
            );
        }
    }

    /// `cuMemcpyBatchAsync` arrives with CUDA 12.8 (version integer 12080).
    /// Check the gate on both sides: 12.8 passes it, 12.0 does not.
    #[test]
    fn driver_cuda_12_8_features_available() {
        // Lowest version integer that has the batch-copy entry point.
        let batch_copy_gate: i32 = 12080;
        let has_batch_copy = |version: i32| version >= batch_copy_gate;

        // 12.8 → 12080
        assert!(
            has_batch_copy(12080),
            "CUDA 12.8 must support cuMemcpyBatchAsync"
        );

        // 12.0 → 12000
        assert!(
            !has_batch_copy(12000),
            "CUDA 12.0 must NOT support cuMemcpyBatchAsync"
        );
    }

    /// Sanity-check the NVIDIA-driver-version → CUDA-version table used in
    /// OxiCUDA's version negotiation.
    ///
    /// For every entry the encoded CUDA version (`1000 * major + 10 * minor`)
    /// must decode to a 12.x release, carry a remainder that is a multiple of
    /// 10, and fall below the 12.8 feature gate.
    #[test]
    fn driver_nvidia_to_cuda_version_mapping() {
        // (nvidia_driver, expected_cuda_version_int)
        let mapping: [(u32, i32); 4] = [
            (525, 12000), // Driver 525  → CUDA 12.0
            (535, 12020), // Driver 535  → CUDA 12.2
            (550, 12040), // Driver 550  → CUDA 12.4
            (560, 12060), // Driver 560  → CUDA 12.6
        ];

        for (nvidia_driver, cuda_version) in mapping {
            let major = cuda_version / 1000;
            // The remainder holds `10 * minor`; keep both forms so the encoding
            // check and the human-readable message each use the right one.
            let encoded_minor = cuda_version % 1000;
            let minor = encoded_minor / 10;
            // Sanity: all are CUDA 12.x
            assert_eq!(major, 12, "driver {nvidia_driver}: expected CUDA 12.x");
            // Encoded minor must be a multiple of 10 (CUDA minor encoding)
            assert_eq!(
                encoded_minor % 10,
                0,
                "driver {nvidia_driver}: encoded minor {encoded_minor} is not a multiple of 10"
            );
            // CUDA 12.8+ features require version >= 12080
            let has_12_8_features = cuda_version >= 12080;
            // The minor is printed unpadded so CUDA 12.2 reads "12.2", not "12.02".
            assert!(
                !has_12_8_features,
                "driver {nvidia_driver} (CUDA {major}.{minor}) should NOT have 12.8+ features"
            );
        }
    }
}