#![warn(missing_docs)]
#![warn(clippy::all)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_safety_doc)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::macro_metavars_in_unsafe)]
pub mod context;
pub mod context_config;
pub mod cooperative_launch;
pub mod debug;
pub mod device;
pub mod error;
pub mod event;
pub mod ffi;
pub mod function_attr;
pub mod graph;
pub mod link;
pub mod loader;
pub mod memory_info;
pub mod module;
pub mod multi_gpu;
pub mod nvlink_topology;
pub mod occupancy;
pub mod occupancy_ext;
pub mod primary_context;
pub mod profiler;
pub mod stream;
pub mod stream_ordered_alloc;
pub mod tma;
pub use error::{CudaError, CudaResult, DriverLoadError, check};
pub use ffi::{
CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, CU_TRSF_NORMALIZED_COORDINATES,
CU_TRSF_READ_AS_INTEGER, CU_TRSF_SRGB, CUDA_ARRAY_DESCRIPTOR, CUDA_ARRAY3D_CUBEMAP,
CUDA_ARRAY3D_DESCRIPTOR, CUDA_ARRAY3D_LAYERED, CUDA_ARRAY3D_SURFACE_LDST,
CUDA_ARRAY3D_TEXTURE_GATHER, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
CUaddress_mode, CUarray, CUarray_format, CUcontext, CUdevice, CUdevice_attribute, CUdeviceptr,
CUevent, CUfilter_mode, CUfunction, CUfunction_attribute, CUjit_option, CUkernel, CUlibrary,
CUlimit, CUmemoryPool, CUmemorytype, CUmipmappedArray, CUmodule, CUmulticastObject,
CUpointer_attribute, CUresourceViewFormat, CUresourcetype, CUstream, CUsurfObject, CUsurfref,
CUtexObject, CUtexref, CuLaunchAttribute, CuLaunchAttributeClusterDim, CuLaunchAttributeId,
CuLaunchAttributeValue, CuLaunchConfig, CudaResourceDescArray, CudaResourceDescLinear,
CudaResourceDescMipmap, CudaResourceDescPitch2d, CudaResourceDescRes,
};
pub use context::Context;
pub use context_config::{CacheConfig, SharedMemConfig};
pub use cooperative_launch::{
CooperativeLaunchConfig, CooperativeLaunchSupport, DeviceLaunchConfig,
MultiDeviceCooperativeLaunchConfig, cooperative_launch, cooperative_launch_multi_device,
};
pub use debug::{DebugLevel, DebugSession, KernelDebugger, MemoryChecker, NanInfChecker};
pub use device::{Device, DeviceInfo, best_device, can_access_peer, driver_version, list_devices};
pub use event::Event;
pub use graph::{Graph, GraphExec, GraphNode, MemcpyDirection, StreamCapture};
pub use link::{
FallbackStrategy, LinkInputType, LinkedModule, Linker, LinkerOptions, OptimizationLevel,
};
pub use loader::try_driver;
pub use module::{Function, JitDiagnostic, JitLog, JitOptions, JitSeverity, Module};
pub use multi_gpu::DevicePool;
pub use nvlink_topology::{GpuTopology, NvLinkVersion, TopologyTree, TopologyType};
pub use primary_context::PrimaryContext;
pub use profiler::ProfilerGuard;
pub use stream::Stream;
pub use stream_ordered_alloc::{
StreamAllocation, StreamMemoryPool, StreamOrderedAllocConfig, stream_alloc, stream_free,
};
pub fn init() -> CudaResult<()> {
let driver = loader::try_driver()?;
error::check(unsafe { (driver.cu_init)(0) })
}
pub mod prelude {
pub use crate::{
CacheConfig, Context, CooperativeLaunchConfig, CooperativeLaunchSupport, CudaError,
CudaResult, DebugLevel, DebugSession, Device, DeviceLaunchConfig, DevicePool, Event,
FallbackStrategy, Function, GpuTopology, Graph, GraphExec, GraphNode, KernelDebugger,
LinkInputType, LinkedModule, Linker, LinkerOptions, MemcpyDirection, Module,
MultiDeviceCooperativeLaunchConfig, NvLinkVersion, OptimizationLevel, PrimaryContext,
ProfilerGuard, SharedMemConfig, Stream, StreamAllocation, StreamCapture, StreamMemoryPool,
StreamOrderedAllocConfig, TopologyTree, TopologyType, can_access_peer, cooperative_launch,
cooperative_launch_multi_device, driver_version, init, stream_alloc, stream_free,
try_driver,
};
}
/// Compile-time feature flags exposed as constants.
pub mod features {
    /// Whether the crate was compiled with the `gpu-tests` Cargo feature.
    #[cfg(feature = "gpu-tests")]
    pub const HAS_GPU_TESTS: bool = true;

    /// Whether the crate was compiled with the `gpu-tests` Cargo feature.
    #[cfg(not(feature = "gpu-tests"))]
    pub const HAS_GPU_TESTS: bool = false;
}
#[cfg(test)]
mod driver_infra_tests {
/// Simulates per-thread context stacks and checks each thread sees its own top.
///
/// Every worker builds a private two-entry stack (`id * 100`, `id * 100 + 1`),
/// reads the top entry and records `(thread_id, top)` in a shared,
/// mutex-protected vector. No driver call is made by this test.
#[test]
fn context_push_pop_thread_safety() {
    use std::sync::{Arc, Mutex};
    use std::thread;

    let results: Arc<Mutex<Vec<(u32, u32)>>> = Arc::new(Mutex::new(Vec::new()));

    // Spawn all four workers up front; they are joined only afterwards.
    let workers: Vec<thread::JoinHandle<()>> = (0..4u32)
        .map(|thread_id| {
            let sink = Arc::clone(&results);
            thread::spawn(move || {
                let base = thread_id * 100;
                let stack = [base, base + 1];
                let top = stack.last().copied().unwrap_or(0);
                sink.lock()
                    .expect("results lock failed")
                    .push((thread_id, top));
            })
        })
        .collect();

    for worker in workers {
        worker.join().expect("thread panicked");
    }

    let recorded = results.lock().expect("final lock failed");
    assert_eq!(recorded.len(), 4, "all 4 threads must contribute a result");
    recorded.iter().for_each(|&(thread_id, top)| {
        let expected_top = thread_id * 100 + 1;
        assert_eq!(
            top, expected_top,
            "thread {thread_id}: expected top {expected_top}, got {top}"
        );
    });
}
/// Checks that a `Drop` impl runs once per value, and only when the owning
/// scope ends.
#[test]
fn drop_counter_tracks_resource_release() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Stand-in resource that bumps a shared counter when dropped.
    struct FakeResource {
        dropped: Arc<AtomicUsize>,
    }

    impl Drop for FakeResource {
        fn drop(&mut self) {
            self.dropped.fetch_add(1, Ordering::SeqCst);
        }
    }

    let counter = Arc::new(AtomicUsize::new(0));
    let acquire = || FakeResource {
        dropped: Arc::clone(&counter),
    };
    let released = || counter.load(Ordering::SeqCst);

    {
        let _first = acquire();
        let _second = acquire();
        assert_eq!(
            released(),
            0,
            "resources must not be dropped before scope exit"
        );
    }

    assert_eq!(
        released(),
        2,
        "both resources must be dropped at scope exit"
    );
}
/// Checks that locals declared in one scope are dropped in reverse
/// declaration order.
#[test]
fn drop_order_is_lifo() {
    use std::sync::{Arc, Mutex};

    /// Records its `id` into the shared log when dropped.
    struct Ordered {
        id: u32,
        order: Arc<Mutex<Vec<u32>>>,
    }

    impl Drop for Ordered {
        fn drop(&mut self) {
            self.order.lock().expect("order lock failed").push(self.id);
        }
    }

    let order: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(Vec::new()));
    let tracked = |id: u32| Ordered {
        id,
        order: Arc::clone(&order),
    };

    {
        // Declared 1, 2, 3 — expected to be released 3, 2, 1.
        let _a = tracked(1);
        let _b = tracked(2);
        let _c = tracked(3);
    }

    let observed = order.lock().expect("final order lock failed");
    assert_eq!(
        *observed,
        vec![3, 2, 1],
        "CUDA RAII guards must be released in LIFO order"
    );
}
/// Decodes the driver version integer for CUDA 12.0.
///
/// The CUDA driver reports its version as `1000 * major + 10 * minor`, so the
/// minor component is `(version % 1000) / 10`, not the raw remainder.
#[test]
fn driver_version_parsing_cuda_12_0() {
    let cuda_version: i32 = 12000;
    let major = cuda_version / 1000;
    // Divide the remainder by 10 to recover the real minor number.
    let minor = (cuda_version % 1000) / 10;
    assert_eq!(major, 12, "major version mismatch");
    assert_eq!(minor, 0, "minor version mismatch");
}
/// Decodes the driver version integer for CUDA 12.2.
///
/// The CUDA driver reports its version as `1000 * major + 10 * minor`, so
/// `12020` decodes to major `12` and minor `2` (not `20`).
#[test]
fn driver_version_parsing_cuda_12_2() {
    let cuda_version: i32 = 12020;
    let major = cuda_version / 1000;
    // Divide the remainder by 10 to recover the real minor number.
    let minor = (cuda_version % 1000) / 10;
    assert_eq!(major, 12, "major version mismatch");
    assert_eq!(minor, 2, "minor version mismatch");
}
/// Decodes the driver version integer for CUDA 12.4.
///
/// The CUDA driver reports its version as `1000 * major + 10 * minor`, so
/// `12040` decodes to major `12` and minor `4` (not `40`).
#[test]
fn driver_version_parsing_cuda_12_4() {
    let cuda_version: i32 = 12040;
    let major = cuda_version / 1000;
    // Divide the remainder by 10 to recover the real minor number.
    let minor = (cuda_version % 1000) / 10;
    assert_eq!(major, 12, "major version mismatch");
    assert_eq!(minor, 4, "minor version mismatch");
}
/// Decodes the driver version integer for CUDA 12.6.
///
/// The CUDA driver reports its version as `1000 * major + 10 * minor`, so
/// `12060` decodes to major `12` and minor `6` (not `60`).
#[test]
fn driver_version_parsing_cuda_12_6() {
    let cuda_version: i32 = 12060;
    let major = cuda_version / 1000;
    // Divide the remainder by 10 to recover the real minor number.
    let minor = (cuda_version % 1000) / 10;
    assert_eq!(major, 12, "major version mismatch");
    assert_eq!(minor, 6, "minor version mismatch");
}
/// Checks sample encoded versions against the minimum threshold used by this
/// test (`11020`): listed "supported" values must be at or above it, listed
/// "too old" values must be strictly below it.
#[test]
fn driver_version_minimum_requirement() {
    let min_required: i32 = 11020;

    let supported: [i32; 5] = [11020, 11040, 12000, 12060, 12080];
    supported.iter().for_each(|&v| {
        assert!(
            v >= min_required,
            "CUDA version {v} should be supported (>= {min_required})"
        );
    });

    let too_old: [i32; 2] = [10020, 11010];
    too_old.iter().for_each(|&v| {
        assert!(
            v < min_required,
            "CUDA version {v} should NOT be supported (< {min_required})"
        );
    });
}
/// Checks the version gate this test uses for `cuMemcpyBatchAsync`: encoded
/// version `12080` passes the gate, `12000` does not.
#[test]
fn driver_cuda_12_8_features_available() {
    // Threshold applied by this test; both cases are compared against it.
    let batch_memcpy_min: i32 = 12080;
    let supports_batch_memcpy = |version: i32| version >= batch_memcpy_min;

    assert!(
        supports_batch_memcpy(12080),
        "CUDA 12.8 must support cuMemcpyBatchAsync"
    );
    assert!(
        !supports_batch_memcpy(12000),
        "CUDA 12.0 must NOT support cuMemcpyBatchAsync"
    );
}
/// Checks the driver-branch → encoded-CUDA-version table used by this test.
///
/// For every `(nvidia_driver, cuda_version)` pair it verifies that the major
/// version is 12, that the sub-major field is a multiple of 10 (the encoding is
/// `1000 * major + 10 * minor`), and that the version is below the `12080`
/// gate for 12.8+ features.
#[test]
fn driver_nvidia_to_cuda_version_mapping() {
    let mapping: [(u32, i32); 4] = [(525, 12000), (535, 12020), (550, 12040), (560, 12060)];

    for &(nvidia_driver, cuda_version) in &mapping {
        let major = cuda_version / 1000;
        // Raw sub-major field, i.e. ten times the minor release number.
        let minor = cuda_version % 1000;
        assert_eq!(major, 12, "driver {nvidia_driver}: expected CUDA 12.x");
        assert_eq!(
            minor % 10,
            0,
            "driver {nvidia_driver}: minor {minor} is not a multiple of 10"
        );

        // Printed without zero padding so CUDA 12.2 reads "12.2", not "12.02".
        let minor_release = minor / 10;
        let has_12_8_features = cuda_version >= 12080;
        assert!(
            !has_12_8_features,
            "driver {nvidia_driver} (CUDA {major}.{minor_release}) should NOT have 12.8+ features"
        );
    }
}
}