oxicuda_runtime/lib.rs
1//! # OxiCUDA Runtime
2//!
3//! Pure-Rust implementation of the **CUDA Runtime API** (`libcudart`) surface,
4//! built on top of `oxicuda-driver`'s dynamic driver loader.
5//!
6//! ## Coverage
7//!
8//! | Module | API functions |
9//! |--------------|---------------------------------------------------------------------|
10//! | [`device`] | `cudaGetDeviceCount`, `cudaSetDevice`, `cudaGetDevice`, `cudaGetDeviceProperties`, `cudaDeviceSynchronize`, `cudaDeviceReset` |
11//! | [`memory`] | `cudaMalloc`, `cudaFree`, `cudaMallocHost`, `cudaFreeHost`, `cudaMallocManaged`, `cudaMallocPitch`, `cudaMemcpy`, `cudaMemcpyAsync`, `cudaMemset`, `cudaMemGetInfo` |
12//! | [`stream`] | `cudaStreamCreate`, `cudaStreamCreateWithFlags`, `cudaStreamCreateWithPriority`, `cudaStreamDestroy`, `cudaStreamSynchronize`, `cudaStreamQuery`, `cudaStreamWaitEvent`, `cudaStreamGetPriority`, `cudaStreamGetFlags` |
13//! | [`event`] | `cudaEventCreate`, `cudaEventCreateWithFlags`, `cudaEventDestroy`, `cudaEventRecord`, `cudaEventSynchronize`, `cudaEventQuery`, `cudaEventElapsedTime` |
14//! | [`launch`] | `cudaLaunchKernel` (explicit function handle), `cudaFuncGetAttributes`, `cudaFuncSetAttribute`, `module_load_ptx`, `module_get_function`, `module_unload` |
15//! | [`peer`] | `cudaDeviceCanAccessPeer`, `cudaDeviceEnablePeerAccess`, `cudaDeviceDisablePeerAccess`, `cudaMemcpyPeer`, `cudaMemcpyPeerAsync` |
16//! | [`profiler`] | `cudaProfilerStart`, `cudaProfilerStop`, [`profiler::ProfilerGuard`] |
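//! | [`texture`] | [`CudaArray`], [`CudaArray3D`], [`CudaTextureObject`], [`CudaSurfaceObject`], [`TextureDesc`], [`ResourceDesc`], [`ResourceViewDesc`] |
//! | [`error`] | [`CudaRtError`], [`CudaRtResult`] |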
18//!
19//! ## Design goals
20//!
21//! - **Zero CUDA SDK build-time dependency**: just like `oxicuda-driver`, the
22//! runtime crate only needs the NVIDIA driver (`libcuda.so` / `nvcuda.dll`)
23//! at *run* time.
24//! - **Ergonomic Rust API**: strong types for streams, events, device pointers,
25//! and kernel dimensions instead of raw pointers.
26//! - **No unwrap**: all fallible operations return `Result`.
27//!
28//! ## Quick start
29//!
30//! ```rust,no_run
31//! use oxicuda_runtime::{device, memory, stream, event};
32//! use oxicuda_runtime::memory::MemcpyKind;
33//!
34//! // Select device 0.
35//! device::set_device(0)?;
36//!
37//! // Allocate 1 MiB of device memory.
38//! let d_buf = memory::malloc(1 << 20)?;
39//!
40//! // Zero it.
41//! memory::memset(d_buf, 0, 1 << 20)?;
42//!
43//! // Create a stream, record an event.
44//! let s = stream::stream_create()?;
45//! let e = event::event_create()?;
46//! event::event_record(e, s)?;
47//! event::event_synchronize(e)?;
48//!
49//! // Cleanup.
50//! event::event_destroy(e)?;
51//! stream::stream_destroy(s)?;
52//! memory::free(d_buf)?;
53//! # Ok::<(), oxicuda_runtime::error::CudaRtError>(())
54//! ```
55
56// ─── Modules ─────────────────────────────────────────────────────────────────
57
58pub mod device;
59pub mod error;
60pub mod event;
61pub mod launch;
62pub mod memory;
63pub mod peer;
64pub mod profiler;
65pub mod stream;
66pub mod texture;
67
68// ─── Top-level re-exports ────────────────────────────────────────────────────
69
70pub use device::CudaDeviceProp;
71pub use error::{CudaRtError, CudaRtResult};
72pub use event::{CudaEvent, EventFlags};
73pub use launch::{CudaFunction, CudaModule, Dim3, FuncAttribute, FuncAttributes};
74pub use memory::DevicePtr;
75pub use stream::{CudaStream, StreamFlags};
76pub use texture::{
77 AddressMode, Array3DFlags, ArrayFormat, CudaArray, CudaArray3D, CudaSurfaceObject,
78 CudaTextureObject, FilterMode, ResourceDesc, ResourceViewDesc, TextureDesc,
79};
80
81// ─── Convenience API (flat namespace) ────────────────────────────────────────
82
83/// Returns the number of CUDA-capable devices (mirrors `cudaGetDeviceCount`).
84pub fn get_device_count() -> CudaRtResult<u32> {
85 device::get_device_count()
86}
87
88/// Set the current device for this thread (mirrors `cudaSetDevice`).
89pub fn set_device(ordinal: u32) -> CudaRtResult<()> {
90 device::set_device(ordinal)
91}
92
93/// Get the current device for this thread (mirrors `cudaGetDevice`).
94pub fn get_device() -> CudaRtResult<u32> {
95 device::get_device()
96}
97
98/// Block until all device operations complete (mirrors `cudaDeviceSynchronize`).
99pub fn device_synchronize() -> CudaRtResult<()> {
100 device::device_synchronize()
101}
102
103/// Allocate device memory (mirrors `cudaMalloc`).
104pub fn cuda_malloc(size: usize) -> CudaRtResult<DevicePtr> {
105 memory::malloc(size)
106}
107
108/// Free device memory (mirrors `cudaFree`).
109pub fn cuda_free(ptr: DevicePtr) -> CudaRtResult<()> {
110 memory::free(ptr)
111}
112
113/// Zero device memory (mirrors `cudaMemset`).
114pub fn cuda_memset(ptr: DevicePtr, value: u8, count: usize) -> CudaRtResult<()> {
115 memory::memset(ptr, value, count)
116}
117
118/// Copy host slice → device (typed helper, no raw pointers).
119pub fn memcpy_h2d<T: Copy>(dst: DevicePtr, src: &[T]) -> CudaRtResult<()> {
120 memory::memcpy_h2d(dst, src)
121}
122
123/// Copy device → host slice (typed helper, no raw pointers).
124pub fn memcpy_d2h<T: Copy>(dst: &mut [T], src: DevicePtr) -> CudaRtResult<()> {
125 memory::memcpy_d2h(dst, src)
126}
127
128/// Copy between device allocations.
129pub fn memcpy_d2d(dst: DevicePtr, src: DevicePtr, bytes: usize) -> CudaRtResult<()> {
130 memory::memcpy_d2d(dst, src, bytes)
131}
132
133// ─── Integration tests ───────────────────────────────────────────────────────
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the flat wrappers must hand back a `Result` rather than panic.
    #[test]
    fn flat_api_no_panic() {
        // Outcomes are discarded on purpose: only the absence of a panic
        // matters here, whether or not a GPU is present.
        let _ = get_device_count();
        let _ = get_device();
        let _ = cuda_malloc(0);
        let _ = cuda_free(DevicePtr::NULL);
        let _ = cuda_memset(DevicePtr::NULL, 0, 0);
    }

    /// `DevicePtr::offset` handles both positive and negative displacements.
    #[test]
    fn device_ptr_arithmetic() {
        let origin = DevicePtr(0x1000);

        let forward = origin.offset(16);
        assert_eq!(forward, DevicePtr(0x1010));

        let backward = origin.offset(-16);
        assert_eq!(backward, DevicePtr(0x0FF0));
    }

    /// The `Dim3` convenience constructors yield the expected volume.
    #[test]
    fn dim3_convenience() {
        let linear = Dim3::one_d(1024);
        assert_eq!(linear.volume(), 1024);

        let planar = Dim3::two_d(32, 8);
        assert_eq!(planar.volume(), 256);
    }

    /// The `Display` rendering of an error is never an empty string.
    #[test]
    fn error_display_non_empty() {
        let rendered = CudaRtError::MemoryAllocation.to_string();
        assert!(!rendered.is_empty());
    }

    /// Stream flag constants carry the expected raw bit values.
    #[test]
    fn stream_flags_constants() {
        let expectations = [(StreamFlags::DEFAULT, 0), (StreamFlags::NON_BLOCKING, 1)];
        for (flag, bits) in expectations {
            assert_eq!(flag.0, bits);
        }
    }

    /// Event flag constants carry the expected raw bit values.
    #[test]
    fn event_flags_constants() {
        let expectations = [(EventFlags::DEFAULT, 0), (EventFlags::DISABLE_TIMING, 2)];
        for (flag, bits) in expectations {
            assert_eq!(flag.0, bits);
        }
    }
}