oxicuda_runtime/
lib.rs

1//! # OxiCUDA Runtime
2//!
3//! Pure-Rust implementation of the **CUDA Runtime API** (`libcudart`) surface,
4//! built on top of `oxicuda-driver`'s dynamic driver loader.
5//!
6//! ## Coverage
7//!
//! | Module       | API functions                                                       |
//! |--------------|---------------------------------------------------------------------|
//! | [`device`]   | `cudaGetDeviceCount`, `cudaSetDevice`, `cudaGetDevice`, `cudaGetDeviceProperties`, `cudaDeviceSynchronize`, `cudaDeviceReset` |
//! | [`memory`]   | `cudaMalloc`, `cudaFree`, `cudaMallocHost`, `cudaFreeHost`, `cudaMallocManaged`, `cudaMallocPitch`, `cudaMemcpy`, `cudaMemcpyAsync`, `cudaMemset`, `cudaMemGetInfo` |
//! | [`stream`]   | `cudaStreamCreate`, `cudaStreamCreateWithFlags`, `cudaStreamCreateWithPriority`, `cudaStreamDestroy`, `cudaStreamSynchronize`, `cudaStreamQuery`, `cudaStreamWaitEvent`, `cudaStreamGetPriority`, `cudaStreamGetFlags` |
//! | [`event`]    | `cudaEventCreate`, `cudaEventCreateWithFlags`, `cudaEventDestroy`, `cudaEventRecord`, `cudaEventSynchronize`, `cudaEventQuery`, `cudaEventElapsedTime` |
//! | [`launch`]   | `cudaLaunchKernel` (explicit function handle), `cudaFuncGetAttributes`, `cudaFuncSetAttribute`, `module_load_ptx`, `module_get_function`, `module_unload` |
//! | [`peer`]     | `cudaDeviceCanAccessPeer`, `cudaDeviceEnablePeerAccess`, `cudaDeviceDisablePeerAccess`, `cudaMemcpyPeer`, `cudaMemcpyPeerAsync` |
//! | [`profiler`] | `cudaProfilerStart`, `cudaProfilerStop`, [`profiler::ProfilerGuard`] |
//! | [`texture`]  | [`CudaArray`], [`CudaArray3D`], [`CudaTextureObject`], [`CudaSurfaceObject`], [`TextureDesc`], [`ResourceDesc`], [`ResourceViewDesc`] |
//! | [`error`]    | [`CudaRtError`], [`CudaRtResult`]                                   |
18//!
19//! ## Design goals
20//!
21//! - **Zero CUDA SDK build-time dependency**: just like `oxicuda-driver`, the
22//!   runtime crate only needs the NVIDIA driver (`libcuda.so` / `nvcuda.dll`)
23//!   at *run* time.
24//! - **Ergonomic Rust API**: strong types for streams, events, device pointers,
25//!   and kernel dimensions instead of raw pointers.
26//! - **No unwrap**: all fallible operations return `Result`.
27//!
28//! ## Quick start
29//!
30//! ```rust,no_run
31//! use oxicuda_runtime::{device, memory, stream, event};
32//! use oxicuda_runtime::memory::MemcpyKind;
33//!
34//! // Select device 0.
35//! device::set_device(0)?;
36//!
37//! // Allocate 1 MiB of device memory.
38//! let d_buf = memory::malloc(1 << 20)?;
39//!
40//! // Zero it.
41//! memory::memset(d_buf, 0, 1 << 20)?;
42//!
43//! // Create a stream, record an event.
44//! let s = stream::stream_create()?;
45//! let e = event::event_create()?;
46//! event::event_record(e, s)?;
47//! event::event_synchronize(e)?;
48//!
49//! // Cleanup.
50//! event::event_destroy(e)?;
51//! stream::stream_destroy(s)?;
52//! memory::free(d_buf)?;
53//! # Ok::<(), oxicuda_runtime::error::CudaRtError>(())
54//! ```
55
56// ─── Modules ─────────────────────────────────────────────────────────────────
57
58pub mod device;
59pub mod error;
60pub mod event;
61pub mod launch;
62pub mod memory;
63pub mod peer;
64pub mod profiler;
65pub mod stream;
66pub mod texture;
67
68// ─── Top-level re-exports ────────────────────────────────────────────────────
69
70pub use device::CudaDeviceProp;
71pub use error::{CudaRtError, CudaRtResult};
72pub use event::{CudaEvent, EventFlags};
73pub use launch::{CudaFunction, CudaModule, Dim3, FuncAttribute, FuncAttributes};
74pub use memory::DevicePtr;
75pub use stream::{CudaStream, StreamFlags};
76pub use texture::{
77    AddressMode, Array3DFlags, ArrayFormat, CudaArray, CudaArray3D, CudaSurfaceObject,
78    CudaTextureObject, FilterMode, ResourceDesc, ResourceViewDesc, TextureDesc,
79};
80
81// ─── Convenience API (flat namespace) ────────────────────────────────────────
82
83/// Returns the number of CUDA-capable devices (mirrors `cudaGetDeviceCount`).
84pub fn get_device_count() -> CudaRtResult<u32> {
85    device::get_device_count()
86}
87
88/// Set the current device for this thread (mirrors `cudaSetDevice`).
89pub fn set_device(ordinal: u32) -> CudaRtResult<()> {
90    device::set_device(ordinal)
91}
92
93/// Get the current device for this thread (mirrors `cudaGetDevice`).
94pub fn get_device() -> CudaRtResult<u32> {
95    device::get_device()
96}
97
98/// Block until all device operations complete (mirrors `cudaDeviceSynchronize`).
99pub fn device_synchronize() -> CudaRtResult<()> {
100    device::device_synchronize()
101}
102
103/// Allocate device memory (mirrors `cudaMalloc`).
104pub fn cuda_malloc(size: usize) -> CudaRtResult<DevicePtr> {
105    memory::malloc(size)
106}
107
108/// Free device memory (mirrors `cudaFree`).
109pub fn cuda_free(ptr: DevicePtr) -> CudaRtResult<()> {
110    memory::free(ptr)
111}
112
113/// Zero device memory (mirrors `cudaMemset`).
114pub fn cuda_memset(ptr: DevicePtr, value: u8, count: usize) -> CudaRtResult<()> {
115    memory::memset(ptr, value, count)
116}
117
118/// Copy host slice → device (typed helper, no raw pointers).
119pub fn memcpy_h2d<T: Copy>(dst: DevicePtr, src: &[T]) -> CudaRtResult<()> {
120    memory::memcpy_h2d(dst, src)
121}
122
123/// Copy device → host slice (typed helper, no raw pointers).
124pub fn memcpy_d2h<T: Copy>(dst: &mut [T], src: DevicePtr) -> CudaRtResult<()> {
125    memory::memcpy_d2h(dst, src)
126}
127
128/// Copy between device allocations.
129pub fn memcpy_d2d(dst: DevicePtr, src: DevicePtr, bytes: usize) -> CudaRtResult<()> {
130    memory::memcpy_d2d(dst, src, bytes)
131}
132
133// ─── Integration tests ───────────────────────────────────────────────────────
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test for the flat wrappers: every call has to hand back a
    /// `Result` instead of panicking, with or without a GPU present.
    #[test]
    fn flat_api_no_panic() {
        // The outcomes are discarded on purpose; only "no panic" is checked.
        let _ = get_device_count();
        let _ = get_device();
        let _ = cuda_malloc(0);
        let _ = cuda_free(DevicePtr::NULL);
        let _ = cuda_memset(DevicePtr::NULL, 0, 0);
    }

    /// `DevicePtr::offset` moves the address forwards and backwards.
    #[test]
    fn device_ptr_arithmetic() {
        let origin = DevicePtr(0x1000);
        let forward = origin.offset(16);
        let backward = origin.offset(-16);
        assert_eq!(forward, DevicePtr(0x1010));
        assert_eq!(backward, DevicePtr(0x0FF0));
    }

    /// The `Dim3` helper constructors produce the expected total volume.
    #[test]
    fn dim3_convenience() {
        let linear = Dim3::one_d(1024);
        let planar = Dim3::two_d(32, 8);
        assert_eq!(linear.volume(), 1024);
        assert_eq!(planar.volume(), 256);
    }

    /// An error value renders to a non-empty message.
    #[test]
    fn error_display_non_empty() {
        let rendered = CudaRtError::MemoryAllocation.to_string();
        assert!(!rendered.is_empty());
    }

    /// Pins the raw values of the stream flag constants.
    #[test]
    fn stream_flags_constants() {
        let default_bits = StreamFlags::DEFAULT.0;
        let non_blocking_bits = StreamFlags::NON_BLOCKING.0;
        assert_eq!(default_bits, 0);
        assert_eq!(non_blocking_bits, 1);
    }

    /// Pins the raw values of the event flag constants.
    #[test]
    fn event_flags_constants() {
        let default_bits = EventFlags::DEFAULT.0;
        let disable_timing_bits = EventFlags::DISABLE_TIMING.0;
        assert_eq!(default_bits, 0);
        assert_eq!(disable_timing_bits, 2);
    }
}
oxicuda_runtime/lib.rs

oxicuda_runtime/
lib.rs