oxicuda-runtime 0.1.0

OxiCUDA Runtime - CUDA Runtime API wrapper (cudaMalloc/cudaMemcpy/cudaLaunchKernel) built on the driver API
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
//! Device and host memory management.
//!
//! Implements the CUDA Runtime memory API:
//! - `cudaMalloc` / `cudaFree`
//! - `cudaMallocHost` / `cudaFreeHost` (pinned host memory)
//! - `cudaMallocManaged` (unified memory)
//! - `cudaMallocPitch` (pitched 2-D allocation)
//! - `cudaMemcpy` / `cudaMemcpyAsync`
//! - `cudaMemset` / `cudaMemsetAsync`
//! - `cudaMemGetInfo`
//!
//! All memory addresses returned for device allocations are represented as
//! [`DevicePtr`], a newtype around `u64` that matches the driver API's
//! `CUdeviceptr`.

use std::ffi::c_void;

use oxicuda_driver::loader::try_driver;

use crate::error::{CudaRtError, CudaRtResult};
use crate::stream::CudaStream;

// ─── DevicePtr ───────────────────────────────────────────────────────────────

/// Opaque CUDA device-memory address (mirrors `CUdeviceptr`).
///
/// This is a plain `u64` wrapped in a newtype to prevent accidental
/// dereferencing from host code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DevicePtr(pub u64);

impl DevicePtr {
    /// The null (zero) device pointer.
    pub const NULL: Self = Self(0);

    /// Returns `true` if this is the null pointer (address `0`).
    #[must_use]
    pub const fn is_null(self) -> bool {
        self.0 == 0
    }

    /// Offset this pointer by `offset` bytes, returning a new `DevicePtr`.
    ///
    /// Negative offsets move towards lower addresses.  The arithmetic wraps
    /// modulo 2^64 and never panics, so addresses in the upper half of the
    /// 64-bit range can be offset safely in debug builds as well.  No bounds
    /// checking against any allocation is performed.
    #[must_use]
    pub const fn offset(self, offset: isize) -> Self {
        // `isize` is at most 64 bits wide on supported targets, so widening
        // to `i64` is lossless; the add is done on the unsigned address so it
        // cannot trip signed-overflow checks.
        Self(self.0.wrapping_add_signed(offset as i64))
    }
}

// ─── MemcpyKind ──────────────────────────────────────────────────────────────

/// Direction of a `cudaMemcpy` transfer.
///
/// Mirrors `cudaMemcpyKind`; the explicit discriminants (0–4) are the values
/// obtained when a variant is cast to an integer.
///
/// The variant tells [`memcpy`] / [`memcpy_async`] how to interpret their raw
/// `dst` and `src` arguments (host pointer vs. device address).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemcpyKind {
    /// Host → Host.
    HostToHost = 0,
    /// Host → Device.
    HostToDevice = 1,
    /// Device → Host.
    DeviceToHost = 2,
    /// Device → Device.
    DeviceToDevice = 3,
    /// Direction inferred from pointer attributes (unified addressing).
    ///
    /// Note: the copy routines in this module do not perform that inference;
    /// they currently handle this variant exactly like
    /// [`MemcpyKind::HostToDevice`].
    Default = 4,
}

// ─── MemAttachFlags ──────────────────────────────────────────────────────────

/// Flags for `cudaMallocManaged`.
///
/// Mirrors `cudaMemAttachFlags`.  [`malloc_managed`] passes the variant to the
/// driver as its `u32` discriminant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemAttachFlags {
    /// Memory accessible by all CUDA devices and host.
    Global = 1,
    /// Memory only accessible by the host and a single CUDA device.
    ///
    /// NOTE(review): the CUDA documentation describes the host attach flag as
    /// memory that is initially not accessible from any stream on any device
    /// — confirm the wording above against the CUDA reference.
    Host = 2,
    /// Memory only accessible by single stream (deprecated in CUDA 12).
    ///
    /// NOTE(review): the CUDA driver documentation lists only the global and
    /// host flags as valid for managed allocation; passing this variant to
    /// [`malloc_managed`] presumably yields an invalid-value error — confirm.
    Single = 4,
}

// ─── Allocation ──────────────────────────────────────────────────────────────

/// Reserve `size` bytes of device memory and return its address.
///
/// Mirrors `cudaMalloc`; the allocation is made through the driver's
/// `cu_mem_alloc_v2` entry point.  A request for zero bytes is answered with
/// [`DevicePtr::NULL`] without touching the driver.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::MemoryAllocation`] is used when the code has no mapping.
pub fn malloc(size: usize) -> CudaRtResult<DevicePtr> {
    // Nothing to allocate: succeed locally so this works even without a GPU.
    if size == 0 {
        return Ok(DevicePtr::NULL);
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    let mut device_addr = 0u64;
    // SAFETY: FFI call; `device_addr` is a live local the driver writes into.
    let status = unsafe { (driver.cu_mem_alloc_v2)(&raw mut device_addr, size) };

    match status {
        0 => Ok(DevicePtr(device_addr)),
        code => Err(CudaRtError::from_code(code).unwrap_or(CudaRtError::MemoryAllocation)),
    }
}

/// Release a device allocation obtained from [`malloc`].
///
/// Mirrors `cudaFree`; forwards to the driver's `cu_mem_free_v2`.
/// [`DevicePtr::NULL`] is accepted and ignored without loading the driver.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidDevicePointer`] is used when the code has no
///   mapping.
pub fn free(ptr: DevicePtr) -> CudaRtResult<()> {
    // Freeing null is defined as a no-op.
    if ptr.is_null() {
        return Ok(());
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    // SAFETY: FFI call; the address is expected to originate from
    // `cu_mem_alloc_v2` (see `malloc`).
    let status = unsafe { (driver.cu_mem_free_v2)(ptr.0) };

    if status == 0 {
        Ok(())
    } else {
        Err(CudaRtError::from_code(status).unwrap_or(CudaRtError::InvalidDevicePointer))
    }
}

/// Reserve `size` bytes of pinned (page-locked) host memory.
///
/// Mirrors `cudaMallocHost`; the allocation is made through the driver's
/// `cu_mem_alloc_host_v2` entry point.  The returned raw host pointer must be
/// released with [`free_host`].  A request for zero bytes yields a null
/// pointer without touching the driver.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::MemoryAllocation`] is used when the code has no mapping.
pub fn malloc_host(size: usize) -> CudaRtResult<*mut c_void> {
    // Nothing to allocate: hand back null without involving the driver.
    if size == 0 {
        return Ok(std::ptr::null_mut());
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    let mut host_ptr = std::ptr::null_mut::<c_void>();
    // SAFETY: FFI call; `host_ptr` is a live local the driver writes into.
    let status = unsafe { (driver.cu_mem_alloc_host_v2)(&raw mut host_ptr, size) };

    match status {
        0 => Ok(host_ptr),
        code => Err(CudaRtError::from_code(code).unwrap_or(CudaRtError::MemoryAllocation)),
    }
}

/// Release pinned host memory obtained from [`malloc_host`].
///
/// Mirrors `cudaFreeHost`; forwards to the driver's `cu_mem_free_host`.
/// A null `ptr` is accepted and ignored without loading the driver.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidHostPointer`] is used when the code has no mapping.
///
/// # Safety
///
/// `ptr` must be null or a pointer returned by [`malloc_host`] that has not
/// been freed yet.
pub unsafe fn free_host(ptr: *mut c_void) -> CudaRtResult<()> {
    // Freeing null is defined as a no-op.
    if ptr.is_null() {
        return Ok(());
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    // SAFETY: FFI call; the caller guarantees `ptr` came from
    // `cu_mem_alloc_host_v2` and is still live.
    let status = unsafe { (driver.cu_mem_free_host)(ptr) };

    if status == 0 {
        Ok(())
    } else {
        Err(CudaRtError::from_code(status).unwrap_or(CudaRtError::InvalidHostPointer))
    }
}

/// Reserve `size` bytes of unified (managed) memory.
///
/// Mirrors `cudaMallocManaged`; the allocation is made through the driver's
/// `cu_mem_alloc_managed` entry point, with `flags` passed as its `u32`
/// discriminant.  A request for zero bytes is answered with
/// [`DevicePtr::NULL`] without touching the driver.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`
///   (for example [`CudaRtError::NotSupported`] if the driver reports that
///   managed memory is unavailable); [`CudaRtError::MemoryAllocation`] is
///   used when the code has no mapping.
pub fn malloc_managed(size: usize, flags: MemAttachFlags) -> CudaRtResult<DevicePtr> {
    // Nothing to allocate: succeed locally.
    if size == 0 {
        return Ok(DevicePtr::NULL);
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    let attach_flags = flags as u32;
    let mut managed_addr = 0u64;
    // SAFETY: FFI call; `managed_addr` is a live local the driver writes
    // into, and `attach_flags` carries the enum's CU_MEM_ATTACH_* value.
    let status =
        unsafe { (driver.cu_mem_alloc_managed)(&raw mut managed_addr, size, attach_flags) };

    match status {
        0 => Ok(DevicePtr(managed_addr)),
        code => Err(CudaRtError::from_code(code).unwrap_or(CudaRtError::MemoryAllocation)),
    }
}

/// Allocate pitched device memory for 2-D arrays.
///
/// Mirrors `cudaMallocPitch`.
///
/// Returns `(device_ptr, pitch_bytes)`.  `pitch_bytes` is ≥ `width_bytes`
/// and aligned to the hardware's texture alignment.
///
/// # Errors
///
/// Propagates driver errors.
pub fn malloc_pitch(width_bytes: usize, height: usize) -> CudaRtResult<(DevicePtr, usize)> {
    if width_bytes == 0 || height == 0 {
        return Ok((DevicePtr::NULL, 0));
    }
    // Compute the pitch: round width_bytes up to 512-byte alignment, which
    // matches the driver's cuMemAllocPitch behaviour for most hardware.
    let align: usize = 512;
    let pitch = width_bytes.div_ceil(align) * align;
    let size = pitch * height;
    let ptr = malloc(size)?;
    Ok((ptr, pitch))
}

// ─── Memcpy ──────────────────────────────────────────────────────────────────

/// Synchronously copy `count` bytes between memory regions.
///
/// Mirrors `cudaMemcpy`.  `kind` decides how the raw arguments are
/// interpreted:
///
/// | `kind`           | `dst`          | `src`          | performed by            |
/// |------------------|----------------|----------------|-------------------------|
/// | `HostToHost`     | host pointer   | host pointer   | `ptr::copy_nonoverlapping` |
/// | `HostToDevice`   | device address | host pointer   | `cu_memcpy_htod_v2`     |
/// | `DeviceToHost`   | host pointer   | device address | `cu_memcpy_dtoh_v2`     |
/// | `DeviceToDevice` | device address | device address | `cu_memcpy_dtod_v2`     |
/// | `Default`        | device address | host pointer   | `cu_memcpy_htod_v2`     |
///
/// `MemcpyKind::Default` is **not** resolved from pointer attributes; it is
/// currently treated as a host-to-device copy.
///
/// A `count` of zero returns `Ok(())` immediately.  Host-to-host copies never
/// load the driver, so they work on machines without CUDA installed.
///
/// # Safety
///
/// `src` and `dst` must point to valid memory of the appropriate kind
/// (host or device) for at least `count` bytes and must not overlap.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — `kind` involves device memory and
///   the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidMemcpyDirection`] is used when the code has no
///   mapping.
pub unsafe fn memcpy(
    dst: *mut c_void,
    src: *const c_void,
    count: usize,
    kind: MemcpyKind,
) -> CudaRtResult<()> {
    if count == 0 {
        return Ok(());
    }

    // The driver is looked up lazily, only by the arms that talk to it, so a
    // pure host copy does not fail when no driver is installed.
    let driver = || try_driver().map_err(|_| CudaRtError::DriverNotAvailable);

    let rc = match kind {
        MemcpyKind::HostToHost => {
            // Pure host copy — no driver involvement.
            // SAFETY: Caller ensures src/dst are valid and non-overlapping.
            unsafe { std::ptr::copy_nonoverlapping(src.cast::<u8>(), dst.cast::<u8>(), count) };
            return Ok(());
        }
        MemcpyKind::HostToDevice => {
            let api = driver()?;
            let dst_ptr = dst as u64;
            // SAFETY: FFI; src/dst valid per caller contract.
            unsafe { (api.cu_memcpy_htod_v2)(dst_ptr, src, count) }
        }
        MemcpyKind::DeviceToHost => {
            let api = driver()?;
            let src_ptr = src as u64;
            // SAFETY: FFI; src/dst valid per caller contract.
            unsafe { (api.cu_memcpy_dtoh_v2)(dst, src_ptr, count) }
        }
        MemcpyKind::DeviceToDevice => {
            let api = driver()?;
            let dst_ptr = dst as u64;
            let src_ptr = src as u64;
            // SAFETY: FFI; src/dst valid per caller contract.
            unsafe { (api.cu_memcpy_dtod_v2)(dst_ptr, src_ptr, count) }
        }
        MemcpyKind::Default => {
            // Fall back to H2D (common case; real implementation would use
            // cuPointerGetAttribute to determine actual memory type).
            let api = driver()?;
            let dst_ptr = dst as u64;
            // SAFETY: FFI; src/dst valid per caller contract.
            unsafe { (api.cu_memcpy_htod_v2)(dst_ptr, src, count) }
        }
    };

    if rc != 0 {
        return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidMemcpyDirection));
    }
    Ok(())
}

/// Asynchronously copy `count` bytes on `stream`.
///
/// Mirrors `cudaMemcpyAsync`.  The raw arguments are interpreted according to
/// `kind` exactly as in [`memcpy`].  Only two directions are actually
/// enqueued on `stream`:
///
/// - `HostToDevice` and `Default` use `cu_memcpy_htod_async_v2`
///   (`Default` is treated as host-to-device, not inferred).
/// - `DeviceToHost` uses `cu_memcpy_dtoh_async_v2`.
///
/// The remaining directions ignore `stream`:
///
/// - `HostToHost` is an immediate `ptr::copy_nonoverlapping` on the calling
///   thread and never loads the driver.
/// - `DeviceToDevice` falls back to the synchronous `cu_memcpy_dtod_v2`.
///
/// A `count` of zero returns `Ok(())` immediately.
///
/// # Safety
///
/// Same requirements as [`memcpy`] plus `stream` must be valid.  For the
/// directions that are enqueued on `stream`, the memory presumably has to
/// stay valid until the stream has executed the copy — confirm against the
/// CUDA documentation for the async copy entry points.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — `kind` involves device memory and
///   the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidMemcpyDirection`] is used when the code has no
///   mapping.
pub unsafe fn memcpy_async(
    dst: *mut c_void,
    src: *const c_void,
    count: usize,
    kind: MemcpyKind,
    stream: &CudaStream,
) -> CudaRtResult<()> {
    if count == 0 {
        return Ok(());
    }

    // The driver is looked up lazily, only by the arms that talk to it, so a
    // pure host copy does not fail when no driver is installed.
    let driver = || try_driver().map_err(|_| CudaRtError::DriverNotAvailable);

    let rc = match kind {
        MemcpyKind::HostToHost => {
            // Host-to-host is dispatched synchronously, outside the stream.
            // SAFETY: Caller ensures src/dst are valid and non-overlapping.
            unsafe { std::ptr::copy_nonoverlapping(src.cast::<u8>(), dst.cast::<u8>(), count) };
            return Ok(());
        }
        MemcpyKind::HostToDevice | MemcpyKind::Default => {
            let api = driver()?;
            let dst_ptr = dst as u64;
            // SAFETY: FFI; caller guarantees validity.
            unsafe { (api.cu_memcpy_htod_async_v2)(dst_ptr, src, count, stream.raw()) }
        }
        MemcpyKind::DeviceToHost => {
            let api = driver()?;
            let src_ptr = src as u64;
            // SAFETY: FFI; caller guarantees validity.
            unsafe { (api.cu_memcpy_dtoh_async_v2)(dst, src_ptr, count, stream.raw()) }
        }
        MemcpyKind::DeviceToDevice => {
            // Fall back to synchronous D2D (driver lacks async D2D helper in v1).
            let api = driver()?;
            let dst_ptr = dst as u64;
            let src_ptr = src as u64;
            // SAFETY: FFI; caller guarantees validity.
            unsafe { (api.cu_memcpy_dtod_v2)(dst_ptr, src_ptr, count) }
        }
    };

    if rc != 0 {
        return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidMemcpyDirection));
    }
    Ok(())
}

// ─── Typed helpers ────────────────────────────────────────────────────────────

/// Upload the contents of a host slice to device memory at `dst`.
///
/// Transfers `size_of_val(src)` bytes via [`memcpy`] with
/// [`MemcpyKind::HostToDevice`].  An empty slice results in a zero-byte copy,
/// which [`memcpy`] treats as an immediate success.
///
/// NOTE(review): this function is safe to call, yet nothing here checks that
/// `dst` refers to a device allocation of at least that many bytes — callers
/// are trusted to pass a suitable pointer.
///
/// # Errors
///
/// Propagates whatever [`memcpy`] returns.
pub fn memcpy_h2d<T: Copy>(dst: DevicePtr, src: &[T]) -> CudaRtResult<()> {
    let byte_len = std::mem::size_of_val(src);
    let host_src = src.as_ptr().cast::<c_void>();
    let device_dst = dst.0 as *mut c_void;
    // SAFETY: `host_src` covers exactly `byte_len` readable bytes of `src`;
    // `dst` is taken to be a device allocation.
    unsafe { memcpy(device_dst, host_src, byte_len, MemcpyKind::HostToDevice) }
}

/// Download device memory starting at `src` into a host slice.
///
/// Transfers `size_of_val(dst)` bytes via [`memcpy`] with
/// [`MemcpyKind::DeviceToHost`], filling the whole slice.  An empty slice
/// results in a zero-byte copy, which [`memcpy`] treats as an immediate
/// success.
///
/// NOTE(review): this function is safe to call, yet nothing here checks that
/// `src` refers to a device allocation of at least that many bytes — callers
/// are trusted to pass a suitable pointer.
///
/// # Errors
///
/// Propagates whatever [`memcpy`] returns.
pub fn memcpy_d2h<T: Copy>(dst: &mut [T], src: DevicePtr) -> CudaRtResult<()> {
    let byte_len = std::mem::size_of_val(dst);
    let host_dst = dst.as_mut_ptr().cast::<c_void>();
    let device_src = src.0 as *const c_void;
    // SAFETY: `host_dst` covers exactly `byte_len` writable bytes of `dst`;
    // `src` is taken to be a device allocation.
    unsafe { memcpy(host_dst, device_src, byte_len, MemcpyKind::DeviceToHost) }
}

/// Copy `bytes` bytes from one device address to another.
///
/// Thin wrapper over [`memcpy`] with [`MemcpyKind::DeviceToDevice`].  A
/// `bytes` value of zero is an immediate success.
///
/// NOTE(review): this function is safe to call, yet nothing here checks that
/// `dst` and `src` refer to device allocations of at least `bytes` bytes —
/// callers are trusted to pass suitable pointers.
///
/// # Errors
///
/// Propagates whatever [`memcpy`] returns.
pub fn memcpy_d2d(dst: DevicePtr, src: DevicePtr, bytes: usize) -> CudaRtResult<()> {
    let device_dst = dst.0 as *mut c_void;
    let device_src = src.0 as *const c_void;
    // SAFETY: both addresses are taken to be device allocations.
    unsafe { memcpy(device_dst, device_src, bytes, MemcpyKind::DeviceToDevice) }
}

// ─── Memset ──────────────────────────────────────────────────────────────────

/// Fill `count` bytes of device memory, starting at `ptr`, with `value`.
///
/// Mirrors `cudaMemset`; forwards to the driver's `cu_memset_d8_v2`.  The
/// call is a silent no-op — returning `Ok(())` without loading the driver —
/// when `count` is zero or `ptr` is null.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidDevicePointer`] is used when the code has no
///   mapping.
pub fn memset(ptr: DevicePtr, value: u8, count: usize) -> CudaRtResult<()> {
    // Only go to the driver when there is a real region to fill.
    let has_work = count != 0 && !ptr.is_null();
    if !has_work {
        return Ok(());
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    // SAFETY: FFI call; `ptr` is taken to be a valid device allocation.
    let status = unsafe { (driver.cu_memset_d8_v2)(ptr.0, value, count) };

    match status {
        0 => Ok(()),
        code => Err(CudaRtError::from_code(code).unwrap_or(CudaRtError::InvalidDevicePointer)),
    }
}

/// Fill device memory with a repeating 32-bit `value`.
///
/// `count` is the number of 32-bit words (not bytes) to write.  The fill is
/// performed by the driver's `cu_memset_d32_v2`; it is the 4-byte-granularity
/// counterpart of [`memset`].  The call is a silent no-op — returning
/// `Ok(())` without loading the driver — when `count` is zero or `ptr` is
/// null.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::InvalidDevicePointer`] is used when the code has no
///   mapping.
pub fn memset32(ptr: DevicePtr, value: u32, count: usize) -> CudaRtResult<()> {
    // Only go to the driver when there is a real region to fill.
    let has_work = count != 0 && !ptr.is_null();
    if !has_work {
        return Ok(());
    }

    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    // SAFETY: FFI call; `ptr` is taken to be a valid device allocation.
    let status = unsafe { (driver.cu_memset_d32_v2)(ptr.0, value, count) };

    match status {
        0 => Ok(()),
        code => Err(CudaRtError::from_code(code).unwrap_or(CudaRtError::InvalidDevicePointer)),
    }
}

// ─── MemGetInfo ──────────────────────────────────────────────────────────────

/// Query the amount of free and total memory reported by the driver.
///
/// Mirrors `cudaMemGetInfo`; forwards to the driver's `cu_mem_get_info_v2`
/// and returns the pair `(free_bytes, total_bytes)`.
///
/// # Errors
///
/// - [`CudaRtError::DriverNotAvailable`] — the driver could not be loaded.
/// - Any non-zero driver status, converted with `CudaRtError::from_code`;
///   [`CudaRtError::Unknown`] is used when the code has no mapping.
pub fn mem_get_info() -> CudaRtResult<(usize, usize)> {
    let Ok(driver) = try_driver() else {
        return Err(CudaRtError::DriverNotAvailable);
    };

    let (mut free_bytes, mut total_bytes) = (0usize, 0usize);
    // SAFETY: FFI call; both out-parameters point at live locals.
    let status =
        unsafe { (driver.cu_mem_get_info_v2)(&raw mut free_bytes, &raw mut total_bytes) };

    if status == 0 {
        Ok((free_bytes, total_bytes))
    } else {
        Err(CudaRtError::from_code(status).unwrap_or(CudaRtError::Unknown))
    }
}

// ─── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    //! Unit tests that must pass on machines without a GPU or CUDA driver.
    //!
    //! Every assertion either exercises pure arithmetic or an early-return
    //! path that is taken before the driver is loaded.

    use super::*;

    #[test]
    fn malloc_zero_returns_null() {
        // zero-byte allocation must return NULL without calling the driver.
        // This is valid even without a GPU.
        let result = malloc(0);
        assert!(matches!(result, Ok(DevicePtr(0))));
    }

    #[test]
    fn malloc_host_zero_returns_null() {
        // Same early-return contract for pinned host memory.
        assert!(matches!(malloc_host(0), Ok(p) if p.is_null()));
    }

    #[test]
    fn malloc_managed_zero_returns_null() {
        let result = malloc_managed(0, MemAttachFlags::Global);
        assert!(matches!(result, Ok(DevicePtr(0))));
    }

    #[test]
    fn free_null_is_noop() {
        // freeing a null pointer returns before the driver is loaded, so it
        // must succeed even without a GPU.
        assert!(free(DevicePtr::NULL).is_ok());
    }

    #[test]
    fn free_host_null_is_noop() {
        // SAFETY: a null pointer is explicitly accepted by `free_host`.
        let result = unsafe { free_host(std::ptr::null_mut()) };
        assert!(result.is_ok());
    }

    #[test]
    fn device_ptr_offset() {
        let p = DevicePtr(1000);
        assert_eq!(p.offset(8), DevicePtr(1008));
        assert_eq!(p.offset(-8), DevicePtr(992));
    }

    #[test]
    fn device_ptr_is_null() {
        assert!(DevicePtr::NULL.is_null());
        assert!(!DevicePtr(1).is_null());
    }

    #[test]
    fn malloc_pitch_zero_extent_returns_null() {
        // A zero width or height short-circuits before any allocation.
        assert!(matches!(malloc_pitch(0, 32), Ok((DevicePtr(0), 0))));
        assert!(matches!(malloc_pitch(100, 0), Ok((DevicePtr(0), 0))));
    }

    #[test]
    fn malloc_pitch_returns_aligned_pitch() {
        // Without a usable GPU the underlying malloc fails and there is
        // nothing to check; with one, verify the pitch and release the
        // allocation so the test does not leak device memory.
        if let Ok((ptr, pitch)) = malloc_pitch(100, 32) {
            // Pitch must be a multiple of 512.
            assert_eq!(pitch % 512, 0);
            assert!(pitch >= 100);
            let _ = free(ptr);
        }
    }

    #[test]
    fn memcpy_zero_count_is_noop() {
        // SAFETY: with `count == 0` the pointers are never dereferenced.
        let result = unsafe {
            memcpy(
                std::ptr::null_mut(),
                std::ptr::null(),
                0,
                MemcpyKind::HostToDevice,
            )
        };
        assert!(result.is_ok());
    }

    #[test]
    fn memset_null_or_empty_is_noop() {
        assert!(memset(DevicePtr::NULL, 0xFF, 16).is_ok());
        assert!(memset(DevicePtr(0x1000), 0xFF, 0).is_ok());
        assert!(memset32(DevicePtr::NULL, 0xDEAD_BEEF, 16).is_ok());
        assert!(memset32(DevicePtr(0x1000), 0xDEAD_BEEF, 0).is_ok());
    }

    #[test]
    fn memcpy_kind_values() {
        assert_eq!(MemcpyKind::HostToHost as u32, 0);
        assert_eq!(MemcpyKind::HostToDevice as u32, 1);
        assert_eq!(MemcpyKind::DeviceToHost as u32, 2);
        assert_eq!(MemcpyKind::DeviceToDevice as u32, 3);
        assert_eq!(MemcpyKind::Default as u32, 4);
    }

    #[test]
    fn mem_attach_flag_values() {
        assert_eq!(MemAttachFlags::Global as u32, 1);
        assert_eq!(MemAttachFlags::Host as u32, 2);
        assert_eq!(MemAttachFlags::Single as u32, 4);
    }
}