// oxicuda_runtime/memory.rs
//! Device and host memory management.
//!
//! Implements the CUDA Runtime memory API:
//! - `cudaMalloc` / `cudaFree`
//! - `cudaMallocHost` / `cudaFreeHost` (pinned host memory)
//! - `cudaMallocManaged` (unified memory)
//! - `cudaMallocPitch` (pitched 2-D allocation)
//! - `cudaMemcpy` / `cudaMemcpyAsync`
//! - `cudaMemset` / `cudaMemsetAsync`
//! - `cudaMemGetInfo`
//!
//! All memory addresses returned for device allocations are represented as
//! [`DevicePtr`], a newtype around `u64` that matches the driver API's
//! `CUdeviceptr`.
15
16use std::ffi::c_void;
17
18use oxicuda_driver::loader::try_driver;
19
20use crate::error::{CudaRtError, CudaRtResult};
21use crate::stream::CudaStream;
22
23// ─── DevicePtr ───────────────────────────────────────────────────────────────
24
/// Opaque CUDA device-memory address (mirrors `CUdeviceptr`).
///
/// This is a plain `u64` wrapped in a newtype to prevent accidental
/// dereferencing from host code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DevicePtr(pub u64);

impl DevicePtr {
    /// The null (zero) device pointer.
    pub const NULL: Self = Self(0);

    /// Returns `true` if this is the null pointer.
    #[must_use]
    pub fn is_null(self) -> bool {
        self.0 == 0
    }

    /// Offset this pointer by `offset` bytes, returning a new `DevicePtr`.
    ///
    /// Uses wrapping two's-complement arithmetic: `offset as u64`
    /// sign-extends, so `wrapping_add` yields exactly "address + signed
    /// offset". The previous `as i64` round-trip would overflow-panic in
    /// debug builds for addresses at or above `i64::MAX`.
    #[must_use]
    pub fn offset(self, offset: isize) -> Self {
        Self(self.0.wrapping_add(offset as u64))
    }
}
48
49// ─── MemcpyKind ──────────────────────────────────────────────────────────────
50
/// Direction of a `cudaMemcpy` transfer.
///
/// Mirrors `cudaMemcpyKind`. The explicit discriminants match the C enum's
/// values, so variants can be cast with `as u32` at the FFI boundary.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemcpyKind {
    /// Host → Host.
    HostToHost = 0,
    /// Host → Device.
    HostToDevice = 1,
    /// Device → Host.
    DeviceToHost = 2,
    /// Device → Device.
    DeviceToDevice = 3,
    /// Direction inferred from pointer attributes (unified addressing).
    Default = 4,
}
67
68// ─── MemAttachFlags ──────────────────────────────────────────────────────────
69
/// Flags for `cudaMallocManaged`.
///
/// Mirrors `cudaMemAttachFlags`. The discriminants are bit-flag values
/// (1, 2, 4) matching the driver's `CU_MEM_ATTACH_*` constants, and are
/// passed through as `u32` in [`malloc_managed`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemAttachFlags {
    /// Memory accessible by all CUDA devices and host.
    Global = 1,
    /// Memory only accessible by the host and a single CUDA device.
    Host = 2,
    /// Memory only accessible by single stream (deprecated in CUDA 12).
    Single = 4,
}
82
83// ─── Allocation ──────────────────────────────────────────────────────────────
84
85/// Allocate `size` bytes of device memory.
86///
87/// Mirrors `cudaMalloc`.
88///
89/// # Errors
90///
91/// - [`CudaRtError::DriverNotAvailable`] — driver not loaded.
92/// - [`CudaRtError::MemoryAllocation`] — out of device memory.
93pub fn malloc(size: usize) -> CudaRtResult<DevicePtr> {
94 if size == 0 {
95 return Ok(DevicePtr::NULL);
96 }
97 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
98 let mut ptr: u64 = 0;
99 // SAFETY: FFI; ptr is a valid stack-allocated u64.
100 let rc = unsafe { (api.cu_mem_alloc_v2)(&raw mut ptr, size) };
101 if rc != 0 {
102 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::MemoryAllocation));
103 }
104 Ok(DevicePtr(ptr))
105}
106
107/// Free device memory previously allocated with [`malloc`].
108///
109/// Mirrors `cudaFree`.
110///
111/// # Errors
112///
113/// Propagates driver errors. Passing [`DevicePtr::NULL`] is a no-op.
114pub fn free(ptr: DevicePtr) -> CudaRtResult<()> {
115 if ptr.is_null() {
116 return Ok(());
117 }
118 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
119 // SAFETY: FFI; ptr was returned by cu_mem_alloc_v2.
120 let rc = unsafe { (api.cu_mem_free_v2)(ptr.0) };
121 if rc != 0 {
122 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidDevicePointer));
123 }
124 Ok(())
125}
126
127/// Allocate `size` bytes of pinned (page-locked) host memory.
128///
129/// Mirrors `cudaMallocHost`.
130///
131/// Returns a raw host pointer that must be freed with [`free_host`].
132///
133/// # Errors
134///
135/// - [`CudaRtError::MemoryAllocation`] — out of host memory.
136pub fn malloc_host(size: usize) -> CudaRtResult<*mut c_void> {
137 if size == 0 {
138 return Ok(std::ptr::null_mut());
139 }
140 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
141 let mut ptr: *mut c_void = std::ptr::null_mut();
142 // SAFETY: FFI; ptr is a valid stack-allocated pointer.
143 let rc = unsafe { (api.cu_mem_alloc_host_v2)(&raw mut ptr, size) };
144 if rc != 0 {
145 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::MemoryAllocation));
146 }
147 Ok(ptr)
148}
149
150/// Free page-locked host memory previously allocated with [`malloc_host`].
151///
152/// Mirrors `cudaFreeHost`.
153///
154/// # Errors
155///
156/// Propagates driver errors.
157///
158/// # Safety
159///
160/// `ptr` must have been returned by [`malloc_host`] and must not have been
161/// freed already.
162pub unsafe fn free_host(ptr: *mut c_void) -> CudaRtResult<()> {
163 if ptr.is_null() {
164 return Ok(());
165 }
166 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
167 // SAFETY: FFI; ptr was returned by cu_mem_alloc_host_v2.
168 let rc = unsafe { (api.cu_mem_free_host)(ptr) };
169 if rc != 0 {
170 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidHostPointer));
171 }
172 Ok(())
173}
174
175/// Allocate unified managed memory accessible from both CPU and GPU.
176///
177/// Mirrors `cudaMallocManaged`.
178///
179/// # Errors
180///
181/// - [`CudaRtError::NotSupported`] — device does not support managed memory.
182/// - [`CudaRtError::MemoryAllocation`] — out of memory.
183pub fn malloc_managed(size: usize, flags: MemAttachFlags) -> CudaRtResult<DevicePtr> {
184 if size == 0 {
185 return Ok(DevicePtr::NULL);
186 }
187 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
188 let mut ptr: u64 = 0;
189 // SAFETY: FFI; ptr is valid and flags maps to CU_MEM_ATTACH_* values.
190 let rc = unsafe { (api.cu_mem_alloc_managed)(&raw mut ptr, size, flags as u32) };
191 if rc != 0 {
192 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::MemoryAllocation));
193 }
194 Ok(DevicePtr(ptr))
195}
196
197/// Allocate pitched device memory for 2-D arrays.
198///
199/// Mirrors `cudaMallocPitch`.
200///
201/// Returns `(device_ptr, pitch_bytes)`. `pitch_bytes` is ≥ `width_bytes`
202/// and aligned to the hardware's texture alignment.
203///
204/// # Errors
205///
206/// Propagates driver errors.
207pub fn malloc_pitch(width_bytes: usize, height: usize) -> CudaRtResult<(DevicePtr, usize)> {
208 if width_bytes == 0 || height == 0 {
209 return Ok((DevicePtr::NULL, 0));
210 }
211 // Compute the pitch: round width_bytes up to 512-byte alignment, which
212 // matches the driver's cuMemAllocPitch behaviour for most hardware.
213 let align: usize = 512;
214 let pitch = width_bytes.div_ceil(align) * align;
215 let size = pitch * height;
216 let ptr = malloc(size)?;
217 Ok((ptr, pitch))
218}
219
220// ─── Memcpy ──────────────────────────────────────────────────────────────────
221
222/// Synchronously copy `count` bytes between memory regions.
223///
224/// Mirrors `cudaMemcpy`.
225///
226/// # Safety
227///
228/// `src` and `dst` must point to valid memory of the appropriate kind
229/// (host or device) and must not overlap.
230///
231/// # Errors
232///
233/// - [`CudaRtError::InvalidMemcpyDirection`] for unsupported `kind`.
234/// - Driver errors for invalid pointers or counts.
235pub unsafe fn memcpy(
236 dst: *mut c_void,
237 src: *const c_void,
238 count: usize,
239 kind: MemcpyKind,
240) -> CudaRtResult<()> {
241 if count == 0 {
242 return Ok(());
243 }
244 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
245 let rc = match kind {
246 MemcpyKind::HostToHost => {
247 // Pure host copy — no driver involvement.
248 // SAFETY: Caller ensures src/dst are valid and non-overlapping.
249 unsafe { std::ptr::copy_nonoverlapping(src as *const u8, dst as *mut u8, count) };
250 0u32
251 }
252 MemcpyKind::HostToDevice => {
253 let dst_ptr = dst as u64;
254 // SAFETY: FFI; src/dst valid per caller contract.
255 unsafe { (api.cu_memcpy_htod_v2)(dst_ptr, src, count) }
256 }
257 MemcpyKind::DeviceToHost => {
258 let src_ptr = src as u64;
259 // SAFETY: FFI; src/dst valid per caller contract.
260 unsafe { (api.cu_memcpy_dtoh_v2)(dst, src_ptr, count) }
261 }
262 MemcpyKind::DeviceToDevice => {
263 let dst_ptr = dst as u64;
264 let src_ptr = src as u64;
265 // SAFETY: FFI; src/dst valid per caller contract.
266 unsafe { (api.cu_memcpy_dtod_v2)(dst_ptr, src_ptr, count) }
267 }
268 MemcpyKind::Default => {
269 // Fall back to H2D (common case; real implementation would use
270 // cuPointerGetAttribute to determine actual memory type).
271 let dst_ptr = dst as u64;
272 // SAFETY: FFI.
273 unsafe { (api.cu_memcpy_htod_v2)(dst_ptr, src, count) }
274 }
275 };
276 if rc != 0 {
277 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidMemcpyDirection));
278 }
279 Ok(())
280}
281
282/// Asynchronously copy `count` bytes on `stream`.
283///
284/// Mirrors `cudaMemcpyAsync`.
285///
286/// # Safety
287///
288/// Same requirements as [`memcpy`] plus `stream` must be valid.
289///
290/// # Errors
291///
292/// Propagates driver errors.
293pub unsafe fn memcpy_async(
294 dst: *mut c_void,
295 src: *const c_void,
296 count: usize,
297 kind: MemcpyKind,
298 stream: &CudaStream,
299) -> CudaRtResult<()> {
300 if count == 0 {
301 return Ok(());
302 }
303 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
304 let rc = match kind {
305 MemcpyKind::HostToHost => {
306 // SAFETY: host-to-host can be dispatched synchronously.
307 unsafe { std::ptr::copy_nonoverlapping(src as *const u8, dst as *mut u8, count) };
308 0u32
309 }
310 MemcpyKind::HostToDevice | MemcpyKind::Default => {
311 let dst_ptr = dst as u64;
312 // SAFETY: FFI; caller guarantees validity.
313 unsafe { (api.cu_memcpy_htod_async_v2)(dst_ptr, src, count, stream.raw()) }
314 }
315 MemcpyKind::DeviceToHost => {
316 let src_ptr = src as u64;
317 // SAFETY: FFI.
318 unsafe { (api.cu_memcpy_dtoh_async_v2)(dst, src_ptr, count, stream.raw()) }
319 }
320 MemcpyKind::DeviceToDevice => {
321 // Fall back to synchronous D2D (driver lacks async D2D helper in v1).
322 let dst_ptr = dst as u64;
323 let src_ptr = src as u64;
324 // SAFETY: FFI.
325 unsafe { (api.cu_memcpy_dtod_v2)(dst_ptr, src_ptr, count) }
326 }
327 };
328 if rc != 0 {
329 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidMemcpyDirection));
330 }
331 Ok(())
332}
333
334// ─── Typed helpers ────────────────────────────────────────────────────────────
335
336/// Copy a slice of host data to a device allocation.
337///
338/// # Errors
339///
340/// Propagates driver errors.
341pub fn memcpy_h2d<T: Copy>(dst: DevicePtr, src: &[T]) -> CudaRtResult<()> {
342 let bytes = std::mem::size_of_val(src);
343 // SAFETY: src is a valid slice; dst is a device allocation.
344 unsafe {
345 memcpy(
346 dst.0 as *mut c_void,
347 src.as_ptr() as *const c_void,
348 bytes,
349 MemcpyKind::HostToDevice,
350 )
351 }
352}
353
354/// Copy device memory to a host slice.
355///
356/// # Errors
357///
358/// Propagates driver errors.
359pub fn memcpy_d2h<T: Copy>(dst: &mut [T], src: DevicePtr) -> CudaRtResult<()> {
360 let bytes = std::mem::size_of_val(dst);
361 // SAFETY: dst is a valid mutable slice; src is a device allocation.
362 unsafe {
363 memcpy(
364 dst.as_mut_ptr() as *mut c_void,
365 src.0 as *const c_void,
366 bytes,
367 MemcpyKind::DeviceToHost,
368 )
369 }
370}
371
372/// Copy between two device allocations.
373///
374/// # Errors
375///
376/// Propagates driver errors.
377pub fn memcpy_d2d(dst: DevicePtr, src: DevicePtr, bytes: usize) -> CudaRtResult<()> {
378 // SAFETY: both ptrs are device allocations.
379 unsafe {
380 memcpy(
381 dst.0 as *mut c_void,
382 src.0 as *const c_void,
383 bytes,
384 MemcpyKind::DeviceToDevice,
385 )
386 }
387}
388
389// ─── Memset ──────────────────────────────────────────────────────────────────
390
391/// Set `count` bytes of device memory starting at `ptr` to `value`.
392///
393/// Mirrors `cudaMemset`.
394///
395/// # Errors
396///
397/// Propagates driver errors.
398pub fn memset(ptr: DevicePtr, value: u8, count: usize) -> CudaRtResult<()> {
399 if count == 0 || ptr.is_null() {
400 return Ok(());
401 }
402 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
403 // SAFETY: FFI; ptr is a valid device allocation.
404 let rc = unsafe { (api.cu_memset_d8_v2)(ptr.0, value, count) };
405 if rc != 0 {
406 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidDevicePointer));
407 }
408 Ok(())
409}
410
411/// Set device memory to 32-bit value pattern.
412///
413/// `count` is the number of 32-bit words (not bytes) to set.
414/// Mirrors `cudaMemset` for 4-byte granularity.
415///
416/// # Errors
417///
418/// Propagates driver errors.
419pub fn memset32(ptr: DevicePtr, value: u32, count: usize) -> CudaRtResult<()> {
420 if count == 0 || ptr.is_null() {
421 return Ok(());
422 }
423 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
424 // SAFETY: FFI; ptr is a valid device allocation.
425 let rc = unsafe { (api.cu_memset_d32_v2)(ptr.0, value, count) };
426 if rc != 0 {
427 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::InvalidDevicePointer));
428 }
429 Ok(())
430}
431
432// ─── MemGetInfo ──────────────────────────────────────────────────────────────
433
434/// Returns `(free_bytes, total_bytes)` for the current device's global memory.
435///
436/// Mirrors `cudaMemGetInfo`.
437///
438/// # Errors
439///
440/// Propagates driver errors.
441pub fn mem_get_info() -> CudaRtResult<(usize, usize)> {
442 let api = try_driver().map_err(|_| CudaRtError::DriverNotAvailable)?;
443 let mut free: usize = 0;
444 let mut total: usize = 0;
445 // SAFETY: FFI; both pointers are valid stack-allocated usizes.
446 let rc = unsafe { (api.cu_mem_get_info_v2)(&raw mut free, &raw mut total) };
447 if rc != 0 {
448 return Err(CudaRtError::from_code(rc).unwrap_or(CudaRtError::Unknown));
449 }
450 Ok((free, total))
451}
452
453// ─── Tests ───────────────────────────────────────────────────────────────────
454
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn malloc_zero_returns_null() {
        // A zero-byte allocation must return NULL without calling the
        // driver, so this is valid even without a GPU.
        let result = malloc(0);
        assert!(matches!(result, Ok(DevicePtr(0))));
    }

    #[test]
    fn free_null_is_noop() {
        // `free` early-returns Ok(()) for NULL before touching the driver,
        // so this must succeed even without a GPU. (The previous assertion,
        // `is_ok() || is_err()`, was a tautology and tested nothing.)
        assert!(free(DevicePtr::NULL).is_ok());
    }

    #[test]
    fn device_ptr_offset() {
        let p = DevicePtr(1000);
        assert_eq!(p.offset(8), DevicePtr(1008));
        assert_eq!(p.offset(-8), DevicePtr(992));
    }

    #[test]
    fn device_ptr_is_null() {
        assert!(DevicePtr::NULL.is_null());
        assert!(!DevicePtr(1).is_null());
    }

    #[test]
    fn malloc_pitch_returns_aligned_pitch() {
        // Without a GPU, malloc_pitch falls through to malloc which may
        // fail, but the pitch computation is pure arithmetic.
        let (_, pitch) = malloc_pitch(100, 32).unwrap_or((DevicePtr::NULL, 512));
        // Pitch must be a multiple of 512 and cover the requested width.
        assert_eq!(pitch % 512, 0);
        assert!(pitch >= 100);
    }

    #[test]
    fn memcpy_kind_values() {
        // Discriminants must match the C `cudaMemcpyKind` enum exactly.
        assert_eq!(MemcpyKind::HostToHost as u32, 0);
        assert_eq!(MemcpyKind::HostToDevice as u32, 1);
        assert_eq!(MemcpyKind::DeviceToHost as u32, 2);
        assert_eq!(MemcpyKind::DeviceToDevice as u32, 3);
        assert_eq!(MemcpyKind::Default as u32, 4);
    }
}
505}