// oxicuda_memory/zero_copy.rs
1//! Zero-copy (host-mapped) memory.
2//!
3//! Allows GPU kernels to directly access host memory without explicit
4//! transfers. Useful for small, frequently-updated data or when PCIe
5//! bandwidth is acceptable.
6//!
7//! # How it works
8//!
9//! Zero-copy memory is allocated on the host using `cuMemAllocHost_v2`,
10//! which allocates page-locked (pinned) memory that the CUDA driver maps
11//! into the device's address space. A corresponding device pointer is
12//! obtained via `cuMemHostGetDevicePointer_v2`. GPU reads and writes
13//! traverse the PCIe bus on each access, so this is best suited for data
14//! that is accessed infrequently or streamed sequentially.
15//!
16//! # Example
17//!
18//! ```rust,no_run
19//! use oxicuda_memory::zero_copy::MappedBuffer;
20//!
21//! oxicuda_driver::init()?;
22//! let _ = oxicuda_driver::primary_context::PrimaryContext::retain(
23//! &oxicuda_driver::device::Device::get(0)?
24//! )?;
25//!
26//! let mut buf = MappedBuffer::<f32>::alloc(256)?;
27//! // Write from the host.
28//! for (i, val) in buf.as_host_slice_mut().iter_mut().enumerate() {
29//! *val = i as f32;
30//! }
31//! // `buf.as_device_ptr()` can now be passed to a kernel.
32//! # Ok::<(), oxicuda_driver::error::CudaError>(())
33//! ```
34
35use std::ffi::c_void;
36use std::marker::PhantomData;
37use std::mem::size_of;
38
39use oxicuda_driver::error::CudaResult;
40use oxicuda_driver::ffi::CUdeviceptr;
41use oxicuda_driver::loader::try_driver;
42
43// ---------------------------------------------------------------------------
44// MappedBuffer<T>
45// ---------------------------------------------------------------------------
46
/// A host-allocated, device-mapped (zero-copy) memory buffer.
///
/// The host memory is page-locked and accessible from both CPU code and GPU
/// kernels. GPU accesses traverse the PCIe bus, making this suitable for
/// small or infrequently-accessed data where the overhead of explicit
/// transfers is not justified.
///
/// The buffer is freed automatically on drop via `cuMemFreeHost`.
pub struct MappedBuffer<T: Copy> {
    /// Host pointer to the pinned allocation, as returned by
    /// `cuMemAllocHost_v2`.
    host_ptr: *mut T,
    /// Device-side alias of the same physical memory, obtained via
    /// `cuMemHostGetDevicePointer_v2`; suitable for kernel parameters.
    device_ptr: CUdeviceptr,
    /// Number of `T` elements (not bytes).
    len: usize,
    /// Marker for the element type.
    /// NOTE(review): technically redundant — `host_ptr: *mut T` already
    /// mentions `T` — but it documents logical ownership of the `T`s.
    _phantom: PhantomData<T>,
}
65
// SAFETY: The buffer owns its pinned allocation; the raw pointers refer to
// page-locked heap memory that is not tied to the allocating thread, so
// moving the buffer to another thread is sound whenever `T: Send`.
unsafe impl<T: Copy + Send> Send for MappedBuffer<T> {}
// SAFETY: Shared (`&self`) access only hands out `&[T]` / const pointers to
// the same memory, so cross-thread sharing is sound whenever `T: Sync`.
unsafe impl<T: Copy + Sync> Sync for MappedBuffer<T> {}
70
71impl<T: Copy> MappedBuffer<T> {
72 /// Allocates a zero-copy host-mapped buffer of `n` elements.
73 ///
74 /// The allocation uses `cuMemAllocHost_v2` (page-locked pinned memory)
75 /// and retrieves the corresponding device pointer via
76 /// `cuMemHostGetDevicePointer_v2`. A CUDA context must be current on
77 /// the calling thread.
78 ///
79 /// # Errors
80 ///
81 /// Returns a CUDA driver error if allocation or mapping fails.
82 pub fn alloc(n: usize) -> CudaResult<Self> {
83 let api = try_driver()?;
84 let byte_size = n.saturating_mul(size_of::<T>());
85
86 // Allocate page-locked host memory.
87 let mut raw_ptr: *mut c_void = std::ptr::null_mut();
88 oxicuda_driver::error::check(unsafe {
89 (api.cu_mem_alloc_host_v2)(&mut raw_ptr, byte_size)
90 })?;
91 let host_ptr = raw_ptr.cast::<T>();
92
93 // Obtain the device-side pointer for this pinned region.
94 let mut device_ptr: CUdeviceptr = 0;
95 let result = oxicuda_driver::error::check(unsafe {
96 (api.cu_mem_host_get_device_pointer_v2)(&mut device_ptr, raw_ptr, 0)
97 });
98 if let Err(e) = result {
99 // Free the pinned allocation before propagating the error.
100 unsafe { (api.cu_mem_free_host)(raw_ptr) };
101 return Err(e);
102 }
103
104 Ok(Self {
105 host_ptr,
106 device_ptr,
107 len: n,
108 _phantom: PhantomData,
109 })
110 }
111
112 /// Returns the number of `T` elements in this buffer.
113 #[inline]
114 pub fn len(&self) -> usize {
115 self.len
116 }
117
118 /// Returns `true` if the buffer contains zero elements.
119 #[inline]
120 pub fn is_empty(&self) -> bool {
121 self.len == 0
122 }
123
124 /// Returns the byte size of this buffer.
125 #[inline]
126 pub fn byte_size(&self) -> usize {
127 self.len * size_of::<T>()
128 }
129
130 /// Returns the raw device pointer for use in kernel parameters.
131 #[inline]
132 pub fn as_device_ptr(&self) -> CUdeviceptr {
133 self.device_ptr
134 }
135
136 /// Returns a raw const pointer to the host-side data.
137 #[inline]
138 pub fn as_host_ptr(&self) -> *const T {
139 self.host_ptr
140 }
141
142 /// Returns a raw mutable pointer to the host-side data.
143 #[inline]
144 pub fn as_host_ptr_mut(&mut self) -> *mut T {
145 self.host_ptr
146 }
147
148 /// Returns a shared slice over the host-side data.
149 ///
150 /// # Safety
151 ///
152 /// The caller must ensure no concurrent GPU writes are in flight.
153 pub fn as_host_slice(&self) -> &[T] {
154 // SAFETY: host_ptr is valid for `len` elements allocated by cuMemAllocHost.
155 unsafe { std::slice::from_raw_parts(self.host_ptr, self.len) }
156 }
157
158 /// Returns a mutable slice over the host-side data.
159 ///
160 /// # Safety
161 ///
162 /// The caller must ensure no concurrent GPU reads or writes are in flight.
163 pub fn as_host_slice_mut(&mut self) -> &mut [T] {
164 // SAFETY: host_ptr is valid for `len` elements allocated by cuMemAllocHost.
165 unsafe { std::slice::from_raw_parts_mut(self.host_ptr, self.len) }
166 }
167}
168
169impl<T: Copy> Drop for MappedBuffer<T> {
170 fn drop(&mut self) {
171 if self.host_ptr.is_null() {
172 return;
173 }
174 if let Ok(api) = try_driver() {
175 // SAFETY: host_ptr was allocated by cuMemAllocHost_v2 and has not
176 // been freed yet (Drop is called at most once).
177 unsafe { (api.cu_mem_free_host)(self.host_ptr.cast::<c_void>()) };
178 }
179 }
180}