// oxicuda_memory/zero_copy.rs
1//! Zero-copy (host-mapped) memory.
2//!
3//! Allows GPU kernels to directly access host memory without explicit
4//! transfers. Useful for small, frequently-updated data or when PCIe
5//! bandwidth is acceptable.
6//!
7//! # How it works
8//!
9//! Zero-copy memory is allocated on the host using `cuMemAllocHost_v2`,
10//! which allocates page-locked (pinned) memory that the CUDA driver maps
11//! into the device's address space. A corresponding device pointer is
12//! obtained via `cuMemHostGetDevicePointer_v2`. GPU reads and writes
13//! traverse the PCIe bus on each access, so this is best suited for data
14//! that is accessed infrequently or streamed sequentially.
15//!
16//! # Example
17//!
18//! ```rust,no_run
19//! use oxicuda_memory::zero_copy::MappedBuffer;
20//!
21//! oxicuda_driver::init()?;
22//! let _ = oxicuda_driver::primary_context::PrimaryContext::retain(
23//! &oxicuda_driver::device::Device::get(0)?
24//! )?;
25//!
26//! let mut buf = MappedBuffer::<f32>::alloc(256)?;
27//! // Write from the host.
28//! for (i, val) in buf.as_host_slice_mut().iter_mut().enumerate() {
29//! *val = i as f32;
30//! }
31//! // `buf.as_device_ptr()` can now be passed to a kernel.
32//! # Ok::<(), oxicuda_driver::error::CudaError>(())
33//! ```
34
35use std::ffi::c_void;
36use std::marker::PhantomData;
37use std::mem::size_of;
38
39use oxicuda_driver::error::CudaResult;
40use oxicuda_driver::ffi::CUdeviceptr;
41use oxicuda_driver::loader::try_driver;
42
43// ---------------------------------------------------------------------------
44// MappedBuffer<T>
45// ---------------------------------------------------------------------------
46
/// A host-allocated, device-mapped (zero-copy) memory buffer.
///
/// The host memory is page-locked and accessible from both CPU code and GPU
/// kernels. GPU accesses traverse the PCIe bus, making this suitable for
/// small or infrequently-accessed data where the overhead of explicit
/// transfers is not justified.
///
/// The buffer is freed automatically on drop via `cuMemFreeHost`.
pub struct MappedBuffer<T: Copy> {
    /// Host pointer to the pinned allocation, as returned by
    /// `cuMemAllocHost_v2`.
    host_ptr: *mut T,
    /// Device-side alias of the same physical memory, obtained via
    /// `cuMemHostGetDevicePointer_v2`; suitable for kernel parameters.
    device_ptr: CUdeviceptr,
    /// Number of `T` elements (not bytes).
    len: usize,
    /// Marker for the element type.
    /// NOTE(review): technically redundant — `host_ptr: *mut T` already
    /// mentions `T` — but it documents logical ownership of the `T`s.
    _phantom: PhantomData<T>,
}
65
// SAFETY: The buffer owns its pinned allocation; the raw pointers refer to
// page-locked heap memory that is not tied to the allocating thread, so
// moving the buffer to another thread is sound whenever `T: Send`.
unsafe impl<T: Copy + Send> Send for MappedBuffer<T> {}
// SAFETY: Shared (`&self`) access only hands out `&[T]` / const pointers to
// the same memory, so cross-thread sharing is sound whenever `T: Sync`.
unsafe impl<T: Copy + Sync> Sync for MappedBuffer<T> {}
70
71impl<T: Copy> MappedBuffer<T> {
72 /// Allocates a zero-copy host-mapped buffer of `n` elements.
73 ///
74 /// The allocation uses `cuMemAllocHost_v2` (page-locked pinned memory)
75 /// and retrieves the corresponding device pointer via
76 /// `cuMemHostGetDevicePointer_v2`. A CUDA context must be current on
77 /// the calling thread.
78 ///
79 /// # Errors
80 ///
81 /// Returns a CUDA driver error if allocation or mapping fails.
82 pub fn alloc(n: usize) -> CudaResult<Self> {
83 let api = try_driver()?;
84 let byte_size = n.saturating_mul(size_of::<T>());
85
86 // Allocate page-locked host memory.
87 let mut raw_ptr: *mut c_void = std::ptr::null_mut();
88 oxicuda_driver::error::check(unsafe {
89 (api.cu_mem_alloc_host_v2)(&mut raw_ptr, byte_size)
90 })?;
91 let host_ptr = raw_ptr.cast::<T>();
92
93 // Obtain the device-side pointer for this pinned region.
94 let mut device_ptr: CUdeviceptr = 0;
95 let result = oxicuda_driver::error::check(unsafe {
96 (api.cu_mem_host_get_device_pointer_v2)(&mut device_ptr, raw_ptr, 0)
97 });
98 if let Err(e) = result {
99 // Free the pinned allocation before propagating the error.
100 unsafe { (api.cu_mem_free_host)(raw_ptr) };
101 return Err(e);
102 }
103
104 Ok(Self {
105 host_ptr,
106 device_ptr,
107 len: n,
108 _phantom: PhantomData,
109 })
110 }
111
112 /// Returns the number of `T` elements in this buffer.
113 #[inline]
114 pub fn len(&self) -> usize {
115 self.len
116 }
117
118 /// Returns `true` if the buffer contains zero elements.
119 #[inline]
120 pub fn is_empty(&self) -> bool {
121 self.len == 0
122 }
123
124 /// Returns the byte size of this buffer.
125 #[inline]
126 pub fn byte_size(&self) -> usize {
127 self.len * size_of::<T>()
128 }
129
130 /// Returns the raw device pointer for use in kernel parameters.
131 #[inline]
132 pub fn as_device_ptr(&self) -> CUdeviceptr {
133 self.device_ptr
134 }
135
136 /// Returns a raw const pointer to the host-side data.
137 #[inline]
138 pub fn as_host_ptr(&self) -> *const T {
139 self.host_ptr
140 }
141
142 /// Returns a raw mutable pointer to the host-side data.
143 #[inline]
144 pub fn as_host_ptr_mut(&mut self) -> *mut T {
145 self.host_ptr
146 }
147
148 /// Returns a shared slice over the host-side data.
149 ///
150 /// # Safety
151 ///
152 /// The caller must ensure no concurrent GPU writes are in flight.
153 pub fn as_host_slice(&self) -> &[T] {
154 // SAFETY: host_ptr is valid for `len` elements allocated by cuMemAllocHost.
155 unsafe { std::slice::from_raw_parts(self.host_ptr, self.len) }
156 }
157
158 /// Returns a mutable slice over the host-side data.
159 ///
160 /// # Safety
161 ///
162 /// The caller must ensure no concurrent GPU reads or writes are in flight.
163 pub fn as_host_slice_mut(&mut self) -> &mut [T] {
164 // SAFETY: host_ptr is valid for `len` elements allocated by cuMemAllocHost.
165 unsafe { std::slice::from_raw_parts_mut(self.host_ptr, self.len) }
166 }
167}
168
169impl<T: Copy> Drop for MappedBuffer<T> {
170 fn drop(&mut self) {
171 if self.host_ptr.is_null() {
172 return;
173 }
174 if let Ok(api) = try_driver() {
175 // SAFETY: host_ptr was allocated by cuMemAllocHost_v2 and has not
176 // been freed yet (Drop is called at most once).
177 unsafe { (api.cu_mem_free_host)(self.host_ptr.cast::<c_void>()) };
178 }
179 }
180}