//! async_cuda_core/memory/host.rs

use crate::ffi;
use crate::memory::DeviceBuffer;
use crate::runtime::Future;
use crate::stream::Stream;

type Result<T> = std::result::Result<T, crate::error::Error>;

8/// A host buffer.
9///
10/// # Performance
11///
12/// Host buffers are managed by CUDA and can be used for pinned memory transfer. Pinned memory
13/// transfer speeds are usually higher compared to paged memory transfers. Pinned memory buffers are
14/// especially important for this crate because the runtime thread must do the least amount of CPU
15/// work possible. Paged transfers do require the host to move data into a CUDA managed buffer first
16/// (an extra memory copy) whilst pinned transfers do not.
17pub struct HostBuffer<T: Copy + 'static> {
18 inner: ffi::memory::HostBuffer<T>,
19}
20
21impl<T: Copy + 'static> HostBuffer<T> {
22 /// Allocates memory on the host. This creates a pinned buffer. Any transfers to and from this
23 /// buffer automatically become pinned transfers, and will be much faster.
24 ///
25 /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
26 ///
27 /// # Arguments
28 ///
29 /// * `num_elements` - Number of elements to allocate.
30 pub async fn new(num_elements: usize) -> Self {
31 let inner = Future::new(move || ffi::memory::HostBuffer::<T>::new(num_elements)).await;
32 Self { inner }
33 }
34
35 /// Allocates memory on the host and copies the provided data into it.
36 ///
37 /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
38 /// pinned transfers, and will be much faster.
39 ///
40 /// This is a convenience function that allows the caller to quickly put data into a host
41 /// buffer. It is roughly similar to `buffer.copy_from_slice(slice)`.
42 ///
43 /// # Arguments
44 ///
45 /// * `slice` - Data to copy into the new host buffer.
46 pub async fn from_slice(slice: &[T]) -> Self {
47 let mut this = Self::new(slice.len()).await;
48 this.copy_from_slice(slice);
49 this
50 }
51
52 /// Allocates memory on the host and copies the provided array into it.
53 ///
54 /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
55 /// pinned transfers, and will be much faster.
56 ///
57 /// This is a convenience function that allows the caller to quickly put data into a host
58 /// buffer. It is roughly similar to `buffer.copy_from_array(slice)`.
59 ///
60 /// # Arguments
61 ///
62 /// * `array` - Array to copy into the new host buffer.
63 #[cfg(feature = "ndarray")]
64 pub async fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<'_, T, D>) -> Self {
65 let mut this = Self::new(array.len()).await;
66 this.copy_from_array(array);
67 this
68 }
69
70 /// Copies memory from the provided device buffer to this buffer.
71 ///
72 /// This function synchronizes the stream implicitly.
73 ///
74 /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
75 ///
76 /// # Pinned transfer
77 ///
78 /// This function is guaranteed to produce a pinned transfer on the runtime thread.
79 ///
80 /// # Stream ordered semantics
81 ///
82 /// This function uses stream ordered semantics. It can only be guaranteed to complete
83 /// sequentially relative to operations scheduled on the same stream or the default stream.
84 ///
85 /// # Arguments
86 ///
87 /// * `other` - Device buffer to copy from.
88 /// * `stream` - Stream to use.
89 #[inline(always)]
90 pub async fn copy_from(&mut self, other: &DeviceBuffer<T>, stream: &Stream) -> Result<()> {
91 other.copy_to(self, stream).await
92 }
93
94 /// Copies memory from the provided device buffer to this buffer.
95 ///
96 /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
97 ///
98 /// # Pinned transfer
99 ///
100 /// This function is guaranteed to produce a pinned transfer on the runtime thread.
101 ///
102 /// # Stream ordered semantics
103 ///
104 /// This function uses stream ordered semantics. It can only be guaranteed to complete
105 /// sequentially relative to operations scheduled on the same stream or the default stream.
106 ///
107 /// # Safety
108 ///
109 /// This function is unsafe because the operation might not have completed when the function
110 /// returns, and thus the state of the buffer is undefined.
111 ///
112 /// # Arguments
113 ///
114 /// * `other` - Device buffer to copy from.
115 /// * `stream` - Stream to use.
116 #[inline(always)]
117 pub async unsafe fn copy_from_async(
118 &mut self,
119 other: &DeviceBuffer<T>,
120 stream: &Stream,
121 ) -> Result<()> {
122 other.copy_to_async(self, stream).await
123 }
124
125 /// Copies memory from this buffer to the provided device buffer.
126 ///
127 /// This function synchronizes the stream implicitly.
128 ///
129 /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
130 ///
131 /// # Pinned transfer
132 ///
133 /// This function is guaranteed to produce a pinned transfer on the runtime thread.
134 ///
135 /// # Stream ordered semantics
136 ///
137 /// This function uses stream ordered semantics. It can only be guaranteed to complete
138 /// sequentially relative to operations scheduled on the same stream or the default stream.
139 ///
140 /// # Arguments
141 ///
142 /// * `other` - Device buffer to copy to.
143 /// * `stream` - Stream to use.
144 #[inline(always)]
145 pub async fn copy_to(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> {
146 other.copy_from(self, stream).await
147 }
148
149 /// Copies memory from this buffer to the provided device buffer.
150 ///
151 /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
152 ///
153 /// # Pinned transfer
154 ///
155 /// This function is guaranteed to produce a pinned transfer on the runtime thread.
156 ///
157 /// # Stream ordered semantics
158 ///
159 /// This function uses stream ordered semantics. It can only be guaranteed to complete
160 /// sequentially relative to operations scheduled on the same stream or the default stream.
161 ///
162 /// # Safety
163 ///
164 /// This function is unsafe because the operation might not have completed when the function
165 /// returns, and thus the state of the buffer is undefined.
166 ///
167 /// # Arguments
168 ///
169 /// * `other` - Device buffer to copy to.
170 /// * `stream` - Stream to use.
171 #[inline(always)]
172 pub async unsafe fn copy_to_async(
173 &self,
174 other: &mut DeviceBuffer<T>,
175 stream: &Stream,
176 ) -> Result<()> {
177 other.copy_from_async(self, stream).await
178 }
179
180 /// Copy data into the host buffer from a slice.
181 ///
182 /// # Synchronization safety
183 ///
184 /// This call is only synchronization-safe if all streams that have previously been used for
185 /// copy operations either from or to this host buffer have been synchronized, and no operations
186 /// have been scheduled since.
187 ///
188 /// # Arguments
189 ///
190 /// * `slice` - Data to copy into the new host buffer.
191 ///
192 /// # Example
193 ///
194 /// ```
195 /// # use async_cuda_core::HostBuffer;
196 /// # tokio_test::block_on(async {
197 /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
198 /// let some_data = vec![10; 100];
199 /// host_buffer.copy_from_slice(&some_data);
200 /// # })
201 /// ```
202 #[inline(always)]
203 pub fn copy_from_slice(&mut self, slice: &[T]) {
204 self.inner.copy_from_slice(slice);
205 }
206
207 /// Copy array into the host buffer from a slice.
208 ///
209 /// # Synchronization safety
210 ///
211 /// This call is only synchronization-safe if all streams that have previously been used for
212 /// copy operations either from or to this host buffer have been synchronized, and no operations
213 /// have been scheduled since.
214 ///
215 /// # Arguments
216 ///
217 /// * `array` - Array to copy into the new host buffer.
218 #[cfg(feature = "ndarray")]
219 #[inline(always)]
220 pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) {
221 self.inner.copy_from_array(array)
222 }
223
224 /// Copy the data to a [`Vec`] and return it.
225 #[inline(always)]
226 pub fn to_vec(&self) -> Vec<T> {
227 self.inner.to_vec()
228 }
229
230 /// Copy the data to an [`ndarray::Array`] and return it.
231 ///
232 /// Function panics if provided shape does not match size of array.
233 ///
234 /// # Arguments
235 ///
236 /// * `shape` - Shape for array.
237 #[cfg(feature = "ndarray")]
238 #[inline(always)]
239 pub fn to_array_with_shape<D: ndarray::Dimension>(
240 &self,
241 shape: impl Into<ndarray::StrideShape<D>>,
242 ) -> ndarray::Array<T, D> {
243 self.inner.to_array_with_shape::<D>(shape)
244 }
245
246 /// Get number of elements in buffer.
247 #[inline(always)]
248 pub fn num_elements(&self) -> usize {
249 self.inner.num_elements
250 }
251
252 /// Access the inner synchronous implementation of [`HostBuffer`].
253 #[inline(always)]
254 pub fn inner(&self) -> &ffi::memory::HostBuffer<T> {
255 &self.inner
256 }
257
258 /// Access the inner synchronous implementation of [`HostBuffer`].
259 #[inline(always)]
260 pub fn inner_mut(&mut self) -> &mut ffi::memory::HostBuffer<T> {
261 &mut self.inner
262 }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        // A fresh buffer reports the requested element count and round-trips to a Vec.
        let buffer = HostBuffer::<u32>::new(100).await;
        assert_eq!(buffer.num_elements(), 100);
        assert_eq!(buffer.to_vec().len(), 100);
    }

    #[tokio::test]
    async fn test_from_slice() {
        let ones = vec![1_u32; 200];
        let buffer = HostBuffer::from_slice(ones.as_slice()).await;
        assert_eq!(buffer.num_elements(), 200);
        let contents = buffer.to_vec();
        assert_eq!(contents.len(), 200);
        assert!(contents.into_iter().all(|value| value == 1_u32));
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let ones = vec![1_u32; 100];
        let source = HostBuffer::from_slice(ones.as_slice()).await;

        // Round-trip: host -> device -> host.
        let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            source
                .copy_to_async(&mut device_buffer, &stream)
                .await
                .unwrap();
        }

        let mut destination = HostBuffer::<u32>::new(100).await;
        unsafe {
            destination
                .copy_from_async(&device_buffer, &stream)
                .await
                .unwrap();
        }

        // Both async copies are stream-ordered; synchronize before inspecting the result.
        stream.synchronize().await.unwrap();

        assert_eq!(destination.num_elements(), 100);
        let round_tripped = destination.to_vec();
        assert_eq!(round_tripped.len(), 100);
        assert!(round_tripped.into_iter().all(|value| value == 1_u32));
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let host_buffer = HostBuffer::<u32>::new(100).await;
        // Deliberate size mismatch (100 vs. 101 elements) must panic.
        let mut device_buffer = DeviceBuffer::<u32>::new(101, &Stream::null()).await;
        let _ = unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream).await };
    }
}